<a href="https://colab.research.google.com/github/cloudit101/myaiml/blob/main/spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark




In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every cell below.
# The builder chain is wrapped in parentheses so no line continuations are needed.
spark = (
    SparkSession.builder
    .appName("PySparkTutorial")
    .getOrCreate()
)


In [None]:
from pyspark.sql import Row

# Row factory: declare the field names once, then build each record positionally.
Employee = Row("name", "age", "department", "salary")

# Sample employee records
data = [
    Employee("Alice", 25, "HR", 50000),
    Employee("Bob", 30, "IT", 70000),
    Employee("Cathy", 28, "HR", 52000),
    Employee("David", 35, "Finance", 80000),
    Employee("Eva", 40, "IT", 95000),
]

# Build the DataFrame; no explicit schema is passed, so column names come
# from the Row fields and column types are inferred from the values.
df = spark.createDataFrame(data)

# Print the rows as a text table
df.show()


+-----+---+----------+------+
| name|age|department|salary|
+-----+---+----------+------+
|Alice| 25|        HR| 50000|
|  Bob| 30|        IT| 70000|
|Cathy| 28|        HR| 52000|
|David| 35|   Finance| 80000|
|  Eva| 40|        IT| 95000|
+-----+---+----------+------+



In [None]:
# Column names, data types and nullability
df.printSchema()

# Summary statistics (count, mean, stddev, min, max) for every column
summary_stats = df.describe()
summary_stats.show()

# Projection: keep only the name and salary columns
df.select(["name", "salary"]).show()

# Keep only the rows whose age is strictly greater than 30
older_than_30 = df["age"] > 30
df.where(older_than_30).show()


root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------+-----+----------------+----------+-----------------+
|summary| name|             age|department|           salary|
+-------+-----+----------------+----------+-----------------+
|  count|    5|               5|         5|                5|
|   mean| NULL|            31.6|      NULL|          69400.0|
| stddev| NULL|5.94138031100518|      NULL|19021.04098097683|
|    min|Alice|              25|   Finance|            50000|
|    max|  Eva|              40|        IT|            95000|
+-------+-----+----------------+----------+-----------------+

+-----+------+
| name|salary|
+-----+------+
|Alice| 50000|
|  Bob| 70000|
|Cathy| 52000|
|David| 80000|
|  Eva| 95000|
+-----+------+

+-----+---+----------+------+
| name|age|department|salary|
+-----+---+----------+------+
|David| 35|   Finance| 80000|
|  Eva| 40|        IT| 95000|
+-----+---+----------+------+

In [None]:
# Group once by department and reuse the grouping for both aggregations
by_department = df.groupBy("department")

# Mean salary within each department
by_department.avg("salary").show()

# Number of employees in each department
by_department.count().show()


+----------+-----------+
|department|avg(salary)|
+----------+-----------+
|        HR|    51000.0|
|        IT|    82500.0|
|   Finance|    80000.0|
+----------+-----------+

+----------+-----+
|department|count|
+----------+-----+
|        HR|    2|
|        IT|    2|
|   Finance|    1|
+----------+-----+



In [None]:
# Expose the DataFrame to Spark SQL under the view name "employees"
df.createOrReplaceTempView("employees")

# Per-department head count and mean salary, expressed as a SQL query.
# Adjacent string literals are concatenated into a single query string.
department_summary_sql = (
    "SELECT department, COUNT(*) as count, AVG(salary) as avg_salary "
    "FROM employees GROUP BY department"
)
result = spark.sql(department_summary_sql)
result.show()


+----------+-----+----------+
|department|count|avg_salary|
+----------+-----+----------+
|        HR|    2|   51000.0|
|        IT|    2|   82500.0|
|   Finance|    1|   80000.0|
+----------+-----+----------+



In [None]:
# Shut down the Spark session that was created earlier in the notebook.
spark.stop()
