In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, avg, count, when
from pyspark.sql.window import Window

# Build the notebook's SparkSession, or reuse one that is already active:
# master "local[*]", driver memory "2g".
spark = (
    SparkSession.builder
    .appName("SparkTutorial")
    .master("local[*]")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Lower log verbosity to WARN so cell output is not buried in log lines.
spark.sparkContext.setLogLevel("WARN")

# Confirm the session is up and report which Spark version it runs.
print(f"Spark Version: {spark.version}")
print("SparkSession ready!")

## 1. Creating DataFrames

In [None]:
# Column names for the sample table, in the same order as the tuple fields below.
columns = ["id", "name", "department", "salary"]

# One tuple per employee: (id, name, department, salary).
data = [
    (1, "Alice", "Engineering", 75000),
    (2, "Bob", "Marketing", 65000),
    (3, "Charlie", "Engineering", 80000),
    (4, "Diana", "Sales", 70000),
    (5, "Eve", "Engineering", 72000),
]

# Only names are supplied as the schema, so Spark infers each column's type
# from the Python values.
employees = spark.createDataFrame(data, schema=columns)
employees.show()

In [None]:
# Check schema: print the column names and types as a tree.
# Only column names were passed to createDataFrame, so the types shown here
# are the ones Spark inferred from the Python values.
employees.printSchema()

## 2. Basic Transformations

In [None]:
# Project the table down to two columns; the other columns are dropped
# from the result, while `employees` itself is left untouched.
name_and_salary = employees.select("name", "salary")
name_and_salary.show()

In [None]:
# Keep only the rows whose department is exactly "Engineering".
is_engineering = col("department") == "Engineering"
engineers = employees.filter(is_engineering)
engineers.show()

In [None]:
# Derive a new "bonus" column as a fixed fraction (10%) of each salary.
# withColumn returns a new DataFrame; `employees` keeps its original columns.
bonus_rate = 0.10
with_bonus = employees.withColumn("bonus", col("salary") * bonus_rate)
with_bonus.show()

## 3. Aggregations

In [None]:
# Per-department summary: headcount, mean salary, and salary total.
summary_columns = [
    count("*").alias("count"),
    avg("salary").alias("avg_salary"),
    spark_sum("salary").alias("total_salary"),
]

# One output row per distinct department value.
dept_stats = employees.groupBy("department").agg(*summary_columns)
dept_stats.show()

## 4. Spark SQL

In [None]:
# Make the DataFrame queryable from SQL under the view name "employees"
# (an existing view with that name is replaced).
employees.createOrReplaceTempView("employees")

# Headcount and average salary per department, highest average first.
dept_salary_query = """
    SELECT department, COUNT(*) as cnt, AVG(salary) as avg_sal
    FROM employees
    GROUP BY department
    ORDER BY avg_sal DESC
"""

result = spark.sql(dept_salary_query)
result.show()

## 5. Window Functions

In [None]:
# NOTE(review): row_number is imported here but not used in this cell.
from pyspark.sql.functions import row_number, rank

# Window definition: rows are grouped by department and, within each group,
# ordered from the highest salary to the lowest.
salary_desc = col("salary").desc()
window_spec = Window.partitionBy("department").orderBy(salary_desc)

# rank() numbers rows from 1 inside each department; equal salaries share a
# rank and the following rank is skipped.
salary_rank = rank().over(window_spec)
ranked = employees.withColumn("rank", salary_rank)
ranked.show()

## 6. Cleanup

In [None]:
# Stop SparkSession when done.
# stop() shuts down the underlying SparkContext, so rerun the session-creation
# cell before executing any Spark cell again.
spark.stop()
print("SparkSession stopped")