# Basic PySpark examples

## 1. Initialize SparkSession

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the notebook's SparkSession, registered under the app name "BasicExample".
spark = (
    SparkSession.builder
    .appName("BasicExample")
    .getOrCreate()
)

## 2. Create a DataFrame from a list

In [3]:
# Column names and the matching sample rows, one (name, age) tuple per person.
columns = ["Name", "Age"]
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 28),
]

In [4]:
# Turn the in-memory rows into a DataFrame, using `columns` as the column names,
# then print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 28|
+-------+---+



## 3. Read a CSV file

In [16]:
# Load file.csv, treating its first line as the header and letting Spark infer
# the column types. Note that this rebinds `df`, replacing the list-based frame.
df = spark.read.csv(
    "file.csv",
    header=True,
    inferSchema=True,
)
df.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 30|
|    Bob| 25|
|Charlie| 35|
|  Diana| 22|
|    Eve| 28|
+-------+---+



## 4. Select columns and filter rows

In [8]:
# Show the Name and Age columns for rows whose Age is greater than 26.
(
    df.select("Name", "Age")
    .where(df["Age"] > 26)
    .show()
)


+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 30|
|Charlie| 35|
|    Eve| 28|
+-------+---+



## 5. Add a new column

In [17]:
from pyspark.sql.functions import col

# Add AgePlusTen (Age + 10) and rebind `df`, so the cells that follow operate on
# the frame that includes the new column.
df = df.withColumn(
    "AgePlusTen",
    col("Age") + 10,
)
df.show()

+-------+---+----------+
|   Name|Age|AgePlusTen|
+-------+---+----------+
|  Alice| 30|        40|
|    Bob| 25|        35|
|Charlie| 35|        45|
|  Diana| 22|        32|
|    Eve| 28|        38|
+-------+---+----------+



## 6. Group by and aggregate

In [10]:
# Count the number of rows for each distinct Age value and print the result.
(
    df.groupBy("Age")
    .count()
    .show()
)

+---+-----+
|Age|count|
+---+-----+
| 28|    1|
| 22|    1|
| 35|    1|
| 25|    1|
| 30|    1|
+---+-----+



## 7. Write DataFrame to Parquet

In [18]:
# Save `df` as Parquet at my_parquet_file.parquet; "overwrite" mode replaces any
# existing output at that path.
df.write.mode("overwrite").parquet("my_parquet_file.parquet")

## 8. Stop the SparkSession

In [19]:
# Stop the SparkSession created at the top of the notebook; run this last, since
# cells that use `spark` or `df` need the session to be active.
spark.stop()