In [1]:
# --- 1) Point PySpark to the exact Python you're running right now ---
import os
import sys

from pyspark.sql import SparkSession

# Interpreter that is running this kernel; it is handed to Spark below so the
# driver and the Python workers all use the same executable.
py = sys.executable  # e.g., C:\Users\PX\anaconda3\envs\music-chatbot\python.exe
os.environ["PYSPARK_DRIVER_PYTHON"] = py
os.environ["PYSPARK_PYTHON"] = py

# --- 2) Stop any existing Spark session cleanly (if already created) ---
try:
    spark.stop()
except NameError:
    # Fresh kernel: no `spark` variable exists yet, so there is nothing to stop.
    pass
except Exception as exc:
    # Still best-effort (a stale session must not block building a new one),
    # but report the failure instead of hiding it.
    print(f"Warning: could not stop the existing Spark session cleanly: {exc!r}")

# --- 3) Build (or fetch) the session with the interpreter pinned ---
spark = (
    SparkSession.builder
    .appName("PySpark-Windows-Fix")
    # Ensures executors pick the same Python as the driver
    .config("spark.pyspark.driver.python", py)
    .config("spark.pyspark.python", py)
    # Optional: Arrow speeds up Pandas ↔ Spark conversions if available
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

# 🔥 PySpark Must-Know Commands (Cheat Sheet)

Use this as your daily reference + interview primer. Each snippet shows what it does and why you’d use it.

---

## 1) Start / Get SparkSession
Create or get the main entry point to Spark SQL/DataFrame API:

from pyspark.sql import SparkSession  
spark = SparkSession.builder.appName("CheatSheet").getOrCreate()

---

## 2) Read & Write Data (CSV / Parquet / JSON)
CSV: quick exploration  
Parquet: columnar, compressed, fastest in Spark  
JSON: semi-structured

df = spark.read.option("header", True).option("inferSchema", True).csv("data/file.csv")  
df.write.mode("overwrite").option("header", True).csv("out/csv/")  
dfp = spark.read.parquet("data/parquet/")  
dfp.write.mode("overwrite").parquet("out/parquet/")  
dfj = spark.read.json("data/json/")

---

## 3) Inspect Data Quickly
Peek rows, schema, columns, summary stats

df.show(10, truncate=False)  
df.printSchema()  
df.columns  
df.describe().show()

---

## 4) Select, Filter, Where
Core row/column selection and predicates

from pyspark.sql import functions as F, types as T  
df.select("colA", "colB")  
df.select(F.col("colA").alias("a"))  
df.filter(F.col("amount") > 100)  
df.where((F.col("age") >= 18) & (F.col("country") == "CH"))

---

## 5) Add / Transform Columns
Create, cast, conditional logic, rename, drop

df.withColumn("double_amt", F.col("amount") * 2)  
df.withColumn("age_int", F.col("age").cast(T.IntegerType()))  
df.withColumn("segment", F.when(F.col("amount") >= 1000, "VIP").otherwise("STD"))  
df.withColumnRenamed("old", "new")  
df.drop("unneeded_col")

---

## 6) Null Handling
Drop/fill nulls; coalesce picks first non-null

df.na.drop(subset=["age", "city"])  
df.na.fill({"age": 0, "city": "Unknown"})  
df.select(F.coalesce("nickname", "name").alias("display_name"))

---

## 7) String & Date Helpers
Common text transforms; parse and format dates/timestamps

df.select(F.trim("name"), F.lower("email").alias("email_lc"), F.concat_ws(" ", "first", "last").alias("full_name"))  
df.select(F.to_date("dt", "yyyy-MM-dd").alias("date"), F.date_add(F.to_date("dt", "yyyy-MM-dd"), 7).alias("date_plus_7"))

---

## 8) Sort, Limit, Distinct
Presentation + deduplication

df.orderBy(F.desc("amount"))  
df.limit(100)  
df.select("user_id").distinct()  
df.dropDuplicates(["user_id", "day"])

---

## 9) GroupBy & Aggregations
Summaries by key(s); alias for readable column names

agg = (df.groupBy("country", "year")  
        .agg(F.count("*").alias("cnt"),  
             F.sum("amount").alias("sum_amt"),  
             F.avg("amount").alias("avg_amt"),  
             F.approx_count_distinct("user_id").alias("approx_users")))

---

## 10) Joins (inner/left/right/full/semi/anti)
Combine datasets; semi/anti for existence filtering

df1.join(df2, on="id", how="inner")  
df1.join(df2, df1.id == df2.user_id, "left")  
df_left_only = df1.join(df2, "id", "left_anti")  
df_semi_exist = df1.join(df2, "id", "left_semi")

---

## 11) Window Functions (analytics over partitions)
Rank, running totals, lag/lead within groups

from pyspark.sql.window import Window  
w = Window.partitionBy("country").orderBy(F.desc("amount"))  
df.select("user","country","amount",  
          F.row_number().over(w).alias("rn"),  
          F.sum("amount").over(w).alias("running_sum"))

---

## 12) Repartition / Coalesce
Control parallelism; coalesce only reduces partitions (no full shuffle)

df = df.repartition(32, "country")  
df_small = df.coalesce(4)

---

## 13) Cache / Persist / Unpersist
Keep reused results in memory/disk to speed re-use

df_cached = df.cache()  
df_cached.count()  
df_cached.unpersist()

---

## 14) SQL with Temp Views
Mix SQL with DataFrame API; handy for quick queries

df.createOrReplaceTempView("t")  
spark.sql("SELECT country, COUNT(*) cnt FROM t GROUP BY country ORDER BY cnt DESC").show()

---

## 15) Write Options (mode, partitioning)
Overwrite/append; partition files by columns for faster reads

(df.write.mode("overwrite")  
    .partitionBy("year","month")  
    .parquet("out/sales_by_ym/"))

---

## 16) Read Multiple Files / Globs
Ingest folders or patterns at once

spark.read.parquet("s3://bucket/path/2025/*/*.parquet")  
spark.read.csv(["path/a.csv","path/b.csv"], header=True)

---

## 17) Hints & Skew Helpers
Broadcast for small dimension tables; salting/repartition for skew

df_big.join(F.broadcast(df_small_dim), "key", "inner")  
df_big.repartition("key")

---

## 18) UDFs (use sparingly) & Built-ins
Prefer built-ins; UDFs break optimizations. Pandas UDFs for vectorized speed.

from pyspark.sql.functions import udf, pandas_udf  
@udf(T.IntegerType())  
def add1(x): return x + 1  
df.withColumn("x_plus1", add1("x"))

---

## 19) Explain Plan
See logical/physical plan; confirm shuffles and scans

df.explain(mode="formatted")

---

## 20) Pandas ↔ Spark Interop
Convert small results to Pandas; create Spark DF from Pandas

pdf = df.limit(10000).toPandas()  
df2 = spark.createDataFrame(pdf)

---

## 21) Sampling, Splits, Set Ops
Quick samples; train/test split; set operations

df.sample(withReplacement=False, fraction=0.1, seed=42)  
train, test = df.randomSplit([0.8, 0.2], seed=42)  
df1.unionByName(df2, allowMissingColumns=True)  
df1.intersect(df2)  
df1.subtract(df2)

---

## 22) Arrays / Maps / Explode
Work with nested/list data

df.select(F.size("items"), F.array_contains("items","foo"))  
df.select(F.explode("items").alias("item"))

---

## 23) JSON in Columns
Parse JSON strings into structs; re-serialize when needed

from pyspark.sql import types as T  
schema = T.StructType([T.StructField("a", T.IntegerType()), T.StructField("b", T.StringType())])  
df.select(F.from_json("json_str", schema).alias("obj")).select("obj.*")

---

## 24) JDBC (Databases)
Read/write via JDBC connectors (supply driver JAR)

jdbc_df = (spark.read.format("jdbc")  
    .option("url", "jdbc:postgresql://host/db")  
    .option("dbtable", "public.sales")  
    .option("user", "user").option("password", "pass")  
    .load())

---

## 25) Write Modes Recap
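Control overwrite/append behavior. `mode` accepts `"overwrite"`, `"append"`, `"ignore"`, and `"error"`/`"errorifexists"` (the default, which fails if the target already exists).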

df.write.mode("overwrite").parquet("out/")

---

### ✅ Pro Tips
- Prefer Parquet + column pruning + predicate pushdown  
- Keep transformations narrow before wide ops (filter early)  
- Tune spark.sql.shuffle.partitions to match cluster size  
- Use broadcast joins for small dims, cache reused DataFrames


In [2]:
from pathlib import Path
import urllib.request

from pyspark.sql import functions as F

# 1) Pick a CSV to download (replace with any direct CSV URL if you want)
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data_dir = Path.cwd() / "data"
data_dir.mkdir(exist_ok=True)
local_csv = data_dir / "titanic.csv"

# 2) Download to local file (only if not already present)
if not local_csv.exists():
    print(f"Downloading to {local_csv} ...")
    # Download to a sibling ".part" file and rename only on success. Otherwise an
    # interrupted download would leave a truncated titanic.csv that the
    # exists() check above would accept as a valid cache on every later run.
    partial_csv = local_csv.with_name(local_csv.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial_csv.as_posix())
        partial_csv.replace(local_csv)
    finally:
        # After a successful rename the .part file is gone; this only cleans up
        # the leftovers of a failed download.
        if partial_csv.exists():
            partial_csv.unlink()
else:
    print(f"Already exists: {local_csv}")

# 3) Read with Spark (first row is the header; column types are inferred)
df = spark.read.option("header", True).option("inferSchema", True).csv(local_csv.as_posix())

# 4) Inspect
df.printSchema()
df.show(5)

Already exists: c:\Users\PX\Desktop\java-python-ipynb-practice\Python\3. Pyspark\data\titanic.csv
root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   

In [3]:
# --- Projection and row filter ---
# Keep only the columns used below, then restrict to passengers aged 18 or over.
selected_columns = ["PassengerId", "Name", "Sex", "Age", "Pclass", "Survived"]
df_select = df.select(*selected_columns)
is_adult = F.col("Age") >= 18
df_filter = df_select.where(is_adult)

# --- Grouped summary ---
# One row per (Sex, Pclass): row count, mean age, and mean of the Survived
# column, ordered with the largest groups first.
summary_exprs = [
    F.count("*").alias("cnt"),
    F.avg("Age").alias("avg_age"),
    F.avg("Survived").alias("survival_rate"),
]
agg = (
    df_filter
    .groupBy("Sex", "Pclass")
    .agg(*summary_exprs)
    .orderBy(F.col("cnt").desc())
)

# --- Null handling ---
# Replace missing Age values with 0 on the unfiltered projection.
clean = df_select.fillna({"Age": 0})

# --- Display ---
# Each entry is (heading, DataFrame, number of rows to show), in output order.
report_sections = [
    ("\n=== Filtered adults (Age >= 18) – sample ===", df_filter, 5),
    ("\n=== Aggregation by Sex, Pclass ===", agg, 10),
    ("\n=== Null handling (filled Age with 0) – sample ===", clean, 5),
]
for heading, frame, n_rows in report_sections:
    print(heading)
    frame.show(n_rows, truncate=False)

print(f"\nLocal CSV path used: {local_csv.as_posix()}")


=== Filtered adults (Age >= 18) – sample ===
+-----------+---------------------------------------------------+------+----+------+--------+
|PassengerId|Name                                               |Sex   |Age |Pclass|Survived|
+-----------+---------------------------------------------------+------+----+------+--------+
|1          |Braund, Mr. Owen Harris                            |male  |22.0|3     |0       |
|2          |Cumings, Mrs. John Bradley (Florence Briggs Thayer)|female|38.0|1     |1       |
|3          |Heikkinen, Miss. Laina                             |female|26.0|3     |1       |
|4          |Futrelle, Mrs. Jacques Heath (Lily May Peel)       |female|35.0|1     |1       |
|5          |Allen, Mr. William Henry                           |male  |35.0|3     |0       |
+-----------+---------------------------------------------------+------+----+------+--------+
only showing top 5 rows

=== Aggregation by Sex, Pclass ===
+------+------+---+------------------+----------