# 02 — PySpark Data Exploration
Same exploration as notebook 01, but in PySpark.
Goal: learn PySpark syntax by comparing it to the pandas version you already know.

## Starting a Spark Session
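In pandas, you just import pandas. In PySpark, you first create a **SparkSession** — the entry point to everything Spark does. Think of it as turning on the engine. Spark runs on the JVM, so a Java runtime must be installed and discoverable (for example via `JAVA_HOME`); if it is not, creating the session fails with a `JAVA_GATEWAY_EXITED` error.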

In [1]:
from pyspark.sql import SparkSession

# Configure the session first, then build it. getOrCreate() hands back an
# already-running session if there is one instead of starting a second.
# 'local[*]' means: run Spark locally, using all available CPU cores.
session_builder = SparkSession.builder.appName('CustomerRecommendationEngine')
session_builder = session_builder.master('local[*]')
spark = session_builder.getOrCreate()

print(f'Spark version: {spark.version}')
print('Spark session created.')

PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

## Loading CSVs
Pandas: pd.read_csv(file)

PySpark: spark.read.csv(file, header=True, inferSchema=True)

inferSchema=True tells Spark to guess column types instead of treating everything as strings.

In [None]:
# Relative path to the directory containing the raw CSV files.
RAW = '../data/raw/'

# header=True uses the first row as column names; inferSchema=True lets Spark
# derive column types from the data instead of reading everything as strings.
customers = spark.read.csv(RAW + 'olist_customers_dataset.csv', header=True, inferSchema=True)
orders = spark.read.csv(RAW + 'olist_orders_dataset.csv', header=True, inferSchema=True)
items = spark.read.csv(RAW + 'olist_order_items_dataset.csv', header=True, inferSchema=True)
payments = spark.read.csv(RAW + 'olist_order_payments_dataset.csv', header=True, inferSchema=True)

# The reviews file carries free-text comments. By default Spark parses CSV one
# physical line at a time and uses backslash as the escape character, so a
# quoted comment containing a line break or a doubled quote ("") is split into
# bogus rows, which inflates the row count and pollutes review_score.
# multiLine=True lets a quoted field span lines; escape='"' treats "" inside a
# quoted field as a literal quote, matching what pandas.read_csv does.
# NOTE(review): assumes the comments use standard CSV quoting — confirm that
# reviews.count() matches the row count seen in the pandas notebook.
reviews = spark.read.csv(
    RAW + 'olist_order_reviews_dataset.csv',
    header=True,
    inferSchema=True,
    multiLine=True,
    escape='"',
)

products = spark.read.csv(RAW + 'olist_products_dataset.csv', header=True, inferSchema=True)
sellers = spark.read.csv(RAW + 'olist_sellers_dataset.csv', header=True, inferSchema=True)
categories = spark.read.csv(RAW + 'product_category_name_translation.csv', header=True, inferSchema=True)

print('All files loaded into Spark DataFrames.')

## Inspecting a DataFrame
Pandas: df.head(), df.shape, df.dtypes

PySpark: df.show(), (df.count(), len(df.columns)), df.printSchema()

In [None]:
# Preview the first five rows as a text table.
orders.show(n=5)

# Print each column's name, inferred type, and nullability as a tree.
orders.printSchema()

## Table Shapes and Missing Values
Pandas: df.isnull().sum()

PySpark: No built-in equivalent. You count nulls per column with F.count(F.when(...))

PySpark is more verbose here. The tradeoff: it scales to billions of rows.

In [None]:
from pyspark.sql import functions as F

# Map a display name to each DataFrame so the report loop can label its output.
tables = {
    'customers': customers, 'orders': orders, 'items': items,
    'payments': payments, 'reviews': reviews, 'products': products,
    'sellers': sellers, 'categories': categories
}

# For every table, print its shape and the percentage of cells that are null.
for name, df in tables.items():
    row_count = df.count()
    col_count = len(df.columns)
    # F.when(cond, value) without .otherwise() yields null where cond is false,
    # and F.count() skips nulls, so each aggregate counts only the null cells
    # of its column. The single collected Row holds one count per column.
    null_count = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).collect()[0]
    total_nulls = sum(null_count)
    # An empty table has zero cells: report 0% instead of dividing by zero.
    total_cells = row_count * col_count
    pct_missing = (total_nulls / total_cells) * 100 if total_cells else 0.0
    print(f'{name:12s}  {row_count:>7,} rows x {col_count:>2} cols  |  {pct_missing:.1f}% missing')

## Key Counts and Date Range
Pandas: df[col].nunique()

PySpark: df.select(F.countDistinct(col)).collect()[0][0]

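Notice: PySpark does not return values directly. Transformations such as select() only describe a computation; an action such as .collect() runs it and pulls the results back to your machine as a list of Row objects — hence the [0][0] to reach the single value in the first row.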

In [None]:
def _distinct_count(df, column):
    """Run F.countDistinct on `column` of `df` and return the single result value."""
    # collect() returns a list with one Row; that Row has one field.
    result_rows = df.select(F.countDistinct(column)).collect()
    return result_rows[0][0]


n_customers = _distinct_count(customers, 'customer_unique_id')
n_orders = _distinct_count(orders, 'order_id')
n_products = _distinct_count(products, 'product_id')
n_sellers = _distinct_count(sellers, 'seller_id')

# One Row holding the earliest and latest purchase timestamp, in that order.
purchase_ts = 'order_purchase_timestamp'
date_bounds = orders.select(F.min(purchase_ts), F.max(purchase_ts))
date_range = date_bounds.collect()[0]

print(f'Unique customers:  {n_customers:,}')
print(f'Unique orders:     {n_orders:,}')
print(f'Unique products:   {n_products:,}')
print(f'Unique sellers:    {n_sellers:,}')
print(f'Order date range:  {date_range[0]} to {date_range[1]}')

## Review Score Distribution
Pandas: df[col].value_counts()

PySpark: df.groupBy(col).count().orderBy(col)

In [None]:
# Number of reviews per score, sorted by score.
score_counts = reviews.groupBy('review_score').count()
score_counts.orderBy('review_score').show()

# How many reviews include a written comment, out of all reviews.
total = reviews.count()
commented_reviews = reviews.where(F.col('review_comment_message').isNotNull())
has_text = commented_reviews.count()
print(f'Reviews with text: {has_text:,} / {total:,} ({has_text/total*100:.0f}%)')

## Joining Tables
Pandas: pd.merge(df1, df2, on=key)

PySpark: df1.join(df2, on=key, how='left')

In [None]:
# Left join keeps every product, attaching the translated category name
# where the translation table has a matching product_category_name.
product_cats = products.join(categories, on='product_category_name', how='left')

# Products per English category name, most frequent first.
category_counts = product_cats.groupBy('product_category_name_english').count()
top_categories = category_counts.orderBy(F.col('count').desc())

print('Top 10 Product Categories:')
top_categories.show(n=10, truncate=False)

# Payment rows per payment type, most frequent first.
payment_counts = payments.groupBy('payment_type').count()
payment_ranking = payment_counts.orderBy(F.col('count').desc())

print('Payment Methods:')
payment_ranking.show()

## Order Value Summary
Pandas: df.groupby(col)[val].sum().describe()

PySpark: df.groupBy(col).agg(F.sum(val)) then .summary()

In [None]:
# Total item price per order, exposed as the column 'order_value'.
order_total = F.sum('price').alias('order_value')
order_values = items.groupBy('order_id').agg(order_total)

# summary() computes descriptive statistics for the DataFrame's columns.
order_stats = order_values.summary()
order_stats.show()

## Stop the Spark Session
Always stop Spark when done — it frees up resources.

In [None]:
# Shut down the session created at the top of the notebook; the DataFrames
# built from it cannot be used after this point.
spark.stop()
print('Spark session stopped.')