# pandas API on Spark (formerly Koalas)
A pandas-like interface for working with big data on Apache Spark.

In [None]:
from spark_config import get_spark_session

# Initialize the Spark session through the project helper, registering the
# application under the name "PandasAPIDemo".
spark = get_spark_session(app_name="PandasAPIDemo")

In [None]:
# Import pandas API on Spark
import pyspark.pandas as ps
import pandas as pd

# Set default index type for better performance.
# NOTE(review): with 'distributed' the generated index values are presumably
# not consecutive or reproducible between runs - avoid relying on them
# (confirm against the pandas-on-Spark "default index type" docs).
ps.set_option('compute.default_index_type', 'distributed')

In [None]:
# Build a small demo frame from a column -> values mapping; the call has the
# same shape as the pandas DataFrame constructor.
psdf = ps.DataFrame(
    data={
        'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'age': [34, 45, 29, 52, 38],
        'city': ['Moscow', 'SPb', 'Moscow', 'SPb', 'Moscow'],
        'salary': [100000, 120000, 90000, 150000, 110000],
    },
)

# Kept as the last expression of the cell so the notebook renders the preview.
psdf.head(5)

In [None]:
# Pandas-like operations - all of them are applied to distributed data.

# Filtering: keep the rows whose 'age' is strictly greater than 35, using a
# boolean mask through the label-based indexer.
adults = psdf.loc[psdf['age'] > 35]
# One print call with a newline separator: heading first, then the frame.
print("People over 35:", adults, sep="\n")

In [None]:
# GroupBy - the aggregation is carried out on the cluster.
# Per city: mean/sum/count of 'salary' and the mean of 'age'.
city_stats = psdf.groupby(by='city').agg(
    {'salary': ['mean', 'sum', 'count'], 'age': 'mean'}
)
# One print call with a newline separator: heading first, then the result.
print("Stats by city:", city_stats, sep="\n")

In [None]:
# Read from S3
# psdf = ps.read_csv("s3a://bucket/data.csv")
# psdf = ps.read_parquet("s3a://bucket/data.parquet")

In [None]:
# Convert to a regular Spark DataFrame if needed, then print its schema.
# NOTE(review): to_spark() is called without an index column argument, so the
# pandas-on-Spark index is presumably not carried over - confirm if it matters.
spark_df = psdf.to_spark()
spark_df.printSchema()

In [None]:
# Convert to a regular pandas DataFrame. Be careful with large data:
# presumably the whole dataset is gathered into local (driver) memory -
# confirm that it fits before calling this.
pandas_df = psdf.to_pandas()
# Show the concrete type of the converted object.
print(type(pandas_df))

In [None]:
# Stop the session that was created in the first cell of the notebook.
spark.stop()