# Scaling Pandas Workflows with PySpark's Pandas API

In [None]:
import warnings

# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter("ignore", category=FutureWarning)

In [None]:
import pandas as pd

# Plain pandas baseline: a single "value" column holding 1..5, whose mean is
# computed and printed immediately.
pandas_df = pd.DataFrame(data={"value": [1, 2, 3, 4, 5]})
print(pandas_df.loc[:, "value"].mean())

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Obtain a Spark session (getOrCreate reuses an existing one if present).
spark = SparkSession.builder.getOrCreate()

# Five single-field rows holding 1..5 under the column name "value".
spark_df = spark.createDataFrame(
    [(number,) for number in range(1, 6)],
    schema=["value"],
)
# Native PySpark spelling of the mean: an avg() column expression shown as a table.
spark_df.select(avg(spark_df["value"])).show()

## Basic Operations Comparison

In [None]:
import pyspark.pandas as ps

# A pandas-on-Spark Series built from a Python list, using the pandas-style constructor.
ps_s = ps.Series(data=[1, 3, 5, 6, 8])

In [None]:
import numpy as np

# Build the one-million-row demo frame with the pandas API on Spark (`ps`)
# rather than plain pandas, so that every later `ps_df` cell exercises Spark.
# "id" counts 1..1_000_000; "value" holds standard-normal random draws.
ps_df = ps.DataFrame(
    {"id": np.arange(1, 1_000_001), "value": np.random.randn(1_000_000)}
)

In [None]:
# Convert the existing pandas DataFrame into a pandas-on-Spark DataFrame.
ps_df_from_pandas = ps.from_pandas(pandas_df)

In [None]:
# Summary statistics for the frame's columns; as the last expression in the
# cell, the result is rendered as the cell output.
ps_df.describe()

## Data Exploration and Analysis

In [None]:
# Print a concise structural summary of the DataFrame (columns, dtypes,
# non-null counts) using the same info() call as in pandas.
ps_df.info()


In [None]:
# Preview the first rows of the frame (head() defaults to five rows).
ps_df.head()

In [None]:
# Keep only the rows whose value is positive: where() replaces rows that fail
# the condition with NaN, and dropna() then discards those rows.
filtered_df = ps_df.where(ps_df["value"] > 0).dropna()
filtered_df.head()


In [None]:
# Small categorical frame used by the group-by and bar-chart cells.
ps_df_2 = ps.DataFrame(
    dict(
        category=["A", "B", "A", "C", "B"],
        value=[10, 20, 15, 30, 25],
    )
)

## Groupby Operations

In [None]:
# Average "value" within each category, using the usual pandas group-by syntax.
ps_df_2.groupby("category")["value"].mean()

In [None]:
# Histogram of the "value" column via the pandas-style plotting accessor.
ps_df.value.plot.hist()

## Visualization

In [None]:
# Bar chart with one bar per row: "category" on the x-axis, "value" as the height.
ps_df_2.plot.bar(x="category", y="value")

In [None]:
# Write the frame out as CSV, then read it back through the pandas API on
# Spark and preview the result to confirm the round trip.
# NOTE(review): index=False is the pandas spelling for omitting the row index;
# verify it is honoured if ps_df is a pandas-on-Spark frame.
ps_df.to_csv("output_data.csv", index=False)
ps.read_csv("output_data.csv").head()

## File I/O Operations

In [None]:
# Same round trip in Parquet: write the frame out, read it back through the
# pandas API on Spark, and preview the first rows.
ps_df.to_parquet("output_data.parquet")
ps.read_parquet("output_data.parquet").head()

In [None]:
from sklearn.linear_model import LinearRegression

# Start from a one-million-row pandas-on-Spark DataFrame with two feature
# columns and a target column, each a simple integer range.
large_pdf_df = ps.DataFrame(
    dict(
        feature1=range(1_000_000),
        feature2=range(1_000_000, 2_000_000),
        target=range(500_000, 1_500_000),
    )
)
print(f"Length of the original DataFrame: {len(large_pdf_df):,}")

# Shrink the data before handing it to scikit-learn: rows are bucketed by
# feature1 // 10_000 and every column is averaged within its bucket.
aggregated = large_pdf_df.groupby(large_pdf_df["feature1"] // 10_000).mean()
print(f"Length of the aggregated DataFrame: {len(aggregated):,}")

# scikit-learn works on in-memory data, so convert the reduced result to pandas.
small_pdf = aggregated.to_pandas()

# Fit an ordinary linear regression on the aggregated features.
X, y = small_pdf[["feature1", "feature2"]], small_pdf["target"]
model = LinearRegression()
model.fit(X, y)


## Integration with Scikit-learn

In [None]:
# Plain pandas is eager: the addition is carried out as soon as this line runs.
pandas_df["value"] = pandas_df["value"].add(1)
print(pandas_df)

## Lazy vs Eager Evaluation

In [None]:
# Using Pandas API on Spark: derive a new column "a" equal to value + 1.
# NOTE(review): the lazy/eager remarks below hold only when ps_df is a
# pandas-on-Spark DataFrame; a plain pandas frame evaluates assign() at once.
updated_psdf = ps_df.assign(a=ps_df["value"] + 1)  # Lazy operation
print(updated_psdf.head())  # Triggers actual computation

In [None]:
from pyspark.sql.functions import col

# Native PySpark version of a column-wise sum: three rows, two integer columns.
pyspark_df = spark.createDataFrame(
    [(1, 4), (2, 5), (3, 6)],
    schema=["col1", "col2"],
)
# Add the two columns, label the result "sum", and print it as a table.
pyspark_df.select((pyspark_df["col1"] + pyspark_df["col2"]).alias("sum")).show()

## Advanced Operations

In [None]:
# The same column-wise sum written with pandas syntax on a pandas-on-Spark frame.
pandas_spark_df = ps.DataFrame(dict(col1=[1, 2, 3], col2=[4, 5, 6]))
(pandas_spark_df.col1 + pandas_spark_df.col2).head()

In [None]:
# Convert the pandas-on-Spark DataFrame into a native PySpark DataFrame.
spark_native_df = pandas_spark_df.to_spark()

# From here on the regular PySpark DataFrame API applies: build the sum with
# column expressions, name it "sum", and print the result.
spark_native_df.select((col("col1") + col("col2")).alias("sum")).show()