In [None]:
# Use the %pip magic rather than a shell escape: %pip installs into the
# environment of the running kernel, whereas `!pip` runs whichever pip is
# first on the shell PATH.
%pip install --upgrade polars
%pip install --upgrade pandas

In [None]:
# %pip installs into the running kernel's environment (a bare `!pip` may not).
# NOTE(review): pandasql is not referenced by any cell visible here -- confirm
# it is still needed.
%pip install pandasql

In [None]:
import pandas as pd
import numpy as np
import time
import math

# Generate a large Pandas DataFrame.
# Seed NumPy's global RNG first so every run benchmarks the same values
# (42 is the seed the Polars cells in this notebook use as well).
np.random.seed(42)
num_rows = 250_000_000  # float64 values: roughly 2 GB per column at this size
df = pd.DataFrame({
    'value': np.random.rand(num_rows) * 100
})

def python_udf(value: float) -> float:
    """Scale ``value`` by 1.1 and add its sine.

    This is the scalar flavour of the transform: it handles one Python float
    per call and relies on ``math.sin``, so it must be invoked element by
    element.

    Args:
        value: The number to transform.

    Returns:
        ``value * 1.1 + sin(value)``.
    """
    scaled = value * 1.1
    return scaled + math.sin(value)

# Vectorized counterpart of python_udf: called once on the whole column
def pandas_udf(series: pd.Series) -> pd.Series:
    """Compute ``x * 1.1 + sin(x)`` for every element of ``series`` at once.

    Args:
        series: Numeric column to transform.

    Returns:
        A new Series holding the transformed values.
    """
    sine_part = np.sin(series)
    return series * 1.1 + sine_part

# Measure time for Pandas apply with Python UDF.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can jump if the
# system time is adjusted mid-run.
start_time = time.perf_counter()
df['apply_result'] = df['value'].apply(python_udf)
apply_time = time.perf_counter() - start_time
print(f"Time taken for Pandas apply with Python UDF: {apply_time:.2f} seconds")

# Measure time for Pandas UDF (vectorized, one call for the whole column)
start_time = time.perf_counter()
df['pandas_udf_result'] = pandas_udf(df['value'])
pandas_udf_time = time.perf_counter() - start_time
print(f"Time taken for Pandas UDF: {pandas_udf_time:.2f} seconds")

# Compare the results
# print(f"Pandas apply with Python UDF is {apply_time / pandas_udf_time:.2f} times slower than Pandas UDF.")


In [2]:
import polars as pl
import numpy as np
import time

# Generate sample data (500M rows)
# NOTE(review): the pandas benchmark cell above uses 250M rows, so the two
# timings are only comparable if the sizes are made equal -- confirm intent.
np.random.seed(42)
num_rows = 500_000_000
polars_df = pl.DataFrame({
    'value': np.random.rand(num_rows) * 100
})

# Measure time for Polars expression (vectorized computation).
# perf_counter() is monotonic and high-resolution, which makes it the right
# clock for timing an interval (time.time() follows the adjustable wall clock).
start_time = time.perf_counter()
df_polars_expr = polars_df.with_columns(
    (pl.col("value") * 1.1 + pl.col("value").sin()).alias("pandas_udf_result")
)
polars_expr_time = time.perf_counter() - start_time
print(f"Time taken for Polars expression: {polars_expr_time:.2f} seconds")


Time taken for Polars expression: 14.10 seconds


In [None]:
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.parquet as pq
import time
import math
from typing import Callable

# Generate sample data (100M rows)
np.random.seed(42)
N_ROWS = 100_000_000  # 100 million rows
num_rows = N_ROWS  # same value under the older name; derived so the two cannot drift apart
# NOTE(review): `df` is not referenced again in the cells visible here, but
# building it draws from the seeded RNG before create_test_data() does --
# confirm before removing it.
df = pl.DataFrame({
    'value': np.random.rand(num_rows) * 100
})

def create_test_data(n_rows: "int | None" = None,
                     file_path: str = "large_dataset.parquet") -> str:
    """Write a two-column table of normally distributed floats to Parquet.

    Args:
        n_rows: Number of rows to generate. ``None`` (the default) means the
            module-level ``N_ROWS``, looked up at call time.
        file_path: Destination of the Parquet file.

    Returns:
        The path the file was written to (``file_path``).
    """
    if n_rows is None:
        # Resolve the default lazily so a later change to N_ROWS is honoured.
        n_rows = N_ROWS

    data = {
        'val1': np.random.normal(100, 15, n_rows),
        'val2': np.random.normal(50, 10, n_rows),
    }

    # Build a pandas DataFrame, convert it to an Arrow table, write it as Parquet
    df_pd = pd.DataFrame(data)
    table = pa.Table.from_pandas(df_pd)
    pq.write_table(table, file_path)

    return file_path

def complex_calculation(x: float, y: float) -> float:
    """Scalar kernel used as the per-row UDF in the benchmark.

    Evaluates ``exp(-(x - y)^2 / 100) * sin(x / 10) * cos(y / 5)`` with the
    ``math`` module, i.e. for a single pair of Python floats.

    Args:
        x: First operand.
        y: Second operand.

    Returns:
        The value of the expression above.
    """
    delta = x - y
    damping = math.exp(-(delta ** 2) / 100)
    return damping * math.sin(x / 10) * math.cos(y / 5)

def pd_complex_calculation(x: pd.Series, y: pd.Series) -> pd.Series:
    """Vectorized form of ``complex_calculation`` for whole columns.

    Evaluates ``exp(-(x - y)^2 / 100) * sin(x / 10) * cos(y / 5)`` element-wise
    using NumPy ufuncs instead of a Python-level call per row.

    Args:
        x: First operand column.
        y: Second operand column.

    Returns:
        A Series with one result per input row.
    """
    gap = x - y
    damping = np.exp(-gap ** 2 / 100)
    return damping * np.sin(x / 10) * np.cos(y / 5)

# 1. Standard Pandas Implementation
def pandas_transform(file_path: str) -> pd.DataFrame:
    """Load the Parquet file with pandas, add ``complex_val``, and time the run.

    Args:
        file_path: Path of a Parquet file containing ``val1`` and ``val2``.

    Returns:
        The loaded DataFrame with an added ``complex_val`` column.

    Side effects:
        Prints the elapsed time in seconds.
    """
    # Start the clock before the read: polars_transform times a lazy scan whose
    # file I/O happens inside collect(), so the read has to be included here
    # too for the two numbers to cover the same work. perf_counter() is the
    # monotonic, high-resolution clock meant for measuring intervals.
    start_time = time.perf_counter()

    # Pyarrow backend is much slower for some reason
    # df = pd.read_parquet(file_path, dtype_backend="pyarrow")
    df = pd.read_parquet(file_path)

    # Complex transformations (vectorized). Row-wise alternative, kept for
    # reference:
    # df['complex_val'] = df.apply(
    #     lambda row: complex_calculation(row['val1'], row['val2']),
    #     axis=1
    # )
    df['complex_val'] = pd_complex_calculation(df['val1'], df['val2'])

    execution_time = time.perf_counter() - start_time
    print(f"Standard Pandas execution time: {execution_time:.2f} seconds")

    return df

# 2. Polars Implementation
def polars_transform(file_path: str) -> pl.DataFrame:
    """Lazily scan the Parquet file with Polars, add ``complex_val``, and time it.

    ``complex_val`` is produced by calling the scalar ``complex_calculation``
    once per row through ``map_elements`` on a struct of ``val1``/``val2``.

    Args:
        file_path: Path of a Parquet file containing ``val1`` and ``val2``.

    Returns:
        The collected DataFrame with an added ``complex_val`` column.

    Side effects:
        Prints the elapsed time in seconds (scan, UDF and collect included).
    """
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump if the system time is adjusted.
    start_time = time.perf_counter()

    df = pl.scan_parquet(file_path)

    result = (
        df.with_columns([
            # Complex calculation
            pl.struct(['val1', 'val2'])
            .map_elements(
                lambda x: complex_calculation(x['val1'], x['val2']),
                # Declare the output type up front so the lazy plan knows the
                # column's dtype instead of having to infer it from the UDF.
                return_dtype=pl.Float64,
            )
            .alias('complex_val'),
        ])
        .collect()
    )

    execution_time = time.perf_counter() - start_time
    print(f"Polars execution time: {execution_time:.2f} seconds")

    return result

file_path = create_test_data()

# Run implementations
# (Full pandas run, currently disabled:)
# print("Running standard Pandas implementation...")
# pandas_result = pandas_transform(file_path)

print("\nRunning Polars implementation...")
polars_result = polars_transform(file_path)

# Verify results match on a small sample. The reference values come from the
# vectorized pandas/NumPy formula applied to the very same rows, so the check
# does not depend on the (disabled) full pandas run above.
pl_sample = polars_result.head(1000).to_pandas()

print("\nVerifying results...")
expected_sample = pd_complex_calculation(pl_sample['val1'], pl_sample['val2'])
max_diff = (expected_sample - pl_sample['complex_val']).abs().max()
print(f"Pandas vs Polars max diff: {max_diff}")


In [None]:
import pandas as pd
# Print the version string of the pandas installation the kernel imported.
print(pd.__version__)