In [4]:
import polars as pl
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import gc

# Step 1: Read Data with Lags
def read_data_with_lags(current_file_path, previous_file_path, columns_needed, lag_column="responder_6", lag_name="responder_6_lag_1"):
    """Read the current file and add lagged data from the previous file."""
    # Read current data
    current_file = pl.scan_parquet(current_file_path).select(columns_needed)
    
    # If there is a previous file, add lagged data
    if previous_file_path:
        previous_file = pl.scan_parquet(previous_file_path).select(["time_id", lag_column])
        lagged_data = previous_file.rename({lag_column: lag_name})
        current_file = current_file.join(lagged_data, on="time_id", how="left")
    
    # Collect to DataFrame
    current_file = current_file.fetch(10000)
    return current_file

# Step 2: Clean Data
def clean_data(df):
    """Clean the data by handling missing values."""
    df = df.fill_null(strategy="backward").fill_null(strategy="forward").fill_null(0)
    return df

# Placeholder function for anonymizing data
def anonomyize(df):
    pass

# Step 3: Preprocess Data
def preprocess_data(df, target_column='responder_6', n_components=0.95):
    """Apply PCA to reduce dimensions."""
    # Drop the target column and convert to numpy
    X = df.drop(target_column).to_numpy()
    y = df[target_column].to_numpy()

    # Apply PCA
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    return X_pca, y

# Step 4: Train Model
def train_model(X, y):
    """Train the model using XGBoost."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Mean Absolute Error (MAE): {mae}")
    return model

# Jupyter Notebook - File Paths and Data Preparation
file_paths = [
    f"/Users/elireadnour/Documents/Computer/Classes/machineLearning/MLFinal/jane-street-real-time-market-data-forecasting/train.parquet/partition_id={i}/part-0.parquet" for i in range(10)
]
columns_needed = [
    'date_id', 'time_id', 'symbol_id', 'weight', 'feature_00', 'feature_01',
    'feature_02', 'feature_03', 'feature_04', 'feature_05', 'feature_06',
    'feature_07', 'feature_08', 'feature_09', 'feature_10', 'feature_11',
    'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16',
    'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21',
    'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26',
    'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31',
    'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36',
    'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41',
    'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46',
    'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51',
    'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56',
    'feature_57', 'feature_58', 'feature_59', 'feature_60', 'feature_61',
    'feature_62', 'feature_63', 'feature_64', 'feature_65', 'feature_66',
    'feature_67', 'feature_68', 'feature_69', 'feature_70', 'feature_71',
    'feature_72', 'feature_73', 'feature_74', 'feature_75', 'feature_76',
    'feature_77', 'feature_78', 'responder_6'
]

# Process each dataset iteratively without storing all in memory
previous_file_path = None
for current_file_path in file_paths:
    df = read_data_with_lags(current_file_path, previous_file_path, columns_needed)
    df = clean_data(df)
    X, y = preprocess_data(df, target_column='responder_6', n_components=0.95)
    model = train_model(X, y)
    
    # Clear memory
    del df, X, y
    gc.collect()  # Force garbage collection
    
    # Update previous file path
    previous_file_path = current_file_path


  current_file = current_file.fetch(10000)


Mean Absolute Error (MAE): 0.34030935168266296


  current_file = current_file.fetch(10000)


Mean Absolute Error (MAE): 0.2070946991443634


  current_file = current_file.fetch(10000)


Mean Absolute Error (MAE): 0.2338617593050003


  current_file = current_file.fetch(10000)


Mean Absolute Error (MAE): 0.2221541553735733


  current_file = current_file.fetch(10000)


Mean Absolute Error (MAE): 0.2914510667324066


  current_file = current_file.fetch(10000)


Mean Absolute Error (MAE): 0.14291422069072723


  current_file = current_file.fetch(10000)


Mean Absolute Error (MAE): 0.26261773705482483


  current_file = current_file.fetch(10000)


Mean Absolute Error (MAE): 0.29912832379341125


  current_file = current_file.fetch(10000)


Mean Absolute Error (MAE): 0.2049705982208252


  current_file = current_file.fetch(10000)


Mean Absolute Error (MAE): 0.16588710248470306
