In [57]:
import polars as pl
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score
from xgboost import XGBRegressor
import gc
import time
import sys
from tqdm import tqdm  # For progress bar

In [58]:
# Step 1: Read Data with Lags
def read_data_with_lags(current_file_path, previous_file_path, columns_needed, lag_column="responder_6", lag_name="responder_6_lag_1", length = 5000):
    """Read the current file and add lagged data from the previous file."""
    # Read current data
    current_file = pl.scan_parquet(current_file_path).select(columns_needed).head(length).collect()
    
    # If there is a previous file, add lagged data
    if previous_file_path:
        previous_file = pl.scan_parquet(previous_file_path).select(["time_id", lag_column]).head(length).collect()
        lagged_data = previous_file.rename({lag_column: lag_name})
        current_file = current_file.join(lagged_data, on="time_id", how="left")
    
    return current_file

# Step 2: Clean Data
def clean_data(df):
    """Clean the data by handling missing values."""
    if isinstance(df, pl.DataFrame):
        df = df.fill_null(strategy="backward").fill_null(strategy="forward").fill_null(0)
    else:
        raise TypeError("The input is not a Polars DataFrame.")
    return df

# Placeholder function for anonymizing data
def anonomyize(df):
    pass

# Step 3: Preprocess Data
def preprocess_data(df, target_column='responder_6', n_components=0.95):
    """Apply PCA to reduce dimensions."""
    # Drop the target column and convert to numpy
    X = df.drop(target_column).to_numpy()
    y = df[target_column].to_numpy()

    # Apply PCA
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    return X_pca, y

# Step 4: Anonymize
def anonymize(file): 
    return file

XGB with PCA

In [59]:
# Step 5: Train Model
def train_model(X, y):
    """Train the model using XGBoost."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Mean Absolute Error (MAE): {mae}")
    return model


Call Functions 

In [60]:

# Jupyter Notebook - File Paths and Data Preparation
file_paths = [
    f"/Users/elireadnour/Documents/Computer/Classes/machineLearning/MLFinal/jane-street-real-time-market-data-forecasting/train.parquet/partition_id={i}/part-0.parquet" for i in range(10)
]
columns_needed = [
    'date_id', 'time_id', 'symbol_id', 'weight', 'feature_00', 'feature_01',
    'feature_02', 'feature_03', 'feature_04', 'feature_05', 'feature_06',
    'feature_07', 'feature_08', 'feature_09', 'feature_10', 'feature_11',
    'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16',
    'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21',
    'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26',
    'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31',
    'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36',
    'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41',
    'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46',
    'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51',
    'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56',
    'feature_57', 'feature_58', 'feature_59', 'feature_60', 'feature_61',
    'feature_62', 'feature_63', 'feature_64', 'feature_65', 'feature_66',
    'feature_67', 'feature_68', 'feature_69', 'feature_70', 'feature_71',
    'feature_72', 'feature_73', 'feature_74', 'feature_75', 'feature_76',
    'feature_77', 'feature_78', 'responder_6'
]

# Process each dataset iteratively without storing all in memory
previous_file_path = None
for i, current_file_path in enumerate(tqdm(file_paths, desc="Processing Files")):
    start_time = time.time()
    df = read_data_with_lags(current_file_path, previous_file_path, columns_needed)
    df = clean_data(df)
    #df = anonomyize(df)
    X, y = preprocess_data(df, target_column='responder_6', n_components=0.95)
    model = train_model(X, y)
    
    # Clear memory
    del df, X, y
    gc.collect()  # Force garbage collection
    
    end_time = time.time()
    print(f"Finished processing file {i+1}/{len(file_paths)} in {end_time - start_time:.2f} seconds\n")
    
    # Update previous file path
    previous_file_path = current_file_path


Processing Files:  10%|█         | 1/10 [00:00<00:04,  2.07it/s]

Mean Absolute Error (MAE): 0.18809212744235992
Finished processing file 1/10 in 0.48 seconds



Processing Files:  20%|██        | 2/10 [00:00<00:03,  2.02it/s]

Mean Absolute Error (MAE): 0.15672925114631653
Finished processing file 2/10 in 0.50 seconds



Processing Files:  30%|███       | 3/10 [00:01<00:03,  1.86it/s]

Mean Absolute Error (MAE): 0.1739712655544281
Finished processing file 3/10 in 0.59 seconds



Processing Files:  40%|████      | 4/10 [00:02<00:03,  1.81it/s]

Mean Absolute Error (MAE): 0.1807750016450882
Finished processing file 4/10 in 0.57 seconds



Processing Files:  50%|█████     | 5/10 [00:02<00:02,  1.72it/s]

Mean Absolute Error (MAE): 0.230383962392807
Finished processing file 5/10 in 0.63 seconds



Processing Files:  60%|██████    | 6/10 [00:03<00:02,  1.71it/s]

Mean Absolute Error (MAE): 0.11643332988023758
Finished processing file 6/10 in 0.60 seconds



Processing Files:  70%|███████   | 7/10 [00:03<00:01,  1.69it/s]

Mean Absolute Error (MAE): 0.20810644328594208
Finished processing file 7/10 in 0.61 seconds



Processing Files:  80%|████████  | 8/10 [00:04<00:01,  1.62it/s]

Mean Absolute Error (MAE): 0.2530061602592468
Finished processing file 8/10 in 0.67 seconds



Processing Files:  90%|█████████ | 9/10 [00:05<00:00,  1.54it/s]

Mean Absolute Error (MAE): 0.17746970057487488
Finished processing file 9/10 in 0.72 seconds



Processing Files: 100%|██████████| 10/10 [00:05<00:00,  1.68it/s]

Mean Absolute Error (MAE): 0.14538277685642242
Finished processing file 10/10 in 0.60 seconds






In [61]:
'''TO-do
- Function to load datasets --> Use 
- Function to clean datasets
- Function to train XGB 
- Function to train SVM
    - Reduce data with PCA
- Display data


'''

'TO-do\n- Function to load datasets --> Use \n- Function to clean datasets\n- Function to train XGB \n- Function to train SVM\n    - Reduce data with PCA\n- Display data\n\n\n'

XGBoost

In [62]:
# git add .
# git commit -m "commit message"
# git push origin main