In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import Ridge

In [4]:
#read the competition training set from its parquet file
DRW_data_path = "/Users/garrettbrown/Desktop/CS189/DRW Competition/DRW_data/train.parquet"
data = pd.read_parquet(DRW_data_path)

#preview the first rows, then report the dimensions of the frame
print(data.head())
n, column_count = data.shape
#every column except the final one is counted as a feature
d = column_count - 1
print(f"The total data set has {n} data points with {d} features, the last column is labels")

                     bid_qty  ask_qty  buy_qty  sell_qty   volume        X1  \
2023-03-01 00:00:00   15.283    8.425  176.405    44.984  221.389  0.181844   
2023-03-01 00:01:00   38.590    2.336  525.846   321.950  847.796  0.489497   
2023-03-01 00:02:00    0.442   60.250  159.227   136.369  295.596  0.260121   
2023-03-01 00:03:00    4.865   21.016  335.742   124.963  460.705  0.099976   
2023-03-01 00:04:00   27.158    3.451   98.411    44.407  142.818  0.270893   

                           X2        X3        X4        X5  ...      X772  \
2023-03-01 00:00:00 -0.637860  0.006652  0.136870  0.116698  ...  0.333753   
2023-03-01 00:01:00 -0.075619  0.431594  0.522400  0.475255  ...  0.333657   
2023-03-01 00:02:00 -0.444684  0.100695  0.224729  0.203282  ...  0.333667   
2023-03-01 00:03:00 -0.666728 -0.123858  0.019197  0.014459  ...  0.333174   
2023-03-01 00:04:00 -0.325973  0.116336  0.234311  0.214073  ...  0.333171   

                         X773      X774      X775      X

In [7]:
#keep only the features chosen during preprocessing, plus the target column
feature_columns = ["X21", "X28", "X77", "X181", "X219", "X287", "X465", "X508", "X757", "X758"]
target_column = "label"
data = data.loc[:, feature_columns + [target_column]]

#split the narrowed frame into the design matrix and the target vector
X = data.loc[:, feature_columns]
y = data.loc[:, target_column]

In [9]:
#indicate how much we want to set aside for validation
valid_size = 50000
#fixed seed so the split (and every score computed from it) is reproducible
split_seed = 0
assert 0 < valid_size < len(data), "valid_size must leave at least one training row"

#randomize the order and remove the validation
#the shuffle is taken from `data` (features in all but the last column, label last)
#instead of from X / y: this cell overwrites X and y, so deriving from them would
#make a second run of the cell fail or shrink the training set again
#NOTE(review): the rows appear to be minute-by-minute time series, so a random
#split may leak neighbouring timestamps into validation -- consider a time-based holdout
rng = np.random.default_rng(split_seed)
indices = rng.permutation(len(data))
X_total = data.iloc[indices, :-1]
y_total = data.iloc[indices, -1]

#first valid_size shuffled rows are held out, the remainder is used for training
X_valid = X_total.iloc[:valid_size]
y_valid = y_total.iloc[:valid_size]
X = X_total.iloc[valid_size:]
y = y_total.iloc[valid_size:]

In [11]:
#baseline model: ridge regression with alpha = 1.0, fitted on the training split
ridge = Ridge(alpha = 1.0).fit(X, y)
ridge

In [13]:
#score the baseline on the held-out rows: correlation between predictions and labels
predictions = ridge.predict(X_valid)
corr_matrix = np.corrcoef(predictions, y_valid)
#off-diagonal entry of the 2x2 matrix is the prediction/label correlation
corr = corr_matrix[0, 1]
corr

0.12682887689030622

In [15]:
#the baseline (alpha = 1.0) gives a validation correlation of roughly 0.13.
#let's see if we can do any better by tuning alpha.
#candidate strengths: powers of ten from 1e-5 through 1e9
alphas = [10.0**i for i in range(-5, 10)]
correlations = []
for alpha in alphas:
    ridge = Ridge(alpha = alpha)
    ridge.fit(X, y)
    predictions = ridge.predict(X_valid)
    correlations.append(np.corrcoef(predictions, y_valid)[0, 1])
#pick the winner by position in `alphas` instead of re-deriving the exponent from
#the range start, so editing the grid above cannot desynchronise the lookup;
#nanargmax skips any NaN correlation (e.g. from constant predictions)
best_alpha = alphas[int(np.nanargmax(correlations))]

In [17]:
#validation correlation for each candidate, in the same order as `alphas`
correlations

[0.126828889346206,
 0.12682888934508488,
 0.12682888933387487,
 0.1268288892217745,
 0.12682888810076579,
 0.12682887689030622,
 0.12682876474839164,
 0.1268276396044863,
 0.12681602277382792,
 0.12666926913707885,
 0.12425564824888563,
 0.1108194958237286,
 0.09953512095746817,
 0.09746296136499304,
 0.09723829654103011]

In [19]:
#refit on the training split with the selected alpha and check its validation correlation
ridge = Ridge(alpha = best_alpha).fit(X, y)
predictions = ridge.predict(X_valid)
corr_matrix = np.corrcoef(predictions, y_valid)
corr = corr_matrix[0, 1]
corr

0.126828889346206

In [21]:
#Now finally we train our ridge regression model with our best_alpha on the entire training set
#(X_total / y_total hold the training rows together with the held-out validation rows)
best_ridge = Ridge(alpha = best_alpha)
#later cells predict through the name `ridge`, so point it at the final model too
ridge = best_ridge
#fit the model created above -- previously a different object (`ridge`) was fitted
#and `best_ridge` was left untrained
best_ridge.fit(X_total, y_total)

In [None]:
#load and read the test data
DRW_test_path = "/Users/garrettbrown/Desktop/CS189/DRW Competition/DRW_data/test.parquet"
test_data = pd.read_parquet(DRW_test_path)
#preview only the first rows instead of rendering the whole frame
test_data.head()

In [None]:
#keep exactly the feature columns the model was trained on, in the same order;
#taking them from X_total avoids a second hand-maintained copy of the feature list
test_data = test_data.loc[:, X_total.columns]
test_predictions = ridge.predict(test_data)
test_predictions

In [14]:
#shape the raw predictions into the submission table: a single "prediction"
#column indexed by a 1-based "ID"
prediction_values = np.asarray(test_predictions).ravel()
test_predictions = pd.DataFrame(
    {"prediction": prediction_values},
    #size the index from the predictions themselves rather than a hard-coded row
    #count, so a test set of any length works
    index = pd.RangeIndex(1, len(prediction_values) + 1, name = "ID"),
)
test_predictions

Unnamed: 0_level_0,prediction
ID,Unnamed: 1_level_1
1,0.174318
2,0.118051
3,0.027949
4,0.025799
5,-0.076946
...,...
538146,-0.212695
538147,-0.089856
538148,-0.166418
538149,0.010741


In [15]:
#write the submission file; the "ID" index is written as the first column
test_predictions.to_csv(path_or_buf = "submission.csv", index = True)