# Getting Started: Market Research
This Jupyter notebook is a quick demonstration of how to get started with the market research section.

In [None]:
import torch
import torch.nn as nn
import torch.nn.init as init
from tqdm import tqdm
from torch.utils.data import Dataset
from torcheval.metrics import R2Score
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np

In [None]:
# Base folder of the research material; a leading "~" refers to the user's home directory.
filepath = "~/Desktop/localfiles/quantchallenge-2025/research/"

## 1) Download Data
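Please download the train and test data and place them in the `./research/data` directory. The cell below reads `train.csv`, `train_new.csv`, `test.csv` and `test_new.csv` from that folder. If the files are in the correct place, the following cell should run without errors: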

In [None]:
import pandas as pd

# Each split is stored as two CSV files: the base table and a table of extra
# columns. The two are glued together column-wise, aligned on the row index.
train_data = pd.read_csv(f"{filepath}data/train.csv")
new_cols_train = pd.read_csv(f"{filepath}data/train_new.csv")
train_data = pd.concat([train_data, new_cols_train], axis="columns")

test_data = pd.read_csv(f"{filepath}data/test.csv")
new_cols_test = pd.read_csv(f"{filepath}data/test_new.csv")
test_data = pd.concat([test_data, new_cols_test], axis="columns")

# Feature columns are the single letters A..P; the targets are Y1 and Y2.
X_cols = list("ABCDEFGHIJKLMNOP")
y_cols = ["Y1", "Y2"]

# Quick look at the first rows of each split.
for frame in (train_data, test_data):
    print(frame.head())

## Emi's notes on the data
* The biggest problem with this data is that the distributions of Y1 and Y2 are not normal. The goal is to maximize $R^2$, and the corresponding loss function is the $\ell_2$ (squared-error) loss, but that loss implicitly assumes the errors between $\hat y$ and $y$ are normally distributed. Ideally, Y1 and Y2 would have normal distributions, but they do not. Maybe resampling them will help?

* You should probably standardize each column (zero mean, unit standard deviation) because, on inspection, column A is much larger than the others. (See the following cell.)

* There are NAN values in O and P that have been replaced with 0.


## Precondition the data

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Replace every missing value with 0, in place, in both frames.
for frame in (train_data, test_data):
    frame.fillna(0, inplace=True)

# Pull the feature / target matrices out as NumPy arrays.
Xtrain = train_data[X_cols].to_numpy()
ytrain = train_data[y_cols].to_numpy()
Xtest = test_data[X_cols].to_numpy()

# The scaling statistics are estimated on the training features only and are
# then applied unchanged to both the training and the test features.
scaler = StandardScaler().fit(Xtrain)
print(scaler.mean_)

Xtrain, Xtest = scaler.transform(Xtrain), scaler.transform(Xtest)

### Data loader

In [None]:
class SimpleDataset(Dataset):
    """In-memory dataset that pairs each feature row with its target row.

    Both arrays are cast to float32 once, at construction time. Indexing
    returns a ``(features, targets)`` pair of torch tensors for one sample.

    Args:
        X: 2-D array of features, one sample per row.
        y: 2-D array of targets, one sample per row.

    Raises:
        ValueError: if ``X`` and ``y`` do not have the same number of rows.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        X = np.asarray(X)
        y = np.asarray(y)
        # The length is taken from X alone, so without this check a longer y
        # would be silently truncated and a shorter y would fail mid-epoch.
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y must have the same number of rows, got {X.shape[0]} and {y.shape[0]}"
            )
        self.N = X.shape[0]
        self.X = X.astype(np.float32)
        self.y = y.astype(np.float32)
        # Shape log kept from the original as a quick sanity check.
        print(X.shape)
        print(y.shape)

    def __getitem__(self, idx):
        # as_tensor wraps the float32 rows without the legacy torch.Tensor constructor.
        return torch.as_tensor(self.X[idx, :]), torch.as_tensor(self.y[idx, :])

    def __len__(self):
        return self.N

## 2) Neural network

In [None]:
class NN(nn.Module):
    """Fully connected regression network with three ReLU hidden layers.

    Layer widths are ``input_size -> hidden_size -> hidden_size ->
    hidden_size // 2 -> output_size``. The output layer has no activation.

    Args:
        input_size: number of input features.
        hidden_size: width of the first two hidden layers; the third hidden
            layer uses ``hidden_size // 2`` units.
        output_size: number of regression outputs.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()

        self.layer2 = nn.Linear(hidden_size, hidden_size)

        self.layer3 = nn.Linear(hidden_size, hidden_size // 2)

        self.layer4 = nn.Linear(hidden_size // 2, output_size)

        # Xavier (Glorot) uniform init of the weights, in layer order. Uses the
        # in-place `xavier_uniform_`; the un-suffixed name is deprecated and
        # warns on every call. Biases keep nn.Linear's default initialisation.
        for layer in (self.layer1, self.layer2, self.layer3, self.layer4):
            init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Map a batch of feature rows to ``output_size`` predictions per row."""
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        x = self.layer4(x)
        return x

        

In [None]:
# validation helper

def validate(model, validation_dataset, loss):
    """Evaluate ``model`` on ``validation_dataset`` without training it.

    The model is switched to eval mode and run with autograd disabled, so no
    gradient graph is built; its previous train/eval mode is restored before
    returning.

    Args:
        model: the ``nn.Module`` to evaluate.
        validation_dataset: dataset yielding ``(X, y)`` pairs.
        loss: callable ``loss(yhat, y)`` returning a scalar tensor.

    Returns:
        A tuple ``(losses, yhats, ys, metric)``: the per-batch loss values,
        the predictions and targets concatenated over all batches as NumPy
        arrays, and the ``R2Score`` metric updated with every batch (call
        ``metric.compute()`` for the score).
    """
    validation_dataloader = DataLoader(validation_dataset, shuffle=False, batch_size=1024)
    losses = []
    yhats = []
    ys = []
    metric = R2Score()

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for X, y in tqdm(validation_dataloader):
                yhat = model(X)
                yhats.append(yhat.detach().numpy())
                ys.append(y.detach().numpy())
                metric.update(yhat, y)
                l = loss(yhat, y)
                losses.append(l.item())
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    return losses, np.concatenate(yhats), np.concatenate(ys), metric

In [None]:
# training loop
model = NN(len(X_cols), len(X_cols), 2)
n_epochs = 5
losses = []  # per-batch training loss, accumulated over all epochs
learning_rate = 0.005
best_r2 = -1.0  # best validation R2 seen so far (only updated when validation runs)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.85)
loss = torch.nn.MSELoss()

# NOTE: despite its name, `validation_split` is the fraction of rows used for
# TRAINING (taken from the head of the array); the remaining tail rows form the
# validation set. With 1.0 every row is trained on and validation is skipped.
validation_split = 1.0
train_size = int(np.floor(Xtrain.shape[0]*validation_split))
val_size = Xtrain.shape[0] - train_size
train_dataset = SimpleDataset(Xtrain[:train_size,:], ytrain[:train_size,:])
val_dataset = SimpleDataset(Xtrain[train_size:,:], ytrain[train_size:,:])

for epoch in range(n_epochs):
    model.train()
    # NOTE(review): shuffle=False feeds the rows in file order every epoch; confirm this is intended.
    for X,y in tqdm(DataLoader(train_dataset, shuffle=False, batch_size=48)):
        yhat = model(X)
        l = loss(yhat, y)
        losses.append(l.item())
        l.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Decay the learning rate once per epoch.
    scheduler.step()
    print(scheduler.get_last_lr())

    if validation_split < 1.0:
        # Use a separate name so the training-loss history in `losses` is not overwritten.
        val_losses, yhats, ys, metric = validate(model, val_dataset, loss)
        r2 = metric.compute().item()
        best_r2 = max(best_r2, r2)
        print(f"Epoch {epoch}, R2: {r2}")


In [None]:
# Persist only the model's state dict (its parameters and buffers), not the module object.
weights = model.state_dict()
torch.save(weights, "model_weights.pth")

## 3) Submit Predictions
In order to submit predictions, we need to make a CSV file with three columns: id, Y1, and Y2. In the example below, the predictions of Y1 and Y2 are the outputs of the trained neural network on the standardized test features.

In [None]:
# Start from the test ids. The explicit .copy() makes `preds` an independent
# frame, so adding columns below neither triggers pandas'
# SettingWithCopyWarning nor depends on view-vs-copy behaviour.
preds = test_data[['id']].copy()

# Inference only: no gradient graph is needed.
with torch.no_grad():
    yhat = model(torch.as_tensor(Xtest, dtype=torch.float32)).numpy()
print(Xtest.shape)
print(yhat.shape)

# Column 0 of the network output is Y1, column 1 is Y2 (same order as y_cols).
preds['Y1'] = yhat[:,0]
preds['Y2'] = yhat[:,1]
preds

In [None]:
# Write the submission file without the index, so the CSV holds only the frame's columns.
submission_path = 'preds.csv'
preds.to_csv(submission_path, index=False)

You should now be able to submit preds.csv to [https://quantchallenge.org/dashboard/data/upload-predictions](https://quantchallenge.org/dashboard/data/upload-predictions)! Note that the public $R^2$ score of $-0.042456$ quoted here should be treated as a rough reference only: the training above sets no random seed, so the exact score you receive will vary from run to run. You should try to get the highest possible $R^2$ score over the course of these next few days. Be careful of overfitting to the public score, which is only calculated on a subset of the test data—the final score that counts is the private $R^2$ score!