In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, make_scorer
from imblearn.over_sampling import RandomOverSampler
import sys
import numpy as np
from sklearn import model_selection
import random
import torch
import torch.nn as nn
import tensorflow as tf




In [3]:
# Make the project packages (Data, Didi, ml_pipeline) importable.
# The path is a raw string: in a normal string literal the backslash pairs
# \F, \E, \C and \e are invalid escape sequences and raise a SyntaxWarning on
# recent Python versions.
# NOTE(review): this is a machine-specific absolute path; consider deriving it
# from a configurable project root so the notebook runs on other machines.
sys.path.insert(0, r'D:\Fall23 Coursework\ELEC478\Competition\elecfinal')

In [17]:
from Data.data_cleaner import cleaner
from Didi.ml_pipeline_final import validation, query
from ml_pipeline import clean_split

Data

In [18]:
# Input CSV locations, relative to this notebook.
morph_path = "../Data/imputed_morph_embed.csv"
feature_path = "../Data/feature_weights.csv"
train_path = "../Data/train_data.csv"

# clean_split (imported from ml_pipeline) returns six objects: the feature
# frames and the labels for the train, validation and query splits.
X_train, X_val, X_query, y_train, y_val, y_query = clean_split(
    train_path,
    feature_path,
    morph_path,
)

In [21]:
# Randomly oversample the minority class of the training split only; the
# validation and query splits are left untouched. The fixed random_state makes
# the resampling repeatable.
ros = RandomOverSampler(sampling_strategy='minority', random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [22]:
def one_hot(column, df, suffix='', categories=None):
    """
    Return a copy of ``df`` with ``column`` replaced by 0/1 indicator columns.

    One integer column named ``f"{category}{suffix}"`` is appended per category,
    in category order, and the original ``column`` is removed. The input frame
    is not modified.

    Parameters
    ----------
    column : str
        Name of the categorical column to encode.
    df : pandas.DataFrame
        Frame containing ``column``.
    suffix : str, optional
        Text appended to each category to build the indicator column name.
    categories : iterable, optional
        Categories to encode. When omitted, the distinct non-missing values of
        ``df[column]`` are used in order of first appearance. Pass the same
        list for every split to guarantee identical columns across frames.

    Returns
    -------
    pandas.DataFrame
        New frame without ``column`` and with the indicator columns added.
    """
    values = df[column]
    if categories is None:
        # Missing values cannot name a column, so they get no indicator.
        categories = [cat for cat in pd.unique(values) if not pd.isna(cat)]

    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    encoded = df.drop(columns=column)
    for cat in categories:
        # f-string instead of ``cat + suffix`` so non-string categories work.
        encoded[f"{cat}{suffix}"] = (values == cat).astype('int')
    return encoded

In [23]:
# One-hot encode the pre- and post-synaptic brain areas of every split.
X_train, X_val, X_query = (
    one_hot('post_brain_area', one_hot('pre_brain_area', frame, '_pre'), '_post')
    for frame in (X_train, X_val, X_query)
)

# one_hot builds its indicator columns from the values present in the frame it
# is given, so a brain area that does not occur in a split would leave that
# split without the corresponding column. Align the validation and query
# frames to the training columns; indicators absent from a split become 0.
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_query = X_query.reindex(columns=X_train.columns, fill_value=0)

In [24]:
# Compartment labels grouped into three coarse areas.
area1 = ["basal", "soma"]
area2 = ["axon", "apical", "oblique", "apical_shaft"]
area3 = ["apical_tuft"]

def area_cols(df):
    """
    Replace the ``compartment`` column with three 0/1 area indicators.

    ``area1``, ``area2`` and ``area3`` are integer columns flagging whether the
    row's compartment belongs to the module-level list of the same name. A
    compartment found in none of the lists yields 0 in all three columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``compartment`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the indicator columns added and ``compartment``
        removed. The input frame is not modified.
    """
    compartment = df["compartment"]
    with_areas = df.assign(
        area1=compartment.isin(area1).astype('int'),
        area2=compartment.isin(area2).astype('int'),
        area3=compartment.isin(area3).astype('int'),
    )
    # drop() returns a new frame; its result must be returned for the
    # compartment column to actually be removed.
    return with_areas.drop(columns='compartment')

In [25]:
# Add the coarse compartment-area indicators to every split.
X_train, X_val, X_query = map(area_cols, (X_train, X_val, X_query))

In [26]:
# Keep only the numeric columns of each split; non-numeric columns are dropped.
X_train, X_val, X_query = (
    frame.select_dtypes('number') for frame in (X_train, X_val, X_query)
)

Selecting Features

In [32]:
# Columns fed to the network, in order.
# NOTE(review): 'fw_similarity' appears twice, so selecting these columns
# yields 9 columns for 8 distinct features. The duplicate is kept so the
# 9-column feature matrix the downstream cells were run with is unchanged;
# confirm whether a different ninth feature was intended.
top_features = [
    'adp_dist',
    'post_skeletal_distance_to_soma',
    'pre_skeletal_distance_to_soma',
    'post_test_score',
    'fw_similarity',
    'nuclei_adp_dist',
    'V1_post',
    'AL_pre',
    'fw_similarity',
]

In [45]:
# Restrict every split to the selected features and convert to NumPy arrays.
X_train_min, X_val_min, X_query_min = (
    np.array(frame[top_features]) for frame in (X_train, X_val, X_query)
)

# Labels as NumPy arrays as well.
y_train, y_val, y_query = (
    np.array(labels) for labels in (y_train, y_val, y_query)
)

In [44]:
# Sanity check: shape of the validation feature matrix (rows, selected columns).
X_val_min.shape

(39364, 9)

In [43]:
# Sanity check: the number of validation labels should match the rows of X_val_min.
y_val.shape

(39364,)

In [46]:
# torch.tensor copies its input, so the target dtype is requested directly:
# float32 features and int64 class indices (the dtype passed as torch.long).
X_train_tensor = torch.tensor(X_train_min, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)


In [49]:
# Validation split as tensors: float32 features and int64 class indices.
X_valid_tensor = torch.tensor(X_val_min, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_val, dtype=torch.long)

In [34]:
# Use the GPU when PyTorch can see one, otherwise the CPU.
# NOTE(review): in the cells visible here neither the tensors nor the models
# are moved to `device`, so everything appears to run on the CPU — confirm.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [51]:
# Creating neural network
class MLP(nn.Module):
    """
    Fully connected feed-forward network.

    Layer widths are ``input_size -> hidden_sizes... -> num_classes``. The
    activation is applied after every layer except the last, so ``forward``
    returns raw scores (logits) with one column per class.

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_sizes : iterable of int
        Width of each hidden layer. Any iterable is accepted (list, tuple, ...);
        an empty one gives a single linear layer.
    num_classes : int
        Number of output units.
    activation : callable, optional
        Zero-argument factory for the activation module (default ``nn.ReLU``).
        One instance is created and shared by all hidden layers.
    """

    def __init__(self, input_size, hidden_sizes, num_classes, activation=nn.ReLU):
        super().__init__()
        # Unpacking (rather than list concatenation) lets hidden_sizes be a
        # tuple or any other iterable, not only a list.
        dimensions = [input_size, *hidden_sizes, num_classes]
        self.linears = nn.ModuleList([
            nn.Linear(fan_in, fan_out)
            for fan_in, fan_out in zip(dimensions[:-1], dimensions[1:])
        ])
        self.activation = activation()

    def forward(self, x):
        """Return the output-layer scores for the input batch ``x``."""
        for layer in self.linears[:-1]:
            x = self.activation(layer(x))
        # No activation on the output layer.
        return self.linears[-1](x)
    
# Functions to train and test the model
def train_model(model, X, y, X_test, y_test, loss_fn, optimizer, num_epochs):
    """
    Train ``model`` with full-batch gradient steps and record both losses.

    Each epoch performs one forward/backward pass over all of ``X`` followed by
    one optimizer step, then evaluates the loss on ``(X_test, y_test)`` without
    tracking gradients.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise in place.
    X, y : torch.Tensor
        Training inputs and targets, passed to ``model`` and ``loss_fn``.
    X_test, y_test : torch.Tensor
        Held-out inputs and targets used only for the recorded test loss.
    loss_fn : callable
        ``loss_fn(outputs, targets)`` returning a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to ``model``'s parameters.
    num_epochs : int
        Number of full-batch updates.

    Returns
    -------
    (list of float, list of float)
        Per-epoch training loss (measured before that epoch's update) and
        per-epoch test loss (measured after it).
    """
    was_training = model.training
    train_loss, test_loss = [], []
    for _ in range(num_epochs):
        # Training step in train mode so layers such as dropout behave
        # as intended while fitting.
        model.train()
        loss = loss_fn(model(X), y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())

        # Held-out loss in eval mode: otherwise stochastic layers would make
        # the validation curve noisy and not comparable across epochs.
        model.eval()
        with torch.no_grad():
            test_loss.append(loss_fn(model(X_test), y_test).item())

    # Leave the model in the mode the caller handed it over in.
    model.train(was_training)
    return train_loss, test_loss

def evaluate_model(model, X, y):
    """
    Compute plain accuracy of ``model`` on ``(X, y)``.

    Parameters
    ----------
    model : torch.nn.Module
        Network whose output has one column of scores per class.
    X : torch.Tensor
        Input batch passed to ``model``.
    y : torch.Tensor
        Target class indices, compared element-wise with the predictions.

    Returns
    -------
    (float, torch.Tensor)
        The fraction of predictions equal to ``y`` and the predicted class
        index for every row (arg-max over dimension 1 of the output).
    """
    was_training = model.training
    # Predict in eval mode so stochastic layers (e.g. dropout) are disabled,
    # then restore whatever mode the caller had set.
    model.eval()
    try:
        with torch.no_grad():
            predicted = model(X).argmax(dim=1)
    finally:
        model.train(was_training)
    correct = (predicted == y).sum().item()
    return correct / y.size(0), predicted

# Training hyperparameters shared by every model trained below.
num_epochs = 10_000            # full-batch updates per model
learning_rate = 1e-3           # optimizer step size
loss_fn = nn.CrossEntropyLoss()  # applied to the network's raw output scores

Training NN

In [48]:
# Take the input width from the training tensor instead of hard-coding it, so
# it cannot drift out of sync with the selected feature columns.
input_size = X_train_tensor.shape[1]
# Two output scores, one per class.
num_classes = 2

In [52]:
# Grid search over network shape: n hidden layers, each h[i] units wide.
h = [4, 16, 32, 64]
train_losses = dict()
valid_losses = dict()
accuracies = dict()

# Seed PyTorch's RNG so weight initialisation, and therefore the sweep, is
# repeatable from a fresh kernel.
torch.manual_seed(0)

for n in [2, 3, 4]:
    for width in h:
        key = (n, width)
        model = MLP(input_size, [width] * n, num_classes)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        train_loss, valid_loss = train_model(model, X_train_tensor, y_train_tensor,
                                             X_valid_tensor, y_valid_tensor,
                                             loss_fn, optimizer, num_epochs)
        train_losses[key] = train_loss
        valid_losses[key] = valid_loss
        # evaluate_model returns (accuracy, predictions); keep only the scalar
        # so the printout and the summary table hold numbers, not tuples.
        val_accuracy, _val_predictions = evaluate_model(model, X_valid_tensor, y_valid_tensor)
        accuracies[key] = val_accuracy
        print(f"The accuracy is {accuracies[key]} with {width} neurons in the first layer and {n} layers")

# One row per configuration: x = number of hidden layers, y = layer width,
# value = validation accuracy.
accuracies_df = pd.DataFrame(
    [(depth, units, acc) for (depth, units), acc in accuracies.items()],
    columns=['x', 'y', 'value'],
)
       

The accuracy is 0.6302459099684992 with 4 neurons in the first layer and 2 layers
The accuracy is 0.7540138197337669 with 16 neurons in the first layer and 2 layers
The accuracy is 0.8574839955289096 with 32 neurons in the first layer and 2 layers
The accuracy is 0.9328320292653186 with 64 neurons in the first layer and 2 layers
The accuracy is 0.6456152829996952 with 4 neurons in the first layer and 3 layers
The accuracy is 0.7771567930088406 with 16 neurons in the first layer and 3 layers
The accuracy is 0.7998933035260645 with 32 neurons in the first layer and 3 layers
The accuracy is 0.9477441316939336 with 64 neurons in the first layer and 3 layers
The accuracy is 0.6662940758053043 with 4 neurons in the first layer and 4 layers
The accuracy is 0.7502794431460218 with 16 neurons in the first layer and 4 layers
The accuracy is 0.7750736713748603 with 32 neurons in the first layer and 4 layers
The accuracy is 0.949547810181892 with 64 neurons in the first layer and 4 layers


In [53]:
# Build the leaderboard feature frame with the same steps used for training.
leaderboard_path = "../Data/leaderboard_data.csv"
sub_data = cleaner(leaderboard_path, feature_path, morph_path, submission = True)
sub_data = area_cols(sub_data)
sub_data = one_hot('pre_brain_area', sub_data, '_pre')
sub_data = one_hot('post_brain_area', sub_data, '_post')

# one_hot only creates indicator columns for brain areas that occur in the
# frame it is given, so an indicator used as a model feature can be absent
# here. Such indicators are added as all-zero columns; any other missing
# feature is a real schema problem and is reported.
missing = [col for col in dict.fromkeys(top_features) if col not in sub_data.columns]
unexpected = [col for col in missing if not col.endswith(('_pre', '_post'))]
if unexpected:
    raise KeyError(f"leaderboard data is missing feature columns: {unexpected}")
sub_data = sub_data.assign(**{col: 0 for col in missing})

sub_data = sub_data[top_features]

In [57]:
# Convert the selected leaderboard features to a NumPy array (rebinds sub_data).
sub_data = np.array(sub_data)

In [58]:
# Leaderboard features as a float32 tensor; torch.tensor already copies the
# data, so the dtype is requested directly.
sub_data_tensor = torch.tensor(sub_data, dtype=torch.float32)

In [None]:
# Train the submission model: two hidden layers of 64 units each.
model = MLP(input_size, [64, 64], num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
train_loss, valid_loss = train_model(
    model,
    X_train_tensor, y_train_tensor,
    X_valid_tensor, y_valid_tensor,
    loss_fn, optimizer, num_epochs,
)

# Predict the class with the highest score for every leaderboard row.
with torch.no_grad():
    outputs = model(sub_data_tensor)
    predicted = torch.max(outputs, dim=1).indices


Creating Submission

In [64]:
# Reload the full cleaned leaderboard frame to attach the predictions to.
# NOTE(review): this relies on cleaner returning rows in the same order as the
# earlier call that produced the model inputs — confirm.
sub_data_full = cleaner(
    leaderboard_path,
    feature_path,
    morph_path,
    submission=True,
)

In [73]:
# Store the predicted class (0/1) as a boolean "connected" column.
connected_flags = np.array(predicted).astype(bool)
sub_data_full["connected"] = connected_flags

In [74]:
# Keep only the columns required for the submission file. Explicit selection
# raises KeyError if 'ID' or 'connected' is missing, whereas
# DataFrame.filter(items=...) would silently drop the missing column and
# produce an incomplete submission.
submission_data = sub_data_full[['ID', 'connected']]

In [75]:
# Write the submission file to the working directory, without the index column.
submission_data.to_csv(
    'submission_data.csv',
    index=False,
)