In [2]:
import sys
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn import model_selection
from sklearn.svm import SVC
from sklearn.cluster import SpectralClustering
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
#from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier

#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

sys.path.insert(0, '/Users/gracewang/Documents/GitHub/elecfinal')
sys.path.insert(0, 'D:\Fall23 Coursework\ELEC478\Competition\elecfinal')
from ml_pipeline import train_n_predict, validation, clean_split
from Data.data_cleaner import cleaner

In [3]:
## Clean data
train_path = "../Data/train_data.csv"
feature_path = "../Data/feature_weights.csv"
morph_path = "../Data/imputed_morph_embed.csv"
X_train, X_val, X_query, y_train, y_val, y_query = clean_split(train_path, feature_path, morph_path)

In [4]:
# Oversample X_train_feat
ros = RandomOverSampler(random_state=0, sampling_strategy = 'minority')
X_train, y_train = ros.fit_resample(
        X_train, y_train)

In [5]:
X_train_feat = X_train.copy()
X_val_feat = X_val.copy()
X_query_feat = X_query.copy()

In [6]:
def one_hot(column, df, suffix=''):
    """
    one-hot encodes this shit
    """
    cats = pd.unique(df[column])

    for cat in cats:
        new_col = cat+suffix
        df[new_col] = df[column]==cat
        df[new_col] = df[new_col].astype('int')
    
    df = df.drop(columns=column)
    return df

In [7]:
# one-hot encode brain areas for all
X_train_feat = one_hot('pre_brain_area', X_train_feat, '_pre')
X_train_feat = one_hot('post_brain_area', X_train_feat, '_post')

X_val_feat = one_hot('pre_brain_area', X_val_feat, '_pre')
X_val_feat = one_hot('post_brain_area', X_val_feat, '_post')

X_query_feat = one_hot('pre_brain_area', X_query_feat, '_pre')
X_query_feat = one_hot('post_brain_area', X_query_feat, '_post')

In [8]:
# encode brain areas
area1 = ["basal", "soma"]
area2 = ["axon", "apical", "oblique", "apical_shaft"]
area3 = ["apical_tuft"]

def area_cols(df):
    df["area1"] = df["compartment"].isin(area1).astype('int')
    df["area2"] = df["compartment"].isin(area2).astype('int')
    df["area3"] = df["compartment"].isin(area3).astype('int')
    df.drop(columns='compartment')
    return df

In [9]:
X_train_feat = area_cols(X_train_feat)
X_val_feat = area_cols(X_val_feat)
X_query_feat = area_cols(X_query_feat)

In [10]:
X_train_feat = X_train_feat.select_dtypes('number')
X_val_feat = X_val_feat.select_dtypes('number')
X_query_feat = X_query_feat.select_dtypes('number')

In [11]:
X_train_feat.dtypes

ID                                  int64
axonal_coor_x                     float64
axonal_coor_y                     float64
axonal_coor_z                     float64
dendritic_coor_x                  float64
dendritic_coor_y                  float64
dendritic_coor_z                  float64
adp_dist                          float64
post_skeletal_distance_to_soma    float64
pre_skeletal_distance_to_soma     float64
pre_oracle                        float64
pre_test_score                    float64
pre_rf_x                          float64
pre_rf_y                          float64
post_oracle                       float64
post_test_score                   float64
post_rf_x                         float64
post_rf_y                         float64
pre_nucleus_x                     float64
pre_nucleus_y                     float64
pre_nucleus_z                     float64
post_nucleus_x                    float64
post_nucleus_y                    float64
post_nucleus_z                    

Making submission

In [12]:
leaderboard_path = "../Data/leaderboard_data.csv"
sub_data = cleaner(leaderboard_path, feature_path, morph_path, submission = True)
sub_data = area_cols(sub_data)
sub_data = one_hot('pre_brain_area', sub_data, '_pre')
sub_data = one_hot('post_brain_area', sub_data, '_post')
sub_data = sub_data.select_dtypes('number')

In [13]:
sub_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42593 entries, 0 to 42592
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ID                              42593 non-null  int64  
 1   axonal_coor_x                   42593 non-null  float64
 2   axonal_coor_y                   42593 non-null  float64
 3   axonal_coor_z                   42593 non-null  float64
 4   dendritic_coor_x                42593 non-null  float64
 5   dendritic_coor_y                42593 non-null  float64
 6   dendritic_coor_z                42593 non-null  float64
 7   adp_dist                        42593 non-null  float64
 8   post_skeletal_distance_to_soma  42593 non-null  float64
 9   pre_skeletal_distance_to_soma   42593 non-null  float64
 10  pre_oracle                      42593 non-null  float64
 11  pre_test_score                  42593 non-null  float64
 12  pre_rf_x                        

In [18]:
max_depth_list = np.linspace(1,10,dtype=int)
num_features = np.linspace(1,20,dtype = int)

valid_errors = {}

for x in max_depth_list:
    for n in num_features:
        RF = RandomForestClassifier(max_depth=x, max_features = n) 
        RF.fit(X_train_feat.drop("ID", axis = 1),y_train)
        y_hat_valid = RF.predict(X_val_feat.drop("ID", axis = 1))
        valid_errors[(x,n)] = balanced_accuracy_score(y_val, y_hat_valid)
        print(valid_errors[(x,n)])
best_params= max(valid_errors, key=valid_errors.get)

0.638329736854478
0.5872868660699375
0.6964426473759215
0.6941604910450108
0.6725076077281873
0.7026727161008225
0.6891227726014783
0.7188520071918769
0.7261785291022962
0.6934668380871164
0.7022237101719014
0.7103940941007478
0.7175116365410898
0.7045548283336602
0.7139968417997851
0.7148889360853556
0.7282940206580016
0.7169210952534586
0.715123044971928
0.7159866862730158
0.7170136687813206
0.7335968949768727
0.7122616150594918
0.7254312937434237
0.7376538366133977
0.741868364010136
0.7416752728162943
0.7294446236829348
0.7424555817297793
0.7376155750502184
0.7373424750367619
0.7377286574244456
0.7368668805639366
0.741416034513227
0.7417632257740265
0.7246830045703113
0.742271731676197
0.7357844512140751
0.7414285992214744
0.7414285992214744
0.7416124492750569
0.7414537286379694
0.7424614993020507
0.7357168449774403
0.7363140334010476
0.7248966046105184
0.741416034513227
0.7246830045703113
0.741390905096732
0.742033002219495
0.6364324659091093
0.6168150653283766
0.6279143232810263
0

KeyboardInterrupt: 

In [None]:
RF = RandomForestClassifier(max_depth=best_params[0], max_features = best_params[1]) 
RF.fit(X_train_feat.drop("ID", axis = 1),y_train)
y_hat_test= RF.predict(X_query_feat.drop("ID", axis = 1))
test_acc = balanced_accuracy_score(y_query, y_hat_test)
print(f"The test accuracy (using query set) for Random Forest was {test_acc} with a max depth of {best_params[0]} and a number of features at each split of {best_params[1]}")

In [None]:
leaderboard_predictions = RF.predict(sub_data.drop("ID", axis = 1))
sub_data["connected"] = leaderboard_predictions

In [None]:
submission_data = sub_data.filter(['ID','connected'])
submission_data.to_csv('submission_data.csv',index=False)