In [1]:
import sys
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.metrics import balanced_accuracy_score, make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn import model_selection
from sklearn.svm import SVC
from sklearn.cluster import SpectralClustering
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from xgboost import plot_tree


#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

sys.path.insert(0, '/Users/gracewang/Documents/GitHub/elecfinal')
sys.path.insert(0, 'D:\Fall23 Coursework\ELEC478\Competition\elecfinal')
from ml_pipeline import train_n_predict, validation, clean_split
from Data.data_cleaner import cleaner

# Clean Data

In [2]:
## Clean and split data
train_path = "../Data/train_data.csv"
feature_path = "../Data/feature_weights.csv"
morph_path = "../Data/imputed_morph_embed.csv"
X_train, X_val, X_query, y_train, y_val, y_query = clean_split(train_path, feature_path, morph_path)

In [3]:
# Oversample X_train_feat
ros = RandomOverSampler(random_state=0, sampling_strategy = 'minority')
X_train, y_train = ros.fit_resample(
        X_train, y_train)

In [4]:
X_train_feat = X_train.copy()
X_val_feat = X_val.copy()
X_query_feat = X_query.copy()

### Minicolumns

In [5]:
# combine coordinates
def coord_column(df, new_col, old_cols):
    df[new_col] = (
        df.filter(regex=old_cols)
        .sort_index(axis=1)
        .apply(lambda x: np.array(x), axis=1)
    )
    return df

In [6]:
def coord_df(df):
    df = coord_column(df, "pre_rf_coords_xy", "pre_rf_[xy]")
    df = coord_column(df, "post_rf_coords_xy", "post_rf_[xy]")
    return df

In [7]:
def rfsimilarity(row):
    pre = row["pre_rf_coords_xy"]
    post = row["post_rf_coords_xy"]
    return (pre * post).sum() / (np.linalg.norm(pre) * np.linalg.norm(post))

In [8]:
X_train_feat = coord_df(X_train_feat)
X_train_feat["rf_similarity"] = X_train_feat.apply(rfsimilarity, axis=1)
X_train_feat.drop(columns=['pre_rf_x', 'pre_rf_y'], inplace=True)

X_val_feat = coord_df(X_val_feat)
X_val_feat["rf_similarity"] = X_val_feat.apply(rfsimilarity, axis=1)
X_val_feat.drop(columns=['pre_rf_x', 'pre_rf_y'], inplace=True)

X_query_feat = coord_df(X_query_feat)
X_query_feat["rf_similarity"] = X_query_feat.apply(rfsimilarity, axis=1)
X_query_feat.drop(columns=['pre_rf_x', 'pre_rf_y'], inplace=True)


In [9]:
X_train_feat.columns

Index(['ID', 'axonal_coor_x', 'axonal_coor_y', 'axonal_coor_z',
       'dendritic_coor_x', 'dendritic_coor_y', 'dendritic_coor_z', 'adp_dist',
       'post_skeletal_distance_to_soma', 'pre_skeletal_distance_to_soma',
       'pre_oracle', 'pre_test_score', 'post_oracle', 'post_test_score',
       'post_rf_x', 'post_rf_y', 'compartment', 'pre_brain_area',
       'post_brain_area', 'pre_nucleus_x', 'pre_nucleus_y', 'pre_nucleus_z',
       'post_nucleus_x', 'post_nucleus_y', 'post_nucleus_z', 'pre_nucleus_id',
       'post_nucleus_id', 'pre_feature_weights', 'post_feature_weights',
       'pre_morph_embeddings', 'post_morph_embeddings', 'me_similarity',
       'fw_similarity', 'axonal_coords', 'dendritic_coords', 'pre_rf_coords',
       'post_rf_coords', 'pre_nucleus_coords', 'post_nucleus_coords',
       'pre_nucleus_xy', 'post_nucleus_xy', 'minicol_dist', 'nuclei_adp_dist',
       'pre_rf_coords_xy', 'post_rf_coords_xy', 'rf_similarity'],
      dtype='object')

### One-Hot Encode Data

In [10]:
def one_hot(column, df, suffix=''):
    """
    one-hot encodes this shit
    """
    cats = pd.unique(df[column])

    for cat in cats:
        new_col = cat+suffix
        df[new_col] = df[column]==cat
        df[new_col] = df[new_col].astype('int')
    
    df = df.drop(columns=column)
    return df

In [11]:
# one-hot encode brain areas for all
X_train_feat = one_hot('pre_brain_area', X_train_feat, '_pre')
X_train_feat = one_hot('post_brain_area', X_train_feat, '_post')

X_val_feat = one_hot('pre_brain_area', X_val_feat, '_pre')
X_val_feat = one_hot('post_brain_area', X_val_feat, '_post')

X_query_feat = one_hot('pre_brain_area', X_query_feat, '_pre')
X_query_feat = one_hot('post_brain_area', X_query_feat, '_post')

In [12]:
# encode brain areas
area1 = ["basal", "soma"]
area2 = ["axon", "apical", "oblique", "apical_shaft"]
area3 = ["apical_tuft"]

def area_cols(df):
    df["area1"] = df["compartment"].isin(area1).astype('int')
    df["area2"] = df["compartment"].isin(area2).astype('int')
    df["area3"] = df["compartment"].isin(area3).astype('int')
    df.drop(columns='compartment')
    return df

In [13]:
X_train_feat = area_cols(X_train_feat)
X_val_feat = area_cols(X_val_feat)
X_query_feat = area_cols(X_query_feat)

## Select Numerical Data

In [14]:
X_train_feat = X_train_feat.select_dtypes('number')
X_val_feat = X_val_feat.select_dtypes('number')
X_query_feat = X_query_feat.select_dtypes('number')

In [38]:
X_train_feat['minicol_dist']

0        -0.262374
1        -0.262374
2        -0.262374
3        -0.262374
4        -0.262374
            ...   
225027   -0.931315
225028   -0.787386
225029   -0.961426
225030   -0.007819
225031   -1.093150
Name: minicol_dist, Length: 225032, dtype: float64

# XGBoost

## Fit Model

In [73]:
X_train_all = pd.concat([X_train_feat, X_val_feat], axis=0)
y_train_all = pd.concat([y_train, y_val], axis=0)

vf = np.zeros(len(X_train_all))
vf[range(X_train_feat.shape[0], X_train_all.shape[0])] = -1
ps = PredefinedSplit(vf)

In [151]:
param_grid = {"n_estimators":range(59, 60), "max_leaves":range(3, 4)}
model = GridSearchCV(XGBClassifier(grow_policy='lossguide'), param_grid, 
                     n_jobs=-1, return_train_score=True, cv=ps, scoring='balanced_accuracy')
model.fit(X_train_all.drop(columns=['ID', 'pre_nucleus_id', 'post_nucleus_id']), y_train_all)
result_df = pd.DataFrame(model.cv_results_).set_index('params')
result_df = result_df[['mean_test_score', 'rank_test_score', 'mean_train_score', 'mean_fit_time', 'mean_score_time']]
result_df

Unnamed: 0_level_0,mean_test_score,rank_test_score,mean_train_score,mean_fit_time,mean_score_time
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"{'max_leaves': 3, 'n_estimators': 59}",0.5,1,0.5,0.182872,0.120928


In [21]:
xg_model = XGBClassifier(n_estimators=13, max_leaves=9)
xg_model.fit(X_train_feat.drop(columns=['ID', 'pre_nucleus_id', 'post_nucleus_id']).sort_index(axis=1), y_train)
preds = xg_model.predict(X_val_feat.drop(columns=['ID', 'pre_nucleus_id', 'post_nucleus_id']).sort_index(axis=1))
balanced_accuracy_score(y_val, preds)

0.7783765423031728

In [22]:
xg_model

In [17]:
best_features = pd.DataFrame({"Features":X_train_feat.drop(columns=['ID', 'pre_nucleus_id', 'post_nucleus_id']).columns, 
              "Importances":abs(xg_model.feature_importances_)}).sort_values(by='Importances', ascending=False)
best_features

Unnamed: 0,Features,Importances
6,adp_dist,0.235003
28,V1_pre,0.058872
11,post_oracle,0.058537
8,pre_skeletal_distance_to_soma,0.054989
26,AL_pre,0.054545
7,post_skeletal_distance_to_soma,0.052007
19,post_nucleus_y,0.045417
14,post_rf_y,0.045226
20,post_nucleus_z,0.041154
18,post_nucleus_x,0.035758


In [18]:
keep_feat_df = best_features[best_features['Importances']>0]
keep_feat_df

Unnamed: 0,Features,Importances
6,adp_dist,0.235003
28,V1_pre,0.058872
11,post_oracle,0.058537
8,pre_skeletal_distance_to_soma,0.054989
26,AL_pre,0.054545
7,post_skeletal_distance_to_soma,0.052007
19,post_nucleus_y,0.045417
14,post_rf_y,0.045226
20,post_nucleus_z,0.041154
18,post_nucleus_x,0.035758


## Cross Validation

In [19]:
n_estimators_list = range(1, 50)
n_leaves = range(1, 10)

In [None]:
accuracies = {}
for depth in n_estimators_list:
    for leaf in n_leaves:
        fold_accuracy = 0
        for fold in [1,2,3,4,5]:      
             #Making  fold
            X_train_fold, X_val_fold, y_train_fold, y_val_fold = train_test_split(X_train_feat, y_train, test_size = 0.2, random_state = fold)
            ros = RandomOverSampler(random_state=0, sampling_strategy = 'minority')
            X_train_fold, y_train_fold = ros.fit_resample(
                    X_train_fold, y_train_fold)      
            RF = XGBClassifier(max_depth=depth, max_leaves = leaf) 
            RF.fit(X_train_fold.drop(columns = ["ID","pre_nucleus_id","post_nucleus_id"]),y_train_fold)
            y_hat_valid = RF.predict(X_val_fold.drop(columns = ["ID","pre_nucleus_id","post_nucleus_id"]))
            valid_acc = balanced_accuracy_score(y_val_fold, y_hat_valid)
            fold_accuracy += valid_acc
            print(f"depth: {depth}, num leaves {leaf}, valid accuracy for this fold, {valid_acc}")
        avg_fold_accuracy = fold_accuracy/5
        print(f"avgfold accuracy: {avg_fold_accuracy}")
        accuracies[(depth, leaf)] = avg_fold_accuracy
    

## Predict on Query Set

In [23]:
preds = xg_model.predict(X_query_feat.drop(columns=['ID', 'pre_nucleus_id', 'post_nucleus_id']).sort_index(axis=1))
balanced_accuracy_score(y_query, preds)

0.770911791046117

In [24]:
leaderboard_path = "../Data/leaderboard_data.csv"
sub_data = cleaner(leaderboard_path, feature_path, morph_path, submission = True)
sub_data = area_cols(sub_data)
sub_data = one_hot('pre_brain_area', sub_data, '_pre')
sub_data = one_hot('post_brain_area', sub_data, '_post')
sub_data = coord_df(sub_data)
sub_data["rf_similarity"] = sub_data.apply(rfsimilarity, axis=1)
sub_data.drop(columns=['pre_rf_x', 'pre_rf_y'], inplace=True)

In [25]:
sub_data = sub_data.select_dtypes('number')


In [26]:
preds = xg_model.predict(sub_data.drop(columns=['ID', 'pre_nucleus_id', 'post_nucleus_id']).sort_index(axis=1))


In [27]:
sub_data['connected'] = preds==1

In [28]:
submission_data = sub_data.filter(['ID','connected'])
submission_data.to_csv('submission_data.csv',index=False)


# Logistic Regression

## Fit Model

In [132]:
fw_model = LogisticRegression(max_iter=500, solver='saga').fit(X_train_feat.drop(columns=['ID', 'pre_nucleus_id', 'post_nucleus_id']).sort_index(axis=1), y_train)
preds = fw_model.predict(X_val_feat.drop(columns=['ID', 'pre_nucleus_id', 'post_nucleus_id']).sort_index(axis=1))
balanced_accuracy_score(y_val,preds)

0.7588349599353725

In [133]:
log_feats = pd.DataFrame({"features":X_val_feat.drop(columns=['ID', 'pre_nucleus_id', 'post_nucleus_id']).columns})
log_feats['coefs']=abs(fw_model.coef_[0])
log_feats.sort_values(by='coefs', ascending=False, inplace=True)
log_feats

Unnamed: 0,features,coefs
11,post_oracle,4.624851
13,post_rf_x,4.114352
14,post_rf_y,3.58658
10,pre_test_score,3.56777
6,adp_dist,1.449886
12,post_test_score,1.072842
15,pre_nucleus_x,0.96168
21,me_similarity,0.791657
5,dendritic_coor_z,0.662889
26,AL_pre,0.504612


## Predict on Query Set

In [134]:
preds = xg_model.predict(X_query_feat.drop(columns=['ID', 'pre_nucleus_id', 'post_nucleus_id']).sort_index(axis=1))
balanced_accuracy_score(y_query, preds)

0.745415352855731

# Model Stacking