In [1]:
import sys
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, make_scorer, accuracy_score 
from imblearn.over_sampling import RandomOverSampler
from sklearn import model_selection
from sklearn.svm import SVC

#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


sys.path.insert(0, 'D:\Fall23 Coursework\ELEC478\Competition\elecfinal')

In [2]:
from ml_pipeline import clean_split

In [49]:
train_path = "../Data/train_data.csv"
feature_path = "../Data/feature_weights.csv"
morph_path = "../Data/imputed_morph_embed.csv"
X_train, X_val, X_query, y_train, y_val, y_query = clean_split(train_path, feature_path, morph_path)

In [50]:
X_train = X_train.select_dtypes(include='number').drop(["ID","pre_nucleus_id", "post_nucleus_id"], axis = 1)
X_val = X_val.select_dtypes(include='number').drop(["ID","pre_nucleus_id", "post_nucleus_id"], axis = 1)
X_query = X_query.select_dtypes(include='number').drop(["ID","pre_nucleus_id", "post_nucleus_id"], axis = 1)

In [51]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 102683 entries, 6063 to 185831
Data columns (total 10 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   adp_dist                        102683 non-null  float64
 1   post_skeletal_distance_to_soma  102683 non-null  float64
 2   pre_skeletal_distance_to_soma   102683 non-null  float64
 3   pre_oracle                      102683 non-null  float64
 4   pre_test_score                  102683 non-null  float64
 5   post_oracle                     102683 non-null  float64
 6   post_test_score                 102683 non-null  float64
 7   me_similarity                   102683 non-null  float64
 8   fw_similarity                   102683 non-null  float64
 9   nuclei_adp_dist                 102683 non-null  float64
dtypes: float64(10)
memory usage: 8.6 MB


In [54]:
y_train.shape

(102683,)

In [55]:
sum(y_train)/y_train.shape[0]

0.006505458547179183

In [56]:
ros = RandomOverSampler(random_state=0, sampling_strategy = 'minority')
X_train, y_train = ros.fit_resample(
    X_train, y_train
)

In [57]:
sum(y_train)/y_train.shape[0]

0.5

In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207068 entries, 0 to 207067
Data columns (total 10 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   adp_dist                        207068 non-null  float64
 1   post_skeletal_distance_to_soma  207068 non-null  float64
 2   pre_skeletal_distance_to_soma   207068 non-null  float64
 3   pre_oracle                      207068 non-null  float64
 4   pre_test_score                  207068 non-null  float64
 5   post_oracle                     207068 non-null  float64
 6   post_test_score                 207068 non-null  float64
 7   me_similarity                   207068 non-null  float64
 8   fw_similarity                   207068 non-null  float64
 9   nuclei_adp_dist                 207068 non-null  float64
dtypes: float64(10)
memory usage: 15.8 MB


In [48]:
y_train.info()

<class 'pandas.core.series.Series'>
RangeIndex: 207068 entries, 0 to 207067
Series name: connected
Non-Null Count   Dtype
--------------   -----
207068 non-null  bool 
dtypes: bool(1)
memory usage: 202.3 KB


In [58]:
models = {"RFC": RandomForestClassifier(), "LDA": LinearDiscriminantAnalysis()}
param_grids = {
            "RFC": [{'n_estimators' : 1000, 'n_jobs' : -1}, 
                    {'n_estimators' : 10000, 'n_jobs' : -1}],
            "LDA": [{'solver' : 'lsqr'},
                    {'solver' : 'eigen'}]
            }

In [59]:
optimum_models = dict()
accuracies = dict()
for model in param_grids:
    classifier = models[model]
    prev_acc = 0
    optimum_param = dict()
    accuracy = []
    for values in param_grids[model]:
        
        #Fitting to the training data with selected hyperparameters
        classifier.set_params(**values)
        print(classifier)
        classifier.fit(X_train, y_train)

        #Finding the balanced accuracy
        y_hat = classifier.predict(X_val)
        print(y_hat)
        print(y_val)
        balanced_accuracy = balanced_accuracy_score(y_val, y_hat)
        accuracy.append(balanced_accuracy)
        print(accuracy)
        if balanced_accuracy > prev_acc:
            print(prev_acc, balanced_accuracy)
            prev_acc = balanced_accuracy
            optimum_param = values
            
        
    accuracies[model] = accuracy
    optimum_models[model] = classifier.set_params(**optimum_param)

RandomForestClassifier(n_estimators=1000, n_jobs=-1)
[False False False ... False False False]
0         False
1         False
2         False
3         False
4         False
          ...  
179187    False
179188    False
179189    False
179190    False
179191    False
Name: connected, Length: 51837, dtype: bool
[0.5011557750528338]
0.5011557750528338 0.5011557750528338
RandomForestClassifier(n_estimators=10000, n_jobs=-1)
[False False False ... False False False]
0         False
1         False
2         False
3         False
4         False
          ...  
179187    False
179188    False
179189    False
179190    False
179191    False
Name: connected, Length: 51837, dtype: bool
[0.5011557750528338, 0.5011557750528338]
LinearDiscriminantAnalysis(solver='lsqr')
[ True  True False ... False False False]
0         False
1         False
2         False
3         False
4         False
          ...  
179187    False
179188    False
179189    False
179190    False
179191    False
Name: con

In [60]:
accuracies

{'RFC': [0.5011557750528338, 0.5011557750528338],
 'LDA': [0.7386763563234151, 0.7386763563234151]}

In [47]:
optimum_models

{'RFC': RandomForestClassifier(n_jobs=-1),
 'LDA': LinearDiscriminantAnalysis(solver='lsqr')}