In [1]:
import sys
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn import model_selection
from sklearn.svm import SVC

#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# Put the project root at the front of the import path; presumably this is what
# lets the project-local imports in the following cells (`ml_pipeline`,
# `Data.data_cleaner`) resolve -- TODO confirm.
# Raw string: the Windows path contains backslashes, and sequences such as
# `\F`, `\E`, `\C` and `\e` are invalid escapes in a normal string literal.
# NOTE(review): this is a machine-specific absolute path; consider deriving it
# from the notebook location so the notebook runs on other machines.
sys.path.insert(0, r'D:\Fall23 Coursework\ELEC478\Competition\elecfinal')

In [18]:
from ml_pipeline import train_n_predict, validation, clean_split

In [3]:
from Data.data_cleaner import cleaner

In [5]:
# Clean data: locations of the raw inputs handed to the project-local
# `clean_split` helper.
train_path = "../Data/train_data.csv"
feature_path = "../Data/feature_weights.csv"
morph_path = "../Data/imputed_morph_embed_vector.csv"

# `clean_split` returns six objects, unpacked here as train / validation / query
# features followed by the matching labels.
(
    X_train, X_val, X_query,
    y_train, y_val, y_query,
) = clean_split(train_path, feature_path, morph_path)

In [33]:
# Validation features: numeric columns only, with the identifier columns and
# ADP_total removed. Labels are passed through unchanged.
valid_X = (
    X_val
    .select_dtypes(include='number')
    .drop(columns=["ID", "pre_nucleus_id", "post_nucleus_id", "ADP_total"])
)
valid_y = y_val

In [34]:
# Print a summary of the validation feature frame (columns, non-null counts,
# dtypes) to confirm only numeric feature columns remain.
valid_X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37181 entries, 16988 to 167760
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   adp_dist                        37181 non-null  float64
 1   post_skeletal_distance_to_soma  37181 non-null  float64
 2   pre_skeletal_distance_to_soma   37181 non-null  float64
 3   pre_oracle                      37181 non-null  float64
 4   pre_test_score                  37181 non-null  float64
 5   post_oracle                     37181 non-null  float64
 6   post_test_score                 37181 non-null  float64
 7   me_similarity                   37181 non-null  float64
 8   fw_similarity                   37181 non-null  float64
 9   nuclei_adp_dist                 37181 non-null  float64
dtypes: float64(10)
memory usage: 3.1 MB


Using Validation

In [9]:
"""
    Function that outputs a model with optimal hyperparameters
    based on a validation set using grid search

    Inputs:
    model: provided model
    param_grid: dictionary of parameters and values to validate on
    e.g. 
    {'C': [0.001,0.01,0.1,1,10], 
    'gamma':[0.1,1,10,100], 
    'kernel':('linear', 'rbf')}
    valid_X: validation X of data (pandas df)
    valid_y: validation y of data

    Outputs: 
    clf: provided model with optimum hyperparameters
    """
# Candidate estimators and, position by position, the parameter grid that the
# project-local `validation` helper searches for each of them.
pre_valid_models = [
    RandomForestClassifier(),
    LinearDiscriminantAnalysis(),
]
param_grids = [
    # Grid for the random forest.
    {'max_features': ['sqrt', 'log2'], 'n_jobs': [-1], 'random_state': [1]},
    # Grid for linear discriminant analysis.
    {'solver': ['lsqr', 'eigen'], 'shrinkage': ['auto']},
]

# Collect whatever `validation` returns for each (model, grid) pair, keeping the
# same order as `pre_valid_models`.
post_valid_models = []
for candidate, grid in zip(pre_valid_models, param_grids):
    best_clf = validation(
        model=candidate,
        param_grid=grid,
        valid_X=valid_X,
        valid_y=valid_y,
    )
    post_valid_models.append(best_clf)


(Alternative validation) USING ALL SVDs

In [60]:
# Alternative validation run: search an RBF-kernel SVC over two values of C on
# the same validation split, using the project-local `validation` helper.
#
# The results are stored under SVC-specific names on purpose. Rebinding
# `pre_valid_models`, `param_grids` and `post_valid_models` here would overwrite
# the RFC/LDA results of the preceding validation cell, and the training cell's
# `post_valid_models[1]` lookup would then raise IndexError when the notebook
# is run top to bottom.
# To train the SVC instead, reference `svc_post_valid_models[0]` in the
# `models` dict of the training cell.
svc_pre_valid_models = [SVC()]
svc_param_grids = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1],
    }
]
svc_post_valid_models = [
    validation(model=candidate, param_grid=grid, valid_X=valid_X, valid_y=valid_y)
    for candidate, grid in zip(svc_pre_valid_models, svc_param_grids)
]

Training and Predicting

In [35]:
# Query and training features get the same treatment: keep numeric columns,
# then remove the identifier columns and ADP_total. Labels pass through as-is.
query_X = (
    X_query
    .select_dtypes(include='number')
    .drop(columns=["ID", "ADP_total", "pre_nucleus_id", "post_nucleus_id"])
)
query_y = y_query

train_X = (
    X_train
    .select_dtypes(include='number')
    .drop(columns=["ID", "ADP_total", "pre_nucleus_id", "post_nucleus_id"])
)
train_y = y_train

In [36]:
"""
    Function that takes in a dataframe of data and outputs 
    a fitted "optimal" model

    Inputs:
    - train: training set
    - query: query set
    - models: dictionary of (model_name : model function) to train and predict on, with optimized 
    parameters already.

    Outputs:
    - best_clf: The optimum classifier function fitted over training data

    - accuracy_score: list of accuracies based on order of models
    passed.
    """
# Name each validated model; the entries must match, in order, the models that
# were passed through validation, so change this dict when that list changes.
models = {
    "RFC": post_valid_models[0],
    "LDA": post_valid_models[1],
}

# Hand the training split, the query split and the named models to the
# project-local `train_n_predict` helper and unpack its three return values.
accuracy_score, best_clf, trained_models = train_n_predict(
    train_X, train_y, query_X, query_y, models
)

In [37]:
# Show the per-model accuracies returned by `train_n_predict`.
# NOTE(review): the recorded output reports an RFC accuracy of exactly 1.0;
# that may indicate overlap between the training and query data -- verify.
print("The accuracies are", accuracy_score)

The accuracies are {'RFC': 1.0, 'LDA': 0.75692203159504}


In [76]:
# Run the leaderboard file through the project-local `cleaner`, reusing the
# feature-weight and morphology-embedding paths and flagging a submission run.
leaderboard_path = "../Data/leaderboard_data.csv"
sub_data = cleaner(
    leaderboard_path,
    feature_path,
    morph_path,
    submission=True,
)

In [71]:
# Leaderboard frame: numeric columns minus the nucleus ids and ADP_total.
# "ID" is kept at this stage; later cells use it for the submission file.
lb_data = (
    sub_data
    .select_dtypes(include='number')
    .drop(columns=["pre_nucleus_id", "post_nucleus_id", "ADP_total"])
)

In [73]:
#create a boolean prediction solution
lb_data["connected"] = best_clf.predict(lb_data.drop("ID", axis = 1))

In [66]:
# Keep only the row identifier and the predicted label for the submission.
submission_data = lb_data.filter(items=['ID', 'connected'])

In [74]:
# Write the submission to the current working directory, without the index column.
submission_data.to_csv('submission_data.csv',index=False)