In [1]:
import sys
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn import model_selection

#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

sys.path.insert(0, '/Users/gracewang/Documents/GitHub/elecfinal')
# sys.path.insert(0, 'D:\Fall23 Coursework\ELEC478\Competition\elecfinal')

In [2]:
from ml_pipeline import train_n_predict, validation, clean_split

In [3]:
from Data.data_cleaner import cleaner

In [5]:
## Clean data


train_path = "../Data/train_data.csv"
feature_path = "../Data/feature_weights.csv"
morph_path = "../Data/morph_embeddings.csv"
X_train, X_val, X_query, y_train, y_val, y_query = clean_split(train_path, feature_path, morph_path)

            ID  adp_dist  post_skeletal_distance_to_soma  \
0        42593   304.185                        353043.0   
1        42594   725.431                        244156.0   
2        42595  3423.030                        363829.0   
3        42596  3442.390                        344267.0   
4        42597  4442.380                        313630.0   
...        ...       ...                             ...   
185827  228420  3162.000                         86928.2   
185828  228421  3503.660                        114862.0   
185829  228422  3939.260                        140923.0   
185830  228423  4192.360                        110553.0   
185831  228424  4838.850                        363943.0   

        pre_skeletal_distance_to_soma  pre_oracle  pre_test_score  \
0                           1182170.0    0.831680        0.704509   
1                            914243.0    0.831680        0.704509   
2                           1171820.0    0.831680        0.704509   
3  

In [9]:
valid_X = X_val.select_dtypes(include='number').drop("ID", axis = 1)
valid_y = y_val

Using Validation

In [10]:
"""
    Function that outputs a model with optimal hyperparameters
    based on a validation set using grid search

    Inputs:
    model: provided model
    param_grid: dictionary of parameters and values to validate on
    e.g. 
    {'C': [0.001,0.01,0.1,1,10], 
    'gamma':[0.1,1,10,100], 
    'kernel':('linear', 'rbf')}
    valid_X: validation X of data (pandas df)
    valid_y: validation y of data

    Outputs: 
    clf: provided model with optimum hyperparameters
    """
pre_valid_models = [RandomForestClassifier(), LinearDiscriminantAnalysis()]
param_grids = [
    {'max_features': ['sqrt', 'log2'],
     'criterion' :['gini', 'entropy']},
     {'shrinkage': [None, 0.5],
      'solver':['svd','eigen']}
]
post_valid_models = []
for i in range(len(pre_valid_models)):
    best_clf = validation(model = pre_valid_models[i], 
                                        param_grid = param_grids[i], 
                                        valid_X = valid_X, 
                                        valid_y = valid_y)
    post_valid_models.append(best_clf)

5 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/discriminant_analysis.py", line 621, in fit
    raise Not

Training and Predicting

In [11]:
query_X = X_query.select_dtypes(include='number').drop("ID", axis = 1)
query_y = y_query

train_X = X_train.select_dtypes(include='number').drop("ID", axis = 1)
train_y = y_train

In [9]:
train_X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120329 entries, 0 to 185831
Data columns (total 14 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   adp_dist                        120329 non-null  float64
 1   post_skeletal_distance_to_soma  120329 non-null  float64
 2   pre_skeletal_distance_to_soma   120329 non-null  float64
 3   pre_oracle                      120329 non-null  float64
 4   pre_test_score                  120329 non-null  float64
 5   post_oracle                     120329 non-null  float64
 6   post_test_score                 120329 non-null  float64
 7   pre_nucleus_id                  120329 non-null  int64  
 8   post_nucleus_id                 120329 non-null  int64  
 9   fw_similarity                   120329 non-null  float64
 10  nuclei_adp_dist                 120329 non-null  float64
 11  ADP_total                       120329 non-null  float64
 12  connect_total        

In [10]:
"""
    Function that takes in a dataframe of data and outputs 
    a fitted "optimal" model

    Inputs:
    - train: training set
    - query: query set
    - models: dictionary of (model_name : model function) to train and predict on, with optimized 
    parameters already.

    Outputs:
    - best_clf: The optimum classifier function fitted over training data

    - accuracy_score: list of accuracies based on order of models
    passed.
    """

models = {"random forest": post_valid_models[0], "lda": post_valid_models[1]}
accuracy_score, best_clf = train_n_predict(train_X, train_y, query_X, query_y, models)

In [14]:
leaderboard_path = "../Data/leaderboard_data.csv"
sub_data = cleaner(leaderboard_path, feature_path, morph_path, submission = True)
sub_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42593 entries, 0 to 42592
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ID                              42593 non-null  int64  
 1   adp_dist                        42593 non-null  float64
 2   post_skeletal_distance_to_soma  42593 non-null  float64
 3   pre_skeletal_distance_to_soma   42593 non-null  float64
 4   pre_oracle                      42593 non-null  float64
 5   pre_test_score                  42593 non-null  float64
 6   post_oracle                     42593 non-null  float64
 7   post_test_score                 42593 non-null  float64
 8   compartment                     42593 non-null  object 
 9   pre_brain_area                  42593 non-null  object 
 10  post_brain_area                 42593 non-null  object 
 11  pre_nucleus_id                  42593 non-null  int64  
 12  post_nucleus_id                 

In [None]:
lb_data = sub_data.select_dtypes(include='number')

In [18]:
connect_info = data[["ID","ADP_total", "connect_total","connect_rate"]]
lb_data = lb_data.merge(connect_info, how = 'left', left_on="ID", right_on="ID")
lb_data.info()

MergeError: Passing 'suffixes' which cause duplicate columns {'connect_rate_x', 'connect_total_x', 'ADP_total_x'} is not allowed.

In [None]:
#create a boolean prediction solution
lb_data["connected"] = best_clf.predict(lb_data.drop("ID", axis = 1))

In [None]:
submission_data = lb_data.filter(['ID','connected'])

In [None]:
submission_data.to_csv('submission_data.csv',index=False)