In [6]:
import sys
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, make_scorer, accuracy_score 
from imblearn.over_sampling import RandomOverSampler
from sklearn import model_selection
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier


#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


sys.path.insert(0, 'D:\Fall23 Coursework\ELEC478\Competition\elecfinal')

In [3]:
from ml_pipeline import train_n_predict, validation, clean_split
from Data.data_cleaner import cleaner

In [4]:
## Clean data


train_path = "../Data/train_data.csv"
feature_path = "../Data/feature_weights.csv"
morph_path = "../Data/imputed_morph_embed.csv"
X_train, X_val, X_query, y_train, y_val, y_query = clean_split(train_path, feature_path, morph_path)

In [21]:
area1 = ["basal", "soma"]
area2 = ["axon", "apical", "oblique", "apical_shaft"]
area3 = ["apical_tuft"]

def area_cols(df):
    df["area1"] = df["compartment"].isin(area1).astype('int')
    df["area2"] = df["compartment"].isin(area2).astype('int')
    df["area3"] = df["compartment"].isin(area3).astype('int')
    return df

In [22]:
X_train = area_cols(X_train)
X_val = area_cols(X_val)
X_query = area_cols(X_query)

In [23]:
valid_X = X_val.select_dtypes(include='number').drop(["ID","pre_nucleus_id", "post_nucleus_id"], axis = 1)
valid_y = y_val

In [24]:
# define the base models
level0 = list()
level0.append(('lr', LogisticRegression()))
level0.append(('knn', KNeighborsClassifier()))
level0.append(('cart', DecisionTreeClassifier()))
level0.append(('svm', SVC()))
level0.append(('bayes', GaussianNB()))
# define meta learner model
level1 = LogisticRegression()

In [12]:
models = dict()
models['lr'] = LogisticRegression()
models['knn'] = KNeighborsClassifier()
models['cart'] = DecisionTreeClassifier()
models['svm'] = SVC()
models['bayes'] = GaussianNB()


In [25]:
param_grids = [
    {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'n_jobs' : [-1],
    'random_state': [1]
    },
    {
    'n_neighbors' : [2,5,10]
    },
    {
    'criterion' : ['gini','entropy']    
    },
    {
    'kernel' : ['poly', 'rbf']
    },
    {}
    ]

post_valid_models = dict()
i = 0
for model in models:
    best_clf = validation(model = models[model], 
                        param_grid = param_grids[i], 
                        valid_X = valid_X, 
                        valid_y = valid_y)
    post_valid_models[model] = best_clf
    i = i + 1

In [26]:
post_valid_models

{'lr': LogisticRegression(n_jobs=-1, random_state=1),
 'knn': KNeighborsClassifier(n_neighbors=2),
 'cart': DecisionTreeClassifier(criterion='entropy'),
 'svm': SVC(),
 'bayes': GaussianNB()}

In [27]:
level0 = list()
level0.append(('lr', post_valid_models['lr']))
level0.append(('knn', post_valid_models['knn']))
level0.append(('cart', post_valid_models['cart']))
level0.append(('svm', post_valid_models['svm']))
level0.append(('bayes', post_valid_models['bayes']))
# define meta learner model
level1 = LogisticRegression()

In [28]:
#Validated stack
stack = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

In [29]:
query_X = X_query.select_dtypes(include='number').drop(["ID", "pre_nucleus_id", "post_nucleus_id"], axis = 1)
query_y = y_query

train_X = X_train.select_dtypes(include='number').drop(["ID", "pre_nucleus_id", "post_nucleus_id"], axis = 1)
train_y = y_train

In [39]:
ros = RandomOverSampler(random_state=0, sampling_strategy = 'minority')
train_X_resampled, train_y_resampled = ros.fit_resample(
        train_X, train_y
    )

X_query_resampled, y_query_resampled = ros.fit_resample(
    query_X, query_y
)

In [40]:
stack.fit(X_query_resampled, y_query_resampled)

In [None]:
print("stack accuracy",stack.score(X_query_resampled, y_query_resampled))

stack accuracy 0.9927599092486321


In [None]:
stack.fit(train_X_resampled, train_y_resampled)

In [None]:
leaderboard_path = "../Data/leaderboard_data.csv"
sub_data = cleaner(leaderboard_path, feature_path, morph_path, submission = True)
sub_data = area_cols(sub_data)

In [None]:
lb_data = sub_data.select_dtypes(include='number').drop(["pre_nucleus_id", "post_nucleus_id"], axis = 1)

In [None]:
lb_data

Unnamed: 0,ID,adp_dist,post_skeletal_distance_to_soma,pre_skeletal_distance_to_soma,pre_oracle,pre_test_score,post_oracle,post_test_score,me_similarity,fw_similarity,nuclei_adp_dist,area1,area2,area3
0,0,1.569857,1.066950,-1.520389,0.371616,-0.365468,0.880813,0.454349,-0.824571,-1.613455,3.221214,0,1,0
1,1,1.072992,1.548930,0.949414,0.371616,-0.365468,1.170133,-0.223748,-0.824571,1.856578,-0.818251,0,1,0
2,2,0.283729,-0.977701,-1.010610,0.371616,-0.365468,-0.889101,0.590201,-0.824571,-0.923427,2.449958,1,0,0
3,3,0.531271,-0.742668,-0.920656,0.371616,-0.365468,-0.889101,0.590201,-0.824571,-0.923427,2.283731,0,1,0
4,4,1.334973,-0.020021,-1.344421,0.371616,-0.365468,0.581479,0.526492,-0.824571,-1.314208,3.030948,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42588,42588,-0.620528,-0.368073,-0.917464,0.430932,0.065340,-0.040713,-0.247062,1.202609,-0.329380,-0.024980,1,0,0
42589,42589,0.073549,0.489645,-0.608341,0.430932,0.065340,-0.040713,-0.247062,1.202609,-0.329380,-1.069079,0,1,0
42590,42590,0.101911,-0.629141,-1.047202,0.430932,0.065340,-0.040713,-0.247062,1.202609,-0.329380,-0.270275,1,0,0
42591,42591,0.923462,0.536486,-0.695932,0.430932,0.065340,-0.040713,-0.247062,1.202609,-0.329380,-1.130779,0,1,0


In [None]:
#create a boolean prediction solution
lb_data["connected"] = stack.predict(lb_data.drop("ID", axis = 1))
submission_data = lb_data.filter(['ID','connected'])
submission_data.to_csv('submission_data.csv',index=False)