In [1]:
import sys
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, make_scorer, accuracy_score 
from imblearn.over_sampling import RandomOverSampler
from sklearn import model_selection
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier


#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


sys.path.insert(0, 'D:\Fall23 Coursework\ELEC478\Competition\elecfinal')

In [2]:
from ml_pipeline import train_n_predict, validation, clean_split
from Data.data_cleaner import cleaner

In [3]:
## Clean data


train_path = "../Data/train_data.csv"
feature_path = "../Data/feature_weights.csv"
morph_path = "../Data/imputed_morph_embed.csv"
X_train, X_val, X_query, y_train, y_val, y_query = clean_split(train_path, feature_path, morph_path)

In [4]:
area1 = ["basal", "soma"]
area2 = ["axon", "apical", "oblique", "apical_shaft"]
area3 = ["apical_tuft"]

def area_cols(df):
    df["area1"] = df["compartment"].isin(area1).astype('int')
    df["area2"] = df["compartment"].isin(area2).astype('int')
    df["area3"] = df["compartment"].isin(area3).astype('int')
    return df

In [5]:
X_train = area_cols(X_train)
X_val = area_cols(X_val)
X_query = area_cols(X_query)

In [6]:
valid_X = X_val.select_dtypes(include='number').drop(["ID","pre_nucleus_id", "post_nucleus_id"], axis = 1)
valid_y = y_val

In [7]:
models = dict()
models['lr'] = LogisticRegression()
models['knn'] = KNeighborsClassifier()
models['cart'] = DecisionTreeClassifier()
models['svm'] = SVC()
models['bayes'] = GaussianNB()


In [8]:
param_grids = [
    {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'n_jobs' : [-1],
    'random_state': [1]
    },
    {
    'n_neighbors' : [2,5,10]
    },
    {
    'criterion' : ['gini','entropy']    
    },
    {
    'kernel' : ['poly', 'rbf']
    },
    {}
    ]

post_valid_models = dict()
i = 0
for model in models:
    best_clf = validation(model = models[model], 
                        param_grid = param_grids[i], 
                        valid_X = valid_X, 
                        valid_y = valid_y)
    post_valid_models[model] = best_clf
    i = i + 1

10 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\86185\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\86185\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\86185\AppData\Loc

In [9]:
post_valid_models

{'lr': LogisticRegression(n_jobs=-1, random_state=1),
 'knn': KNeighborsClassifier(n_neighbors=2),
 'cart': DecisionTreeClassifier(),
 'svm': SVC(),
 'bayes': GaussianNB()}

In [10]:
level0 = list()
level0.append(('lr', post_valid_models['lr']))
level0.append(('knn', post_valid_models['knn']))
level0.append(('cart', post_valid_models['cart']))
level0.append(('svm', post_valid_models['svm']))
level0.append(('bayes', post_valid_models['bayes']))
# define meta learner model
level1 = LogisticRegression()

In [11]:
#Validated stack
stack = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

In [12]:
query_X = X_query.select_dtypes(include='number').drop(["ID", "pre_nucleus_id", "post_nucleus_id"], axis = 1)
query_y = y_query

train_X = X_train.select_dtypes(include='number').drop(["ID", "pre_nucleus_id", "post_nucleus_id"], axis = 1)
train_y = y_train

In [13]:
ros = RandomOverSampler(random_state=0, sampling_strategy = 'minority')
train_X_resampled, train_y_resampled = ros.fit_resample(
        train_X, train_y
    )

X_query_resampled, y_query_resampled = ros.fit_resample(
    query_X, query_y
)

In [14]:
#print("stack accuracy",stack.score(X_query_resampled, y_query_resampled))

In [15]:
stack.fit(train_X_resampled, train_y_resampled)

In [16]:
leaderboard_path = "../Data/leaderboard_data.csv"
sub_data = cleaner(leaderboard_path, feature_path, morph_path, submission = True)
sub_data = area_cols(sub_data)

In [17]:
lb_data = sub_data.select_dtypes(include='number').drop(["pre_nucleus_id", "post_nucleus_id"], axis = 1)

In [18]:
#create a boolean prediction solution
lb_data["connected"] = stack.predict(lb_data.drop("ID", axis = 1))
submission_data = lb_data.filter(['ID','connected'])
submission_data.to_csv('submission_data.csv',index=False)