In [1]:
import sys
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn import model_selection
from sklearn.svm import SVC

#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# Put the project checkout on the import path (presumably where the
# `ml_pipeline` module and `Data` package imported in the next cells live —
# verify). A raw string is required here: in a normal literal the sequences
# `\F`, `\E`, `\C` and `\e` are invalid escapes and raise a
# DeprecationWarning/SyntaxWarning on recent Python versions.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# import the project modules on another machine without editing it.
PROJECT_ROOT = r'D:\Fall23 Coursework\ELEC478\Competition\elecfinal'
if PROJECT_ROOT not in sys.path:
    # Guarded so that re-running the cell does not stack duplicate entries.
    sys.path.insert(0, PROJECT_ROOT)

In [2]:
from ml_pipeline import train_n_predict, validation, clean_split

In [3]:
from Data.data_cleaner import cleaner

In [4]:
## Clean data
# Input CSV locations, relative to the notebook's working directory.
# These three names are reused by later cells (the `cleaner` calls).


train_path = "../Data/train_data.csv"
feature_path = "../Data/feature_weights.csv"
morph_path = "../Data/morph_embeddings.csv"

# `clean_split` (from ml_pipeline) returns six objects: feature frames and
# labels for the train, validation and query partitions, in that order.
# NOTE(review): split proportions and any seeding happen inside `clean_split`
# and are not visible here — confirm the split is reproducible.
X_train, X_val, X_query, y_train, y_val, y_query = clean_split(train_path, feature_path, morph_path)

In [5]:
# Full cleaned dataset built from the same three input files as the split above.
# NOTE(review): `data` is not read by any later cell visible in this notebook;
# presumably kept for ad-hoc inspection — confirm or remove.
data = cleaner(train_path, feature_path, morph_path)

In [6]:
# Validation features: keep only numeric columns, then remove the identifier
# and the three columns that are excluded from the model inputs.
valid_X = (
    X_val
    .select_dtypes(include="number")
    .drop(columns=["ID", "ADP_total", "connect_total", "connect_rate"])
)
valid_y = y_val

Using Validation

In [7]:
"""
    Function that outputs a model with optimal hyperparameters
    based on a validation set using grid search

    Inputs:
    model: provided model
    param_grid: dictionary of parameters and values to validate on
    e.g. 
    {'C': [0.001,0.01,0.1,1,10], 
    'gamma':[0.1,1,10,100], 
    'kernel':('linear', 'rbf')}
    valid_X: validation X of data (pandas df)
    valid_y: validation y of data

    Outputs: 
    clf: provided model with optimum hyperparameters
    """
# Candidate estimators and, position for position, the hyperparameter grid
# each one is searched over.
pre_valid_models = [RandomForestClassifier(), LinearDiscriminantAnalysis()]
param_grids = [
    # RandomForestClassifier grid
    {
        'max_features': ['sqrt', 'log2'],
        'n_jobs': [-1],
        'random_state': [1],
    },
    # LinearDiscriminantAnalysis grid
    {
        'solver': ['lsqr', 'eigen'],
        'shrinkage': ['auto'],
    },
]

# Tuned estimators, collected in the same order as `pre_valid_models`.
post_valid_models = []
for candidate, grid in zip(pre_valid_models, param_grids):
    best_clf = validation(
        model=candidate,
        param_grid=grid,
        valid_X=valid_X,
        valid_y=valid_y,
    )
    post_valid_models.append(best_clf)


USING ALL SVDs

In [41]:
# Same validation procedure as above, now for a support-vector classifier.
# NOTE: this rebinds `pre_valid_models`, `param_grids` and `post_valid_models`,
# so the RF/LDA results from the previous section are discarded.
pre_valid_models = [SVC()]
param_grids = [
    {
        'kernel': ['linear', 'rbf', 'poly'],
        'C': [0.1, 1],
        # Per the scikit-learn SVC docs, `degree` is only used by the 'poly'
        # kernel, so the linear/rbf combinations are refit once per degree value.
        'degree': [2, 3, 4, 5],
    },
]

post_valid_models = []
for candidate, grid in zip(pre_valid_models, param_grids):
    best_clf = validation(
        model=candidate,
        param_grid=grid,
        valid_X=valid_X,
        valid_y=valid_y,
    )
    post_valid_models.append(best_clf)



In [44]:
# Display the first (and only) entry produced by the SVC validation cell.
# NOTE(review): the recorded output below is a bound `get_params` method, not an
# estimator repr, and the execution counts in this notebook are out of order —
# confirm what `validation` returns on a fresh Restart & Run All.
post_valid_models[0]

<bound method BaseEstimator.get_params of SVC(C=1, degree=2, max_iter=1000)>

Training and Predicting

In [42]:
# Columns removed from every feature frame before modelling (same list as the
# validation features use).
non_feature_cols = ["ID", "ADP_total", "connect_total", "connect_rate"]

# Query partition: numeric columns minus the excluded ones.
query_X = X_query.select_dtypes(include="number").drop(columns=non_feature_cols)
query_y = y_query

# Training partition, prepared identically.
train_X = X_train.select_dtypes(include="number").drop(columns=non_feature_cols)
train_y = y_train

In [None]:
"""
    Function that takes in a dataframe of data and outputs 
    a fitted "optimal" model

    Inputs:
    - train: training set
    - query: query set
    - models: dictionary of (model_name : model function) to train and predict on, with optimized 
    parameters already.

    Outputs:
    - best_clf: The optimum classifier function fitted over training data

    - accuracy_score: list of accuracies based on order of models
    passed.
    """

#models = {"SVC": post_valid_models[0]}
#accuracy_score, best_clf = train_n_predict(train_X, train_y, query_X, query_y, models)

In [10]:
#print("this is the best classifier,", best_clf, "with a balanced accuracy score of", accuracy_score)

this is the best classifier, RandomForestClassifier(n_jobs=-1, random_state=1) with a balanced accuracy score of {'RF': 1.0, 'LDA': 0.7443137923390077}


In [46]:
## Fitting SVC
# Oversample the training partition with a fixed seed, using the 'minority'
# sampling strategy, before fitting.
ros = RandomOverSampler(sampling_strategy='minority', random_state=0)
train_X_resampled, train_y_resampled = ros.fit_resample(train_X, train_y)

# Fit the validated SVC on the resampled data. `fit` is the last expression so
# the fitted estimator is echoed as the cell output.
post_valid_models[0].fit(train_X_resampled, train_y_resampled)



In [52]:
# Predict on the resampled training features and store the result alongside
# them in a "pred" column (read by the scoring cell that follows).
# The "pred" column is dropped from the model input first: without this, a
# second run of this cell would pass the previous predictions to `predict` as
# an extra feature and fail on the feature-count mismatch. On the first run the
# column does not exist yet, hence errors="ignore".
train_features = train_X_resampled.drop(columns="pred", errors="ignore")
train_X_resampled["pred"] = post_valid_models[0].predict(train_features)

In [54]:
# Balanced accuracy of the fitted model on the data it was trained on: the
# oversampled training labels versus the predictions stored in "pred".
# NOTE(review): this is a training-set score, not a held-out estimate; the
# query partition (query_X / query_y) is not scored in this notebook.
balanced_accuracy = balanced_accuracy_score(
    y_true=train_y_resampled,
    y_pred=train_X_resampled["pred"],
)
balanced_accuracy

0.4593748022527368

In [55]:
# Load the leaderboard set through the same `cleaner` used for the training
# data, reusing the feature-weight and morphology-embedding files.
# NOTE(review): `submission = True` presumably switches the cleaner to unlabeled
# input — confirm against Data/data_cleaner.py.
leaderboard_path = "../Data/leaderboard_data.csv"
sub_data = cleaner(leaderboard_path, feature_path, morph_path, submission = True)

In [56]:
# Keep only the numeric columns of the leaderboard frame, mirroring the
# `select_dtypes(include='number')` step applied to the train/validation/query frames.
lb_data = sub_data.select_dtypes(include='number')

In [57]:
# Create the prediction column for the submission.
# The model input is every numeric leaderboard column except the identifier.
# "connected" is dropped as well (ignored when absent) so that re-running this
# cell does not feed the previous predictions back to `predict` as a feature.
# NOTE(review): the training features also exclude "ADP_total", "connect_total"
# and "connect_rate"; confirm the leaderboard frame does not contain them,
# otherwise the feature set here differs from the one the model was fitted on.
lb_features = lb_data.drop(columns="ID").drop(columns="connected", errors="ignore")
lb_data["connected"] = post_valid_models[0].predict(lb_features)

In [58]:
# Submission frame: just the identifier and the predicted label.
submission_columns = ['ID', 'connected']
submission_data = lb_data.filter(items=submission_columns)

In [59]:
# Write the submission file into the notebook's working directory, without the
# DataFrame index column.
submission_data.to_csv('submission_data.csv',index=False)