In [1]:
import sys
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn import model_selection

#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# Put the local project checkout on the import path so that project modules
# can be imported by the cells below.
# A raw string is used because the Windows path contains backslashes that
# would otherwise be parsed as (invalid) escape sequences such as "\F" or "\e".
# NOTE(review): this is a machine-specific absolute path; it must be adjusted
# when the notebook is run on another machine.
sys.path.insert(0, r'D:\Fall23 Coursework\ELEC478\Competition\elecfinal')

In [2]:
from ml_pipeline import train_n_predict, validation

In [3]:
from Data.data_cleaner import cleaner

In [None]:
## Clean data
# Locations of the three input CSV files, relative to the working directory.
train_path, feature_path, morph_path = (
    "../Data/train_data.csv",
    "../Data/feature_weights.csv",
    "../Data/morph_embeddings.csv",
)

# Hand all three paths to the project's `cleaner` helper; its result is the
# table used by every later cell.
data = cleaner(train_path, feature_path, morph_path)

In [5]:
# Separate the 'connected' target column from the remaining columns.
y = data['connected']
X = data.drop('connected', axis=1)

# First split: half of the rows go to training, the other half is held out.
X_train, X_oth, y_train, y_oth = model_selection.train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Second split of the held-out half: 75% validation, 25% query.
X_val, X_query, y_val, y_query = model_selection.train_test_split(
    X_oth, y_oth, test_size=0.25, random_state=42
)

In [6]:
# Validation labels are used unchanged.
valid_y = y_val
# Validation features: numeric columns only, with the "ID" column removed.
valid_X = X_val.select_dtypes(include='number').drop(columns="ID")

Using Validation

In [8]:
"""
    Function that outputs a model with optimal hyperparameters
    based on a validation set using grid search

    Inputs:
    model: provided model
    param_grid: dictionary of parameters and values to validate on
    e.g. 
    {'C': [0.001,0.01,0.1,1,10], 
    'gamma':[0.1,1,10,100], 
    'kernel':('linear', 'rbf')}
    valid_X: validation X of data (pandas df)
    valid_y: validation y of data

    Outputs: 
    clf: provided model with optimum hyperparameters
    """
# Candidate estimators and their hyperparameter grids, paired by position.
# random_state is pinned (same seed as the data splits) so that the random
# forest, and therefore the hyperparameter search, is reproducible across runs.
pre_valid_models = [RandomForestClassifier(random_state=42)]
param_grids = [
    {'max_features': ['sqrt', 'log2'],
     'criterion': ['gini', 'entropy']}
]

# Run the project's `validation` helper once per (estimator, grid) pair on the
# validation split and collect the returned estimators in the same order.
post_valid_models = []
for model, param_grid in zip(pre_valid_models, param_grids):
    best_clf = validation(model=model,
                          param_grid=param_grid,
                          valid_X=valid_X,
                          valid_y=valid_y)
    post_valid_models.append(best_clf)

Training and Predicting

In [19]:
# Training split: labels as-is, features restricted to numeric columns with
# the "ID" column removed.
train_y = y_train
train_X = X_train.select_dtypes(include='number').drop(columns="ID")

# Query split, prepared the same way.
query_y = y_query
query_X = X_query.select_dtypes(include='number').drop(columns="ID")

In [20]:
"""
    Function that takes in a dataframe of data and outputs 
    a fitted "optimal" model

    Inputs:
    - train: training set
    - query: query set
    - models: dictionary of (model_name : model function) to train and predict on, with optimized 
    parameters already.

    Outputs:
    - best_clf: The optimum classifier function fitted over training data

    - accuracy_score: list of accuracies based on order of models
    passed.
    """

# Estimators to compare, keyed by display name: the random forest returned by
# the validation step and a default-constructed LDA.
models = {
    "random forest": post_valid_models[0],
    "lda": LinearDiscriminantAnalysis(),
}

# `train_n_predict` returns a pair; unpack it as (scores, classifier).
accuracy_score, best_clf = train_n_predict(
    train_X, train_y, query_X, query_y, models
)

In [21]:
# Display the scores returned by train_n_predict (bare expression -> cell output).
accuracy_score

{'random forest': 1.0, 'lda': 0.7583929733246584}

In [24]:
# Look up the estimator whose name has the highest score; on ties the first
# name in `accuracy_score` wins.
clf = models[max(accuracy_score.items(), key=lambda item: item[1])[0]]

In [25]:
# Fit the selected estimator on the training features and labels.
clf.fit(train_X, train_y)


In [None]:
# Path of the leaderboard CSV, relative to the working directory.
leaderboard_data_path = "../Data/leaderboard_data.csv"
# Run the leaderboard file through the same `cleaner` helper, reusing the
# feature-weight and morph-embedding paths defined in the data-cleaning cell.
test_data = cleaner(leaderboard_data_path, feature_path, morph_path)