In [39]:
import os
import numpy as np
import pandas as pd

from collections import Counter

from sklearn.utils import resample
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.covariance import OAS

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import ComplementNB, MultinomialNB, GaussianNB
from sklearn.ensemble import BaggingClassifier, StackingClassifier

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from joblib import dump

In [40]:
# H0: Simpler models tend to outperform more complex models.
# H1: Models using factually relevant data tend to outperform models that do not.

def _balance_classes(data_frame, target_col, min_samples=20, random_state=232):
    """Return a class-balanced copy of ``data_frame`` built by upsampling.

    The most frequent label in ``target_col`` is treated as the majority class.
    Every other class with more than ``min_samples`` rows is resampled with
    replacement up to the majority size. Classes with ``min_samples`` rows or
    fewer are left out entirely instead of being inflated from a handful of
    observations. Rows whose target is missing are ignored.

    Raises:
        ValueError: if fewer than two classes remain after the size filter.
    """
    class_counts = data_frame[target_col].value_counts()
    if class_counts.empty:
        raise ValueError(f"column {target_col!r} contains no labels")

    majority_label = class_counts.idxmax()
    majority_size = int(class_counts.loc[majority_label])
    kept_minorities = [
        label
        for label, count in class_counts.items()
        if label != majority_label and count > min_samples
    ]
    # Fail before any model is built: a single remaining class cannot be
    # cross-validated by the classifiers used below.
    if not kept_minorities:
        raise ValueError(
            f"need at least two classes with more than {min_samples} rows in "
            f"{target_col!r}; class counts: {class_counts.to_dict()}"
        )

    parts = [data_frame[data_frame[target_col] == majority_label]]
    for label in kept_minorities:
        minority_rows = data_frame[data_frame[target_col] == label]
        parts.append(
            resample(
                minority_rows,
                replace=True,
                n_samples=majority_size,
                random_state=random_state,
            )
        )
    return pd.concat(parts, ignore_index=True)


def _drop_correlated_features(x, threshold=0.95):
    """Drop one feature out of every pair whose |correlation| exceeds ``threshold``."""
    corr_matrix = x.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and a
    # column is never compared with itself.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
    return x.drop(columns=to_drop)


def _candidate_classifiers(folds):
    """Build the name -> estimator mapping of models that compete in ``train_model``."""
    shrinkage_covariance = OAS(store_precision=False, assume_centered=False)
    return {
        'SGR': StackingClassifier(
            estimators=[('gbc', GradientBoostingClassifier()), ('rc', RidgeClassifier())],
            final_estimator=SVC(),
        ),
        'SAG': StackingClassifier(
            estimators=[('ada', AdaBoostClassifier()), ('gnb', GaussianNB())],
            final_estimator=SVC(),
        ),

        'BRF': BaggingClassifier(RandomForestClassifier(), n_estimators=10, random_state=232),
        'OAS': LinearDiscriminantAnalysis(solver="lsqr", covariance_estimator=shrinkage_covariance),
        'PAC': CalibratedClassifierCV(PassiveAggressiveClassifier(), cv=folds),

        # Below 80%
        'BSC': BaggingClassifier(SGDClassifier(), n_estimators=10, random_state=232),
        'BlC': BaggingClassifier(SGDClassifier(loss="log_loss"), n_estimators=10, random_state=232),

        # Your Model(s) - Do not overfit and do K.I.S.S the environment.
        # 1
        # 2
        # 3
    }


def train_model(setname, data_frame, target_col="y", folds=3):
    """Select, fit and persist the best classifier / feature subset for a dataset.

    Steps:
      1. Balance the classes by upsampling minority classes.
      2. Drop features that are highly correlated (> 0.95) with an earlier one.
      3. Rank the remaining features once by their ANOVA F-score.
      4. For every candidate classifier, cross-validate on the top-k ranked
         features for k = 1..n_features and keep the best k.
      5. Refit the overall winner on all balanced rows and save it to
         ``./models/{setname}.joblib``.

    Args:
        setname: dataset name; used for the saved model's file name.
        data_frame: feature columns plus the target column.
        target_col: name of the label column.
        folds: number of cross-validation folds.

    Returns:
        Tuple ``(classifier_name, score, feature_names)`` where ``score`` is the
        winner's mean cross-validated accuracy rounded to 4 decimals and
        ``feature_names`` is the list of selected columns (best ranked first).

    Raises:
        KeyError: if ``target_col`` is not a column of ``data_frame``.
        ValueError: if fewer than two usable classes or no features remain, or
            if no classifier produced a valid cross-validation score.

    NOTE(review): upsampling and feature ranking both happen before
    cross-validation, so duplicated minority rows can land in both the training
    and validation folds. The reported accuracy is therefore optimistic.
    """
    if target_col not in data_frame.columns:
        raise KeyError(f"target column {target_col!r} not found in data_frame")

    balanced = _balance_classes(data_frame, target_col)
    x = _drop_correlated_features(balanced.drop(columns=[target_col]))
    y = balanced[target_col]

    n_features = x.shape[1]
    if n_features == 0:
        raise ValueError("no feature columns available for training")

    # The F-scores depend neither on k nor on the classifier, so rank once.
    # nlargest places NaN scores (e.g. constant features) last.
    f_scores, _ = f_classif(x, y)
    ranked_features = (
        pd.Series(f_scores, index=x.columns).nlargest(n_features).index.to_numpy()
    )

    classifiers = _candidate_classifiers(folds)

    evaluated_names = []
    best_features = []
    best_scores = []
    for name, classifier in classifiers.items():
        classifier_pipeline = Pipeline([
            ('classifier', classifier)
        ])

        best_k_score = -np.inf
        best_k_features = None
        for n_selected in range(1, n_features + 1):
            candidate = ranked_features[:n_selected]
            cv_score = cross_val_score(
                classifier_pipeline, x[candidate], y, cv=folds, scoring='accuracy'
            ).mean()
            # A NaN score never compares greater, so failed fits are ignored.
            if cv_score > best_k_score:
                best_k_score = cv_score
                best_k_features = candidate

        if best_k_features is None:
            # Every evaluation failed for this classifier; leave it out of the
            # comparison rather than reusing another model's features.
            continue

        evaluated_names.append(name)
        best_features.append(best_k_features)
        best_scores.append(best_k_score)

    if not best_scores:
        raise ValueError(f"no classifier could be cross-validated for {setname!r}")

    best_classifier_index = int(np.argmax(best_scores))
    best_classifier_name = evaluated_names[best_classifier_index]
    best_feature_name = best_features[best_classifier_index]
    best_score = best_scores[best_classifier_index]

    best_classifier_pipeline = Pipeline([
        ('classifier', classifiers[best_classifier_name])
    ])
    best_classifier_pipeline.fit(x[best_feature_name], y)

    # Save the model (creating the output directory on first use).
    filename = f"./models/{setname}.joblib"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    dump(best_classifier_pipeline, filename)

    return best_classifier_name, round(best_score, 4), best_feature_name.tolist()

In [41]:
# Names of the datasets to process. use_data() reads each one from
# ./datasets/<name>.parquet and train_model() saves the fitted model to
# ./models/<name>.joblib. Left empty here; add names before running.
datasets = []

In [42]:
def use_data(dataset_names=None):
    """Train a model for each dataset and collect the per-dataset results.

    Each dataset is read from ``./datasets/{setname}.parquet``, reduced to the
    columns ``X0``..``X15`` plus ``y`` and passed to ``train_model``.

    Args:
        dataset_names: iterable of dataset names to process. Defaults to the
            module-level ``datasets`` list when omitted.

    Returns:
        DataFrame with one row per dataset and the columns ``setname``,
        ``classifier``, ``score`` and ``features``. The columns are present
        even when no dataset was processed, so callers can sort by ``score``
        unconditionally.
    """
    if dataset_names is None:
        dataset_names = datasets

    wanted_columns = [f"X{i}" for i in range(16)] + ["y"]

    # Collect plain records and build the frame once instead of growing it
    # with pd.concat on every iteration.
    records = []
    for setname in dataset_names:
        filename = f"./datasets/{setname}.parquet"
        existing_data = pd.read_parquet(filename)
        data = existing_data[wanted_columns]

        bclf, score, features = train_model(setname, data)
        records.append((setname, bclf, score, features))

    return pd.DataFrame(records, columns=['setname', 'classifier', 'score', 'features'])

In [43]:
# Train one model per configured dataset, then rank the results best-first.
selection = use_data()
selected = (
    selection
    .sort_values(by=["score"], ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Persist the ranked results, then report the mean of the score column
# followed by the full table.
filename = "./models/classifiers.csv"
selected.to_csv(filename)
mean_score = selected['score'].mean()
print(f"Your CAS: {mean_score}")
print(" ")
print(selected)