In [1]:
import pandas as pd

dataset = pd.read_csv('df_for_modeling.csv')

# Feature matrix: every column except the file identifier and the target.
X_df = dataset.drop(['file_name', 'selected'], axis=1)

# Positional feature names x0, x1, ... sized from the frame itself, so the
# rename cannot fail with a length mismatch if the column count changes.
seq = range(X_df.shape[1])
names = [f'x{x}' for x in seq]

# set_axis already returns a new frame. The `inplace` keyword was deprecated
# in pandas 1.5 and removed in pandas 2.0 (passing it raises TypeError there),
# so it is intentionally omitted.
X_df = X_df.set_axis(names, axis=1)

# Target column and the per-row file identifier, kept aside from the features.
y_df = dataset['selected']
files = dataset['file_name']

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical from one run to the next.
x_train, x_test, y_train, y_test = train_test_split(
    X_df, y_df, test_size=.3, random_state=42
)

  X_df = X_df.set_axis(names, axis=1, inplace=False)


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def print_metrics(y_true, y_pred):
    """Print accuracy, sensitivity, specificity and the confusion counts.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    The flattened confusion matrix is unpacked into exactly four values
    (tn, fp, fn, tp), so the call only works when the matrix is 2x2.
    """
    acc = accuracy_score(y_true, y_pred)
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_true, y_pred).ravel()

    # Sensitivity: share of actual positives that were predicted positive.
    sens = true_pos / (true_pos + false_neg)
    # Specificity: share of actual negatives that were predicted negative.
    spec = true_neg / (true_neg + false_pos)

    # Adjacent literals spell out the report text exactly, including the
    # leading newline, the indentation and the trailing spaces on some rows.
    template = (
        "\nAccuracy: %s\n"
        "        Sensitivity: %s \n"
        "        Specificity: %s \n"
        "        TP: %s\n"
        "        TN: %s \n"
        "        FP: %s\n"
        "        FN: %s"
    )
    values = (acc, sens, spec, true_pos, true_neg, false_pos, false_neg)
    print(template % values)



def preliminar_tests_classifier(x_train, y_train, x_test, y_test):
    """Fit several default-configured classifiers and print each one's metrics.

    Each candidate is instantiated with no constructor arguments, fitted on
    the training split, used to predict the test split, and reported through
    ``print_metrics`` under its label.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
    """
    # (display label, estimator class) pairs, evaluated in this order.
    candidates = (
        ('LogisticRegression', LogisticRegression),
        ('NaiveBayes', MultinomialNB),
        ('KnnClassifier', KNeighborsClassifier),
        ('SVM', SVC),
        ('RandomForest', RandomForestClassifier),
        ('XGB', XGBClassifier),
    )

    for label, estimator_cls in candidates:
        clf = estimator_cls()
        clf.fit(x_train, y_train)
        predictions = clf.predict(x_test)
        print(label)
        print_metrics(y_test, predictions)
        print('\n\n')

In [None]:
# Baseline comparison of all candidate classifiers on the held-out split.
preliminar_tests_classifier(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

# Tuning

In [8]:
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV

# Base estimator handed to the hyperparameter search below; only the seed
# (random_state=0) is fixed here, every other setting is left at its default
# unless the search overrides it.
gbm = XGBClassifier(random_state=0)

# Hyperparameter search space: each entry is a scipy distribution that the
# randomized search samples from.
#   - stats.randint(low, high) draws integers in [low, high - 1].
#   - stats.uniform(loc, scale) draws from [loc, loc + scale], NOT [loc, scale];
#     a scale of 0.999 therefore caps learning_rate at 1.0 instead of 1.001,
#     keeping it inside XGBoost's documented [0, 1] range for eta.
param_grid = dict(
    max_depth=stats.randint(3, 20),
    learning_rate=stats.uniform(0.001, 0.999),
    n_estimators=stats.randint(50, 200)
)

# Randomized search: 10 sampled configurations, each scored by 3-fold
# cross-validated accuracy, run on 4 parallel workers with verbose progress.
search = RandomizedSearchCV(
    estimator=gbm,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    refit=True,
    random_state=10,
    n_jobs=4,
    verbose=10,
)

# Run the search on the training split; refit=True asks for the best
# configuration to be refitted once the search finishes.
search.fit(x_train, y_train)

KeyboardInterrupt: 