## Part of the code used for my thesis "Analysis of first-year university student dropout through machine learning models: A comparison between universities"

In [1]:
# Imports
import pandas as pd
import numpy as np
from plotnine import *
import matplotlib.pyplot as plt

#from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
#from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.naive_bayes import ComplementNB
# from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Neural networks
import tensorflow as tf
from tensorflow.keras import *
from tensorflow.keras import backend as K

In [2]:
# Browser notification for cell execution
%load_ext jupyternotify
# First line in cell
# %%notify

<IPython.core.display.Javascript object>

In [None]:
# Save folds for the entire thesis
# from https://stackoverflow.com/questions/54317242/k-fold-cross-validation-save-folds-for-different-models
skf = StratifiedKFold(n_splits=10, shuffle=True)
X = df.drop(columns=["semestres"])
y = df["semestres"]
folds = {}
count = 1
for train_index, test_index in skf.split(X, y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    #X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    #y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    folds['fold_{}'.format(count)] = {}
    folds['fold_{}'.format(count)]['train_index'] = train_index.tolist()
    folds['fold_{}'.format(count)]['test_index'] = test_index.tolist()
    count += 1
print(len(folds) == 10) #assert we have the same number of splits

#print(len(train_index))
#print(len(test_index))

#dump folds to json, dont overwrite!
# import json
# with open('folds_v2.json', 'w') as fp:
#     json.dump(folds, fp)

In [3]:
#load dict to be used
import json
with open('folds.json') as f:
    kfolds = json.load(f)

### Train models over both-universities-dataset, using cross validation and evaluate using 6 different metrics

In [4]:
#data both-universities-dataset
df=pd.read_csv("BD_both.csv", decimal = ".")
df=df.drop(columns=["Unnamed: 0"]) #extra column

# Minor fixes
df.loc[df["region"] == "7.0", "region"] = "7"
df.loc[df["region"] == "6.0", "region"] = "6"
df.loc[df["region"] == "13.0", "region"] = "13"
df.loc[df["region"] == "5.0", "region"] = "5"

df=df.drop(columns=["ID"])
df=df.drop(columns=["year"])

In [5]:
''' Loss and metric for Neural network'''
def f1(y_true, y_pred):
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)

def f1_loss(y_true, y_pred):
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

''' Evaluation with all metrics'''
def metrics_evaluate(y_test, y_pred, X_test, model, auc_flag=True):
    f1Score = f1_score(y_test, y_pred)
    f1Score0 = f1_score(y_test, y_pred, pos_label=0)
    acc = accuracy_score(y_test, y_pred)
    if auc_flag == True:
        aucScore = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    else:
        aucScore = 0
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    
    return f1Score,f1Score0,acc,aucScore,prec,rec

In [10]:
%%notify

# For simplicity, dict for metric
scores = {"random": np.zeros(10),
             "dectree": np.zeros(10),
             "gradboost": np.zeros(10),
             "logreg": np.zeros(10),
             "naive": np.zeros(10),
             "knn": np.zeros(10),
             "svm": np.zeros(10),
             "ranforest": np.zeros(10),
             "neural": np.zeros(10)
         }
scores0 = {"random": np.zeros(10),
             "dectree": np.zeros(10),
             "gradboost": np.zeros(10),
             "logreg": np.zeros(10),
             "naive": np.zeros(10),
             "knn": np.zeros(10),
             "svm": np.zeros(10),
             "ranforest": np.zeros(10),
             "neural": np.zeros(10)
         }

accuracy = {"random": np.zeros(10),
             "dectree": np.zeros(10),
             "gradboost": np.zeros(10),
             "logreg": np.zeros(10),
             "naive": np.zeros(10),
             "knn": np.zeros(10),
             "svm": np.zeros(10),
             "ranforest": np.zeros(10),
             "neural": np.zeros(10)
         }

auc = {"random": np.zeros(10),
             "dectree": np.zeros(10),
             "gradboost": np.zeros(10),
             "logreg": np.zeros(10),
             "naive": np.zeros(10),
             "knn": np.zeros(10),
             "svm": np.zeros(10),
             "ranforest": np.zeros(10),
             "neural": np.zeros(10)
         }

precision = {"random": np.zeros(10),
             "dectree": np.zeros(10),
             "gradboost": np.zeros(10),
             "logreg": np.zeros(10),
             "naive": np.zeros(10),
             "knn": np.zeros(10),
             "svm": np.zeros(10),
             "ranforest": np.zeros(10),
             "neural": np.zeros(10)
         }

recall = {"random": np.zeros(10),
          "dectree": np.zeros(10),
             "gradboost": np.zeros(10),
             "logreg": np.zeros(10),
             "naive": np.zeros(10),
             "knn": np.zeros(10),
             "svm": np.zeros(10),
             "ranforest": np.zeros(10),
             "neural": np.zeros(10)
         }

frame = pd.DataFrame(columns=["Model",
                              "F1+ Means",
                              "F1+ Error",
                              "F1- Means",
                              "F1- Error",
                              "Acc Means",
                              "Acc Error",
                              "AUC",
                              "AUC Error",
                             "Prec Means",
                              "Prec Error",
                              "Rec Means",
                              "Rec Error"])

var_num = ["nem", "ranking", "mat", "lang", "optional", "pps", "preference"]
var_cat = ["gender", "school", "admission", "commune","region", "university"]

df_oneHot = pd.get_dummies(df, prefix=['gender',
                                'school',
                                'admission',
                                'commune',
                                'region',
                                'university'
                               ],
                              columns=['gender',
                                      'school',
                                       'admission',
                                       'commune',
                                       'region',
                                       'university'])

fold_count = 1
for fold, indexes in kfolds.items():
    aucScore = 0
    print(fold)
    
    train_index = indexes['train_index']
    test_index = indexes['test_index']
    trainSet = df.loc[train_index] # slice using kfold
    testSet = df.loc[test_index]   # slice using kfold
    
    trainSet_oneHot = df_oneHot.loc[train_index] # slice using kfold
    testSet_oneHot = df_oneHot.loc[test_index]   # slice using kfold
    
    # Drop nulls
    trainSet = trainSet.dropna()
    testSet = testSet.dropna()
    
    trainSet_oneHot = trainSet_oneHot.dropna()
    testSet_oneHot = testSet_oneHot.dropna()
    
    #Split X and Y
    #Undersampling
    n = pd.value_counts(trainSet["semestres"])[0]-pd.value_counts(trainSet["semestres"])[1]
    trainSet = trainSet.drop(trainSet[trainSet["semestres"]== 0].sample(n=n, random_state=fold_count).index)
    
    n_oneHot = pd.value_counts(trainSet_oneHot["semestres"])[0]-pd.value_counts(trainSet_oneHot["semestres"])[1]
    trainSet_oneHot = trainSet_oneHot.drop(trainSet_oneHot[trainSet_oneHot["semestres"]== 0].sample(n=n, random_state=fold_count).index)
    
    X_train = trainSet.drop(columns="semestres")
    y_train = trainSet["semestres"]
    X_test = testSet.drop(columns="semestres")
    y_test = testSet["semestres"]
    
    #oneHot
    X_train_oneHot = trainSet_oneHot.drop(columns="semestres")
    y_train_oneHot = trainSet_oneHot["semestres"]
    X_test_oneHot = testSet_oneHot.drop(columns="semestres")
    y_test_oneHot = testSet_oneHot["semestres"]
    
    
    #####
    # MODELS
    #####
    
    # random model
    model = "random"
    print(model)
    y_pred_prob = np.random.uniform(low=0, high=1, size=X_test.shape[0])
    y_pred = [1 if i>=0.5 else 0 for i in y_pred_prob]
    #cmatrix = confusion_matrix(y_test, y_pred)
    #print(cmatrix)

    # Evaluate
    f1Score,f1Score0,acc,aucScore,prec,rec = metrics_evaluate(
        y_test, y_pred, X_test, y_pred_prob, auc_flag=False)
    
    scores[model][fold_count-1] = f1Score
    scores0[model][fold_count-1] = f1Score0
    accuracy[model][fold_count-1] = acc
    auc[model][fold_count-1] = roc_auc_score(y_test, y_pred_prob)
    precision[model][fold_count-1] = prec
    recall[model][fold_count-1] = rec
    
    
    # dectree
    model = "dectree"
    print(model)
    dectree = DecisionTreeClassifier(random_state=0, min_samples_leaf=187)
    y_pred = dectree.fit(X_train_oneHot, y_train_oneHot).predict(X_test_oneHot)
    #cmatrix = confusion_matrix(y_test_oneHot.tolist(), y_pred.tolist())
    #print(cmatrix)
    
    # Evaluate
    f1Score,f1Score0,acc,aucScore,prec,rec = metrics_evaluate(
        y_test_oneHot, y_pred, X_test_oneHot, dectree)
    
    scores[model][fold_count-1] = f1Score
    scores0[model][fold_count-1] = f1Score0
    accuracy[model][fold_count-1] = acc
    auc[model][fold_count-1] = aucScore
    precision[model][fold_count-1] = prec
    recall[model][fold_count-1] = rec
    
    # logreg
    model = "logreg"
    print(model)
    # Optimal variables
    opt_logreg = ["lang", "pps", "gender", "mat", "school", "optional", "admission"]
    X_train_logreg = X_train[opt_logreg]
    X_test_logreg = X_test[opt_logreg]
    
    var_num_use_logreg = list(set(opt_logreg).intersection(set(var_num)))
    var_cat_use_logreg = list(set(opt_logreg).intersection(set(var_cat)))
    
    # ONE HOT ENCODING
    enc_onehot = OneHotEncoder(handle_unknown='ignore')
    enc_onehot.fit(X_train_logreg[var_cat_use_logreg])
    # one hot train
    labels = enc_onehot.fit_transform(X_train_logreg[var_cat_use_logreg]).toarray()
    encoded_vars = enc_onehot.get_feature_names(var_cat_use_logreg)
    X_train_logreg = X_train_logreg.drop(columns=var_cat_use_logreg) # delete duplicated cat
    X_train_logreg[encoded_vars] = labels
    # one hot test
    labels = enc_onehot.fit_transform(X_test_logreg[var_cat_use_logreg]).toarray()
    encoded_vars = enc_onehot.get_feature_names(var_cat_use_logreg)
    X_test_logreg = X_test_logreg.drop(columns=var_cat_use_logreg) # delete duplicated cat
    X_test_logreg[encoded_vars] = labels

    # Fix number of variables
    #print("train",X_train_logreg.shape, "test",X_test_logreg.shape)
    dif = list(set(X_train_logreg.columns) - set(X_test_logreg.columns))
    #print(dif)
    if len(dif) != 0:
        X_test_logreg[dif] = 0
    
    logreg = LogisticRegression(verbose=0, solver='lbfgs')
    y_pred = logreg.fit(X_train_logreg, y_train).predict(X_test_logreg)
    #cmatrix = confusion_matrix(y_test.tolist(), y_pred.tolist())
    #print(cmatrix)
    
    # Evaluate
    f1Score,f1Score0,acc,aucScore,prec,rec = metrics_evaluate(
        y_test, y_pred, X_test_logreg, logreg)
    
    scores[model][fold_count-1] = f1Score
    scores0[model][fold_count-1] = f1Score0
    accuracy[model][fold_count-1] = acc
    auc[model][fold_count-1] = aucScore
    precision[model][fold_count-1] = prec
    recall[model][fold_count-1] = rec
    
    # naive
    model = "naive"
    print(model)
    gnb = GaussianNB()
    # Optimal variables
    X_train_naive = X_train[['mat', 'pps','optional','ranking','nem']]
    X_test_naive = X_test[['mat', 'pps','optional','ranking','nem']]
    
    y_pred = gnb.fit(X_train_naive, y_train).predict(X_test_naive)
    #cmatrix = confusion_matrix(y_test.tolist(), y_pred.tolist())
    #print(cmatrix)
    
    # Evaluate
    f1Score,f1Score0,acc,aucScore,prec,rec = metrics_evaluate(
        y_test, y_pred, X_test_naive, gnb)
    
    scores[model][fold_count-1] = f1Score
    scores0[model][fold_count-1] = f1Score0
    accuracy[model][fold_count-1] = acc
    auc[model][fold_count-1] = aucScore
    precision[model][fold_count-1] = prec
    recall[model][fold_count-1] = rec
    
    # KNN
    model = "knn"
    print(model)
    # Optimal variables
    X_train_knn = X_train[['mat', 'lang', 'pps','optional']]
    X_test_knn = X_test[['mat', 'lang', 'pps','optional']]
    
    neigh = KNeighborsClassifier(n_neighbors=29)
    y_pred = neigh.fit(X_train_knn, y_train).predict(X_test_knn)
    cmatrix = confusion_matrix(y_test.tolist(), y_pred.tolist())
    
    # evaluate
    f1Score,f1Score0,acc,aucScore,prec,rec = metrics_evaluate(
        y_test, y_pred, X_test_knn, neigh)
    scores[model][fold_count-1] = f1Score
    scores0[model][fold_count-1] = f1Score0
    accuracy[model][fold_count-1] = acc
    auc[model][fold_count-1] = aucScore
    precision[model][fold_count-1] = prec
    recall[model][fold_count-1] = rec
    
    # SVM
    model = "svm"
    print(model)
    # Selección vars optimas
    X_train_svm = X_train[["nem", "ranking", "mat", "lang", "optional", "pps"]]
    X_test_svm = X_test[["nem", "ranking", "mat", "lang", "optional", "pps"]]
    support = svm.SVC(C=10, kernel="poly", probability=True)
    y_pred = support.fit(X_train_svm, y_train).predict(X_test_svm) 
    #cmatrix = confusion_matrix(y_test.tolist(), y_pred.tolist())
    #print(cmatrix)
    
    # Evaluate
    f1Score,f1Score0,acc,aucScore,prec,rec = metrics_evaluate(
        y_test, y_pred, X_test_svm, support)
    
    scores[model][fold_count-1] = f1Score
    scores0[model][fold_count-1] = f1Score0
    accuracy[model][fold_count-1] = acc
    auc[model][fold_count-1] = aucScore
    precision[model][fold_count-1] = prec
    recall[model][fold_count-1] = rec
    
    # ranforest
    model = "ranforest"
    print(model)
    ranforest = RandomForestClassifier(n_estimators=500,max_features=20,
                                       min_samples_leaf=100, n_jobs=-1)
    y_pred = ranforest.fit(X_train_oneHot, y_train_oneHot).predict(X_test_oneHot) 
    #cmatrix = confusion_matrix(y_test_oneHot.tolist(), y_pred.tolist())
    #print(cmatrix)
    
    # Evaluate
    f1Score,f1Score0,acc,aucScore,prec,rec = metrics_evaluate(
        y_test_oneHot, y_pred, X_test_oneHot, ranforest)
    
    scores[model][fold_count-1] = f1Score
    scores0[model][fold_count-1] = f1Score0
    accuracy[model][fold_count-1] = acc
    auc[model][fold_count-1] = aucScore
    precision[model][fold_count-1] = prec
    recall[model][fold_count-1] = rec
    
    # gradboost
    model = "gradboost"
    print(model)
    gradboost = GradientBoostingClassifier(random_state=0,
                                           min_samples_split=2,
                                           n_estimators=110,
                                           min_samples_leaf = 150,
                                           max_features=6
                                          )
    
    y_pred = gradboost.fit(X_train_oneHot, y_train_oneHot).predict(X_test_oneHot) 
    #cmatrix = confusion_matrix(y_test_oneHot.tolist(), y_pred.tolist())
    #print(cmatrix)
    
    # Evaluate
    f1Score,f1Score0,acc,aucScore,prec,rec = metrics_evaluate(
        y_test_oneHot, y_pred, X_test_oneHot, gradboost)
    
    scores[model][fold_count-1] = f1Score
    scores0[model][fold_count-1] = f1Score0
    accuracy[model][fold_count-1] = acc
    auc[model][fold_count-1] = aucScore
    precision[model][fold_count-1] = prec
    recall[model][fold_count-1] = rec
                                             
    # neural
    model = "neural"
    print(model)
    # Optimal variables
    opt_nn = ["lang", "pps", "gender", "mat", "school", "optional", "admission"]
    X_train_nn = X_train[opt_nn]
    X_test_nn = X_test[opt_nn]
    
    var_num_use_nn = list(set(opt_nn).intersection(set(var_num)))
    var_cat_use_nn = list(set(opt_nn).intersection(set(var_cat)))
    
    # ONE HOT ENCODING
    enc_onehot = OneHotEncoder(handle_unknown='ignore')
    enc_onehot.fit(X_train_nn[var_cat_use_nn])
    #one hot train
    labels = enc_onehot.fit_transform(X_train_nn[var_cat_use_nn]).toarray()
    encoded_vars = enc_onehot.get_feature_names(var_cat_use_nn)
    X_train_nn = X_train_nn.drop(columns=var_cat_use_nn) # delete duplicated cat
    X_train_nn[encoded_vars] = labels
    #one hot test
    labels = enc_onehot.fit_transform(X_test_nn[var_cat_use_nn]).toarray()
    encoded_vars = enc_onehot.get_feature_names(var_cat_use_nn)
    X_test_nn = X_test_nn.drop(columns=var_cat_use_nn) # delete duplicated cat
    X_test_nn[encoded_vars] = labels

    # Fix number of variables
    #print("train",X_train_logreg.shape, "test",X_test_logreg.shape)
    dif = list(set(X_train_nn.columns) - set(X_test_nn.columns))
    #print(dif)
    if len(dif) != 0:
        X_test_nn[dif] = 0
    
    # Standarize for speed
    scaler = MinMaxScaler()
    scaler.fit(X_train_nn)
    columns = X_train_nn.columns
    X_train_nn[columns] = scaler.transform(X_train_nn)
    scaler.fit(X_test_nn)
    X_test_nn[columns] = scaler.transform(X_test_nn)

    # Layers
    n_neurons = 15
    n_layers = 3
    inputLayer = layers.Input(shape=(X_train_nn.shape[1],))
    hiddenLayer = layers.Dense(n_neurons, activation='relu',use_bias = True)(inputLayer)
    # Hidden layers
    for i in range(n_layers-1):
        hiddenLayer = layers.Dense(n_neurons, activation='relu',use_bias = True)(hiddenLayer)

    outputLayer = layers.Dense(1, activation='sigmoid',use_bias = True)(hiddenLayer)
    feedForward = models.Model(inputLayer, outputLayer)

    feedForward.compile(loss='binary_crossentropy', optimizer = 'adam',metrics=['binary_accuracy',f1])
    feedForward.fit(X_train_nn, y_train,epochs=200,batch_size=64, verbose = 0)
    y_pred = feedForward.predict(X_test_nn)
    # AUC
    auc[model][fold_count-1] = roc_auc_score(y_test, y_pred)
    y_pred = [1 if i>=0.5 else 0 for i in y_pred]
    
    # Evaluate, not including AUC
    f1Score,f1Score0,acc,aucScore,prec,rec = metrics_evaluate(
        y_test, y_pred, X_test_nn, feedForward, auc_flag=False)
    scores[model][fold_count-1] = f1Score
    scores0[model][fold_count-1] = f1Score0
    accuracy[model][fold_count-1] = acc
    #auc[model][fold_count-1] = aucScore
    precision[model][fold_count-1] = prec
    recall[model][fold_count-1] = rec
    
    fold_count+=1

# dataframe
for model in ["random","knn","svm","dectree","ranforest","gradboost","naive","logreg","neural"]:
    frame = frame.append({"Model": model,
                          "F1+ Means": round(scores[model].mean(),2),
                          "F1+ Error": round(scores[model].std(),2),
                          "F1- Means": round(scores0[model].mean(),2),
                          "F1- Error": round(scores0[model].std(),2),
                          "Acc Means": round(accuracy[model].mean(),2),
                          "Acc Error": round(accuracy[model].std(),2),
                          "AUC": round(auc[model].mean(),2),
                          "AUC Error": round(auc[model].std(),2),
                          "Prec Means": round(precision[model].mean(),2),
                          "Prec Error": round(precision[model].std(),2),
                          "Rec Means": round(recall[model].mean(),2),
                          "Rec Error": round(recall[model].std(),2)},
                        ignore_index = True)
    
frame

fold_1
random
[[88 67]
 [21 24]]
dectree
logreg
naive
knn
svm


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ranforest
gradboost
neural
fold_2
random
[[91 84]
 [25 18]]
dectree
logreg
naive
knn
svm


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ranforest
gradboost
neural
fold_3
random
[[ 69 101]
 [ 31  23]]
dectree
logreg
naive
knn
svm


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ranforest
gradboost
neural
fold_4
random
[[97 85]
 [16 29]]
dectree
logreg
naive
knn
svm


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ranforest
gradboost
neural
fold_5
random
[[87 74]
 [26 25]]
dectree
logreg
naive
knn
svm


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ranforest
gradboost
neural
fold_6
random
[[97 77]
 [28 22]]
dectree
logreg
naive
knn
svm


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ranforest
gradboost
neural
fold_7
random
[[102  76]
 [ 25  20]]
dectree
logreg
naive
knn
svm


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ranforest
gradboost
neural
fold_8
random
[[80 76]
 [18 20]]
dectree
logreg
naive
knn
svm


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ranforest
gradboost
neural
fold_9
random
[[85 95]
 [22 19]]
dectree
logreg
naive
knn
svm


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ranforest
gradboost
neural
fold_10
random
[[91 84]
 [24 26]]
dectree
logreg
naive
knn
svm


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ranforest
gradboost
neural


Unnamed: 0,Model,F1+ Means,F1+ Error,F1- Means,F1- Error,Acc Means,Acc Error,AUC,AUC Error,Prec Means,Prec Error,Rec Means,Rec Error
0,random,0.3,0.04,0.63,0.04,0.51,0.04,0.5,0.03,0.22,0.03,0.49,0.07
1,knn,0.4,0.03,0.59,0.03,0.51,0.03,0.64,0.04,0.27,0.03,0.77,0.07
2,svm,0.41,0.02,0.63,0.04,0.54,0.03,0.65,0.02,0.28,0.02,0.75,0.05
3,dectree,0.4,0.05,0.67,0.05,0.58,0.04,0.65,0.03,0.29,0.04,0.65,0.08
4,ranforest,0.4,0.04,0.68,0.03,0.58,0.03,0.66,0.03,0.29,0.04,0.65,0.05
5,gradboost,0.4,0.04,0.72,0.02,0.62,0.02,0.65,0.03,0.3,0.04,0.59,0.05
6,naive,0.38,0.02,0.57,0.04,0.49,0.03,0.66,0.02,0.26,0.02,0.74,0.03
7,logreg,0.39,0.04,0.57,0.05,0.5,0.04,0.66,0.04,0.26,0.03,0.76,0.09
8,neural,0.41,0.04,0.69,0.06,0.6,0.06,0.66,0.03,0.3,0.04,0.65,0.09


In [11]:
#frame.to_csv("results_both.csv")