In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import TextClassificationPipeline

In [2]:
dfo = pd.read_csv('data/obama_cleaned.csv')
dfo = dfo.rename(columns={'tweets' : 'text', 'class' : 'label'})
dfr = pd.read_csv('data/romney_cleaned.csv')
dfr = dfr.rename(columns={'tweets' : 'text', 'class' : 'label'})
df = pd.concat([dfo, dfr], ignore_index = True)
df.info()
pd.set_option('display.max_rows', None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11271 entries, 0 to 11270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11271 non-null  object
 1   label   11271 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 176.2+ KB


In [3]:
df = df.astype({'text' : 'string'})

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11271 entries, 0 to 11270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11271 non-null  string
 1   label   11271 non-null  int64 
dtypes: int64(1), string(1)
memory usage: 176.2 KB


# Pre-trained model: BERTweet

Fine-tuning using our data:

In [5]:
Xo0 = dfo['text']
yo0 = dfo['label'].map({1 : 2, 0 : 1, -1 : 0})
Xo, Xo_test, yo, yo_test = train_test_split(Xo0, yo0, test_size = 0.2, random_state = 27)
Xo_train, Xo_eval, yo_train, yo_eval = train_test_split(Xo, yo, test_size = 0.25, random_state = 27)
Xr0 = dfr['text']
yr0 = dfr['label'].map({1 : 2, 0 : 1, -1 : 0})
Xr, Xr_test, yr, yr_test = train_test_split(Xr0, yr0, test_size = 0.2, random_state = 27)
Xr_train, Xr_eval, yr_train, yr_eval = train_test_split(Xr, yr, test_size = 0.25, random_state = 27)


traindf_o = pd.concat([Xo_train,yo_train], axis = 1)
traindf_r = pd.concat([Xr_train,yr_train], axis = 1)
evaldf_o = pd.concat([Xo_eval,yo_eval], axis = 1)
evaldf_r = pd.concat([Xr_eval,yr_eval], axis = 1)
testdf_o = pd.concat([Xo_test,yo_test], axis = 1)
testdf_r = pd.concat([Xr_test,yr_test], axis = 1)


In [6]:
train_o = Dataset.from_pandas(traindf_o, split = 'train')
eval_o = Dataset.from_pandas(evaldf_o, split = 'eval')
test_o = Dataset.from_pandas(testdf_o, split = 'test')
train_r = Dataset.from_pandas(traindf_r, split = 'train')
eval_r = Dataset.from_pandas(evaldf_r, split = 'eval')
test_r = Dataset.from_pandas(testdf_r, split = 'test')


In [7]:
tokenizer = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
    

tokenized_train_o = train_o.map(tokenize_function, batched=True)
tokenized_eval_o = eval_o.map(tokenize_function, batched=True)
tokenized_test_o = test_o.map(tokenize_function, batched=True)
tokenized_train_r = train_r.map(tokenize_function, batched=True)
tokenized_eval_r = eval_o.map(tokenize_function, batched=True)
tokenized_test_r = test_r.map(tokenize_function, batched=True)

Map:   0%|          | 0/3374 [00:00<?, ? examples/s]

Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

Map:   0%|          | 0/3387 [00:00<?, ? examples/s]

Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

Map:   0%|          | 0/1130 [00:00<?, ? examples/s]

In [8]:
model_o = AutoModelForSequenceClassification.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis", num_labels=3)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args_o = TrainingArguments(output_dir="checkpoints/test_trainer_o", evaluation_strategy="epoch", num_train_epochs=2)
trainer_o = Trainer(
    model=model_o,
    args=training_args_o,
    train_dataset=tokenized_train_o,
    eval_dataset=tokenized_eval_o,
    compute_metrics=compute_metrics,
)

trainer_o.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.735206,0.702222
2,0.765500,0.783975,0.711111


TrainOutput(global_step=844, training_loss=0.6470462925626204, metrics={'train_runtime': 465.6619, 'train_samples_per_second': 14.491, 'train_steps_per_second': 1.812, 'total_flos': 443872335707136.0, 'train_loss': 0.6470462925626204, 'epoch': 2.0})

In [9]:
trainer_o.save_model('models/model_obama')

In [10]:
model_r = AutoModelForSequenceClassification.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis", num_labels=3)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args_r = TrainingArguments(output_dir="checkpoints/test_trainer_r", evaluation_strategy="epoch", num_train_epochs=3)
trainer_r = Trainer(
    model=model_r,
    args=training_args_r,
    train_dataset=tokenized_train_r,
    eval_dataset=tokenized_eval_r,
    compute_metrics=compute_metrics,
)

trainer_r.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.109888,0.496889
2,0.803200,1.178195,0.551111
3,0.502900,1.480066,0.6


TrainOutput(global_step=1272, training_loss=0.5905224062361807, metrics={'train_runtime': 886.4822, 'train_samples_per_second': 11.462, 'train_steps_per_second': 1.435, 'total_flos': 668373859383552.0, 'train_loss': 0.5905224062361807, 'epoch': 3.0})

In [11]:
trainer_r.save_model('models/model_romney')

# Load from checkpoints

In [None]:
model_o = AutoModelForSequenceClassification.from_pretrained("checkpoints/test_trainer_o/checkpoint-1000", num_labels=3)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args_o = TrainingArguments(output_dir="checkpoints/test_trainer_o", evaluation_strategy="epoch", num_train_epochs=5)
trainer_o = Trainer(
    model=model_o,
    args=training_args_o,
    train_dataset=tokenized_train_o,
    eval_dataset=tokenized_eval_o,
    compute_metrics=compute_metrics,
)

trainer_o.train()

In [None]:
trainer_o.save_model('models/model_obama')

In [None]:
model_r = AutoModelForSequenceClassification.from_pretrained("checkpoints/test_trainer_r/checkpoint-1000", num_labels=3)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args_r = TrainingArguments(output_dir="checkpoints/test_trainer_r", evaluation_strategy="epoch", num_train_epochs=5)
trainer_r = Trainer(
    model=model_r,
    args=training_args_r,
    train_dataset=tokenized_train_r,
    eval_dataset=tokenized_eval_r,
    compute_metrics=compute_metrics,
)

trainer_r.train()

In [15]:
trainer_r.save_model('models/model_romney')

# Load finetuned models

In [12]:
model_o = AutoModelForSequenceClassification.from_pretrained('models/model_obama')
model_r = AutoModelForSequenceClassification.from_pretrained('models/model_romney')

In [13]:

pipe_o = TextClassificationPipeline(model=model_o, tokenizer=tokenizer)
pipe_r = TextClassificationPipeline(model=model_r, tokenizer=tokenizer)


In [14]:
pos = list()
neg = list()
neu = list()
pred_o = pd.DataFrame()
for t in test_o['text']:
    pred = pipe_o(t, top_k=None)
    for l in pred:
        if l['label'] == 'POS':
            pos.append(l['score'])
            
        elif l['label'] == 'NEG':
            neg.append(l['score'])
          
        else: 
            neu.append(l['score'])
            

pred_o['pos'] = pos
pred_o['neg'] = neg
pred_o['neu'] = neu


In [15]:
pred_o['class'] = list(map({0 : -1, 1 : 0, 2 : 1}.get, test_o['label']))

In [16]:
pos = list()
neg = list()
neu = list()
pred_r = pd.DataFrame()
for t in test_r['text']:
    pred = pipe_r(t, top_k=None)
    for l in pred:
        if l['label'] == 'POS':
            pos.append(l['score'])
        elif l['label'] == 'NEG':
            neg.append(l['score'])
        else: 
            neu.append(l['score'])

pred_r['pos'] = pos
pred_r['neg'] = neg
pred_r['neu'] = neu


In [17]:
pred_r['class'] = list(map({0 : -1, 1 : 0, 2 : 1}.get, test_r['label']))

# Predict label using maximum probability


In [18]:
def pred_label(df):
    preds = list()
    for idx, row in df.iterrows():
        if row['pos'] >= row['neu'] and row['pos'] >= row['neg']:
            preds.append(1)
        elif row['neg'] >= row['neu'] and row['neg'] > row['pos']:
            preds.append(-1)
        elif row['neu'] > row['pos'] and row['neu'] > row['neg']:
            preds.append(0)
    df['pred'] = preds
    acc = accuracy_score(df['class'], df['pred'])
    prec = precision_score(df['class'], df['pred'], average = None, zero_division = np.nan)
    rec = recall_score(df['class'], df['pred'], average = None)
    f1 = f1_score(df['class'], df['pred'], average = None)
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1:", f1)
    return df

In [19]:
predicted_labels_o = pred_label(pred_o)


Accuracy: 0.7288888888888889
Precision: [0.72093023 0.66917293 0.80825959]
Recall: [0.73614776 0.68286445 0.77183099]
F1: [0.72845953 0.67594937 0.78962536]


In [20]:
predicted_labels_r = pred_label(pred_r)

Accuracy: 0.7176991150442478
Precision: [0.74516129 0.67128028 0.70135747]
Recall: [0.83544304 0.52291105 0.75242718]
F1: [0.78772379 0.58787879 0.72599532]


In [21]:
predicted_labels_o['pred'].value_counts()

 0    399
-1    387
 1    339
Name: pred, dtype: int64

In [22]:
predicted_labels_r['pred'].value_counts()

-1    620
 0    289
 1    221
Name: pred, dtype: int64

# Predict label using ML models

In [26]:
def test_model(model, parameters, X, y, n_splits):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state = 27)
    avg_accuracies = list()
    avg_precisions = list()
    avg_recalls = list()
    avg_f1s = list()
    confs = list()
    for conf in ParameterGrid(parameters):
        print('Testing', conf)
        accuracies = list()
        precisions = list()
        recalls = list()
        f1s = list()
        i = 1
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            try:
                model.set_params(**conf)
                model.fit(X_train, y_train)
            except:
                print('Skipped', conf)
                break
            print('\tFold', i, 'of', n_splits)
            y_pred = model.predict(X_test)
            accuracies.append(accuracy_score(y_test, y_pred))
            precisions.append(precision_score(y_test, y_pred, average=None, zero_division = np.nan))
            recalls.append(recall_score(y_test, y_pred, average=None, zero_division = np.nan))
            f1s.append(f1_score(y_test, y_pred, average=None, zero_division = np.nan))
            if i == 1:
                confs.append(conf)
            i = i + 1
            
    
        if len(accuracies) != 0:  
            avg_accuracies.append(sum(accuracies)/len(accuracies))
            avg_precisions.append((sum(precisions)/len(precisions)) if len(precisions) > 0 else np.nan)
            avg_recalls.append(sum(recalls)/len(recalls) if len(recalls) > 0 else np.nan)
            avg_f1s.append(sum(f1s)/len(f1s) if len(f1s) > 0 else np.nan)
        
    results = {'Parameters' : confs,
              'Accuracy' : avg_accuracies,
              'Precision' : avg_precisions,
              'Recall' : avg_recalls,
              'F1' : avg_f1s}
    
    return pd.DataFrame.from_dict(results)
    
        
        

In [27]:
Xo = predicted_labels_o[['pos', 'neg', 'neu']]
yo = predicted_labels_o['class']
Xo_train, Xo_test, yo_train, yo_test = train_test_split(Xo, yo, test_size = 0.2)

Xr = predicted_labels_r[['pos', 'neg', 'neu']]
yr = predicted_labels_r['class']
Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size = 0.2)

In [28]:
params_svm = {'C' : (0.1, 1, 10, 100),
             'kernel' : ('rbf', 'poly', 'linear', 'sigmoid'),
             'gamma' : ('scale', 'auto')}
svm = SVC()
svm_results_o = test_model(svm, params_svm, Xo_train, yo_train, 4)

Testing {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'scale', 'kernel': 'poly'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'auto', 'kernel': 'poly'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'auto', 'kernel': 'linear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'auto', 'kernel': 'sigmoid'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'gamma': 'scale', '

In [29]:
svm_results_o

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1
0,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}",0.716667,"[0.7292874965606811, 0.6432898496851985, 0.798...","[0.7175261084710598, 0.6948214285714286, 0.738...","[0.7224676750506649, 0.6660443189899791, 0.766..."
1,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'poly'}",0.715556,"[0.7321517236151383, 0.6457036079577385, 0.790...","[0.7152230960090836, 0.6887797619047619, 0.746...","[0.7217016392355758, 0.6629211235021967, 0.767..."
2,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}",0.712222,"[0.7357288819407573, 0.6272415979708307, 0.803...","[0.7230824817415926, 0.7012797619047619, 0.711...","[0.7284650691658211, 0.6600307295254968, 0.754..."
3,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}",0.718889,"[0.7322322143750715, 0.6406484257871063, 0.810...","[0.7415458317563115, 0.6999702380952381, 0.711...","[0.7359194618299592, 0.6670389624887882, 0.757..."
4,"{'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}",0.713333,"[0.7325513492695874, 0.6328598778274535, 0.802...","[0.7201755049974066, 0.7012797619047619, 0.717...","[0.7254489338679205, 0.6631606121269179, 0.756..."
5,"{'C': 0.1, 'gamma': 'auto', 'kernel': 'poly'}",0.37,"[nan, nan, nan]","[0.6633333333333333, 0.4642857142857143, 0.0]","[nan, nan, nan]"
6,"{'C': 0.1, 'gamma': 'auto', 'kernel': 'linear'}",0.712222,"[0.7357288819407573, 0.6272415979708307, 0.803...","[0.7230824817415926, 0.7012797619047619, 0.711...","[0.7284650691658211, 0.6600307295254968, 0.754..."
7,"{'C': 0.1, 'gamma': 'auto', 'kernel': 'sigmoid'}",0.714444,"[0.7290932159039265, 0.6346229795301905, 0.805...","[0.726415815074926, 0.6977083333333334, 0.7179...","[0.7271208249147072, 0.6628924162257496, 0.758..."
8,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}",0.717778,"[0.7144401096202659, 0.6658473980309423, 0.776...","[0.7444202167178323, 0.639077380952381, 0.7727...","[0.7281947229278358, 0.6505131257373549, 0.774..."
9,"{'C': 1, 'gamma': 'scale', 'kernel': 'poly'}",0.723333,"[0.7292381531511967, 0.6688197601234511, 0.780...","[0.7426649564741999, 0.6622916666666667, 0.767...","[0.7343448820206048, 0.662192267242816, 0.7726..."


In [30]:
svm_r = SVC()
svm_results_r = test_model(svm_r, params_svm, Xr_train, yr_train, 4)

Testing {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'scale', 'kernel': 'poly'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'auto', 'kernel': 'poly'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'auto', 'kernel': 'linear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'gamma': 'auto', 'kernel': 'sigmoid'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'gamma': 'scale', '

In [44]:
svm_results_r

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1
0,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}",0.706858,"[0.7368487100021192, 0.655128205128205, 0.6924...","[0.8323921384669049, 0.5136363636363637, 0.718...","[0.7812955756196317, 0.5741868022311165, 0.704..."
1,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'poly'}",0.705752,"[0.7383763959010375, 0.635770013568521, 0.7113...","[0.8323921384669049, 0.5263419913419913, 0.688...","[0.7821973666067713, 0.5749592999593, 0.699168..."
2,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}",0.709071,"[0.7382635722068435, 0.6603183119263909, 0.691...","[0.8323921384669049, 0.5136363636363637, 0.729...","[0.7821320913497644, 0.5763392099507207, 0.709..."
3,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}",0.711283,"[0.7422558316404394, 0.6656161172350364, 0.688...","[0.8348328928060237, 0.517965367965368, 0.7298...","[0.7852165925808968, 0.5802843555900621, 0.707..."
4,"{'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}",0.706858,"[0.7368487100021192, 0.6561149883682286, 0.690...","[0.8323921384669049, 0.5103030303030303, 0.723...","[0.7812955756196317, 0.5724872646733112, 0.705..."
5,"{'C': 0.1, 'gamma': 'auto', 'kernel': 'poly'}",0.487832,"[0.48783185840707965, nan, nan]","[1.0, 0.0, 0.0]","[0.6557179886328482, nan, nan]"
6,"{'C': 0.1, 'gamma': 'auto', 'kernel': 'linear'}",0.709071,"[0.7382635722068435, 0.6603183119263909, 0.691...","[0.8323921384669049, 0.5136363636363637, 0.729...","[0.7821320913497644, 0.5763392099507207, 0.709..."
7,"{'C': 0.1, 'gamma': 'auto', 'kernel': 'sigmoid'}",0.709071,"[0.737348021296885, 0.6615794692425455, 0.6918...","[0.8346443907191571, 0.5103030303030303, 0.729...","[0.7826017407189003, 0.5746611777167895, 0.709..."
8,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}",0.706858,"[0.7333760852848562, 0.660413623967437, 0.6924...","[0.8369808393172878, 0.5065151515151515, 0.718...","[0.7813832613411682, 0.5718500660301251, 0.704..."
9,"{'C': 1, 'gamma': 'scale', 'kernel': 'poly'}",0.706858,"[0.7332735920548414, 0.660066913013083, 0.6924...","[0.8368765335763, 0.5065151515151515, 0.718062...","[0.7813471993144823, 0.5719729054289993, 0.704..."


In [31]:
params_rf = {'n_estimators' : (50, 100, 150),
            'criterion' : ('entropy', 'gini'),
            'max_features' : (None, 'sqrt')}
rf = RandomForestClassifier()
rf_results_o = test_model(rf, params_rf, Xo_train, yo_train, 4)

Testing {'criterion': 'entropy', 'max_features': None, 'n_estimators': 50}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': None, 'n_estimators': 100}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': None, 'n_estimators': 150}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 50}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 100}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 150}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'gini', 'max_features': None, 'n_estimators': 50}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'gini', 'max_features': None, 'n_estimators': 100}
	Fold 1 of 4
	Fold 2 of 4
	Fold

In [32]:
rf_results_o

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1
0,"{'criterion': 'entropy', 'max_features': None,...",0.644444,"[0.6698928886428887, 0.588768814476577, 0.6783...","[0.6850643424870684, 0.576875, 0.6700662878787...","[0.6766899214612189, 0.5825007104290991, 0.673..."
1,"{'criterion': 'entropy', 'max_features': None,...",0.656667,"[0.6744050033806627, 0.5899498182392919, 0.713...","[0.6943004331553052, 0.5936011904761904, 0.683...","[0.6838106973347937, 0.5910893529227443, 0.697..."
2,"{'criterion': 'entropy', 'max_features': None,...",0.638889,"[0.6608131286298613, 0.5737938596491228, 0.686...","[0.676085832036671, 0.5733333333333333, 0.6683...","[0.6680583446331692, 0.572975650569981, 0.6771..."
3,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.655556,"[0.6668915099187994, 0.6032261870623938, 0.703...","[0.6928915569760433, 0.5875892857142857, 0.686...","[0.6781009344700405, 0.5933218099339673, 0.694..."
4,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.656667,"[0.6669767732267733, 0.610122076331507, 0.6932...","[0.6944886945063572, 0.5842559523809525, 0.692...","[0.6796640726202052, 0.5963125236287876, 0.691..."
5,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.657778,"[0.6692234350743805, 0.6028242784669338, 0.705...","[0.6908977809551845, 0.5933928571428572, 0.689...","[0.6791485996913263, 0.5973978086736998, 0.696..."
6,"{'criterion': 'gini', 'max_features': None, 'n...",0.647778,"[0.6676470588235294, 0.5882057856943701, 0.689...","[0.6721573657428823, 0.5802678571428571, 0.691...","[0.6692994397799032, 0.5838897127717003, 0.690..."
7,"{'criterion': 'gini', 'max_features': None, 'n...",0.652222,"[0.6688290357313506, 0.5951774398323884, 0.696...","[0.6850643424870684, 0.5897916666666667, 0.680...","[0.6765613430491256, 0.5919061807425912, 0.688..."
8,"{'criterion': 'gini', 'max_features': None, 'n...",0.644444,"[0.6661260186841582, 0.5827203425229741, 0.690...","[0.6754213802094285, 0.5884821428571428, 0.671...","[0.6701515151515152, 0.5848452969208856, 0.680..."
9,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.645556,"[0.6479546338689888, 0.5960365853658537, 0.695...","[0.691393456411119, 0.5630952380952381, 0.6836...","[0.6684878962272433, 0.5783828556192375, 0.689..."


In [33]:
rf_r = RandomForestClassifier()
rf_results_r = test_model(rf_r, params_rf, Xr_train, yr_train, 4)

Testing {'criterion': 'entropy', 'max_features': None, 'n_estimators': 50}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': None, 'n_estimators': 100}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': None, 'n_estimators': 150}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 50}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 100}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 150}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'gini', 'max_features': None, 'n_estimators': 50}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'gini', 'max_features': None, 'n_estimators': 100}
	Fold 1 of 4
	Fold 2 of 4
	Fold

In [34]:
rf_results_r

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1
0,"{'criterion': 'entropy', 'max_features': None,...",0.64823,"[0.6993019169442029, 0.5769397510229638, 0.627...","[0.7665440181503266, 0.507965367965368, 0.5935...","[0.7287268290550252, 0.5365429254138933, 0.603..."
1,"{'criterion': 'entropy', 'max_features': None,...",0.652655,"[0.7088564561344306, 0.5767560664112388, 0.640...","[0.7693416672680691, 0.5167965367965368, 0.593...","[0.7340462090488735, 0.5408207295721544, 0.609..."
2,"{'criterion': 'entropy', 'max_features': None,...",0.65708,"[0.7164175792513288, 0.5840703748542732, 0.635...","[0.762352430868786, 0.5336580086580087, 0.6031...","[0.7345557238681586, 0.552944507564733, 0.6152..."
3,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.654867,"[0.7096816028192232, 0.5762794260839682, 0.633...","[0.7733007749070833, 0.518961038961039, 0.5871...","[0.7378252467152872, 0.5427309136587548, 0.605..."
4,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.664823,"[0.721996095737198, 0.5857289227166277, 0.6413...","[0.782770984435704, 0.5322077922077921, 0.5937...","[0.7489290010191658, 0.554555734629146, 0.6139..."
5,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.655973,"[0.711087004217352, 0.5836257012395734, 0.6348...","[0.7667928484225213, 0.5228354978354979, 0.605...","[0.7353974423173503, 0.5471840027718353, 0.616..."
6,"{'criterion': 'gini', 'max_features': None, 'n...",0.660398,"[0.710973667442442, 0.5929204040675975, 0.6322...","[0.7985606183620202, 0.493008658008658, 0.5996...","[0.7495656658703047, 0.5324544926787724, 0.612..."
7,"{'criterion': 'gini', 'max_features': None, 'n...",0.653761,"[0.7084939957265656, 0.5749923725922617, 0.628...","[0.7827307656454852, 0.5068831168831169, 0.579...","[0.7413589466422614, 0.5356259714235461, 0.599..."
8,"{'criterion': 'gini', 'max_features': None, 'n...",0.650442,"[0.7088002473589098, 0.5694743739811395, 0.635...","[0.7668770447683999, 0.5133766233766234, 0.591...","[0.7337943930426167, 0.5354384395694065, 0.609..."
9,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.643805,"[0.7086472868223307, 0.5519606167194957, 0.620...","[0.76214381938681, 0.5022943722943722, 0.58748...","[0.7315246138822192, 0.5224111452391399, 0.599..."


In [35]:
params_knn = {'n_neighbors' : (1, 3, 5, 7, 9),
             'metric' : ('minkowski', 'euclidean', 'manhattan', 'cosine')}
knn = KNeighborsClassifier()
knn_results_o = test_model(knn, params_knn, Xo_train, yo_train, 4)

Testing {'metric': 'minkowski', 'n_neighbors': 1}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'minkowski', 'n_neighbors': 3}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'minkowski', 'n_neighbors': 5}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'minkowski', 'n_neighbors': 7}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'minkowski', 'n_neighbors': 9}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 1}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 3}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 5}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 7}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 9}
	Fold 1 of 4
	Fold 2 of 4
	Fold 

In [36]:
knn_results_o

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1
0,"{'metric': 'minkowski', 'n_neighbors': 1}",0.618889,"[0.6528291087014632, 0.5706176779546345, 0.630...","[0.6256787501577021, 0.5657738095238096, 0.661...","[0.6372624365037659, 0.5668052938216095, 0.645..."
1,"{'metric': 'minkowski', 'n_neighbors': 3}",0.66,"[0.6384042516768778, 0.6231909430438842, 0.729...","[0.7386169168874497, 0.5675, 0.6707291666666666]","[0.6838005006087506, 0.592954651292571, 0.6983..."
2,"{'metric': 'minkowski', 'n_neighbors': 5}",0.68,"[0.6727014784102939, 0.6059030922210371, 0.775...","[0.7527449289989767, 0.5842559523809525, 0.699...","[0.7095715474782983, 0.5942313334629621, 0.735..."
3,"{'metric': 'minkowski', 'n_neighbors': 7}",0.676667,"[0.6640399929846484, 0.6073768573768573, 0.765...","[0.7407794692796165, 0.5581547619047619, 0.730...","[0.6991941022568557, 0.5810153120379857, 0.747..."
4,"{'metric': 'minkowski', 'n_neighbors': 9}",0.69,"[0.6746202302169, 0.6255956495406624, 0.770805...","[0.7254913298849124, 0.5771130952380952, 0.769...","[0.6983783249395534, 0.6001657883179623, 0.769..."
5,"{'metric': 'euclidean', 'n_neighbors': 1}",0.618889,"[0.6528291087014632, 0.5706176779546345, 0.630...","[0.6256787501577021, 0.5657738095238096, 0.661...","[0.6372624365037659, 0.5668052938216095, 0.645..."
6,"{'metric': 'euclidean', 'n_neighbors': 3}",0.66,"[0.6384042516768778, 0.6231909430438842, 0.729...","[0.7386169168874497, 0.5675, 0.6707291666666666]","[0.6838005006087506, 0.592954651292571, 0.6983..."
7,"{'metric': 'euclidean', 'n_neighbors': 5}",0.68,"[0.6727014784102939, 0.6059030922210371, 0.775...","[0.7527449289989767, 0.5842559523809525, 0.699...","[0.7095715474782983, 0.5942313334629621, 0.735..."
8,"{'metric': 'euclidean', 'n_neighbors': 7}",0.676667,"[0.6640399929846484, 0.6073768573768573, 0.765...","[0.7407794692796165, 0.5581547619047619, 0.730...","[0.6991941022568557, 0.5810153120379857, 0.747..."
9,"{'metric': 'euclidean', 'n_neighbors': 9}",0.69,"[0.6746202302169, 0.6255956495406624, 0.770805...","[0.7254913298849124, 0.5771130952380952, 0.769...","[0.6983783249395534, 0.6001657883179623, 0.769..."


In [37]:
knn_r = KNeighborsClassifier()
knn_results_r = test_model(knn_r, params_knn, Xr_train, yr_train, 4)

Testing {'metric': 'minkowski', 'n_neighbors': 1}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'minkowski', 'n_neighbors': 3}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'minkowski', 'n_neighbors': 5}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'minkowski', 'n_neighbors': 7}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'minkowski', 'n_neighbors': 9}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 1}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 3}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 5}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 7}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 9}
	Fold 1 of 4
	Fold 2 of 4
	Fold 

In [38]:
knn_results_r

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1
0,"{'metric': 'minkowski', 'n_neighbors': 1}",0.612832,"[0.6821273357032176, 0.5231722285164967, 0.583...","[0.7006921390683073, 0.5053246753246754, 0.574...","[0.6904273721933645, 0.5129653310599602, 0.577..."
1,"{'metric': 'minkowski', 'n_neighbors': 3}",0.637168,"[0.6768200270794986, 0.5500853466386555, 0.670...","[0.7802259243555972, 0.4838744588744589, 0.533...","[0.7235498366013073, 0.5136910889909799, 0.586..."
2,"{'metric': 'minkowski', 'n_neighbors': 5}",0.64823,"[0.6878831086391392, 0.5484318555008211, 0.690...","[0.7962882567146585, 0.4645021645021645, 0.583...","[0.7365405048574716, 0.50147115562443, 0.62999..."
3,"{'metric': 'minkowski', 'n_neighbors': 7}",0.662611,"[0.706331511073933, 0.5732108307371904, 0.6790...","[0.8008329800093819, 0.4924675324675324, 0.604...","[0.7486416164116512, 0.5264766791987939, 0.636..."
4,"{'metric': 'minkowski', 'n_neighbors': 9}",0.660398,"[0.6930362400714685, 0.5824703120684893, 0.685...","[0.8053575939089959, 0.46913419913419907, 0.62...","[0.7435230281517913, 0.5139976616587465, 0.648..."
5,"{'metric': 'euclidean', 'n_neighbors': 1}",0.612832,"[0.6821273357032176, 0.5231722285164967, 0.583...","[0.7006921390683073, 0.5053246753246754, 0.574...","[0.6904273721933645, 0.5129653310599602, 0.577..."
6,"{'metric': 'euclidean', 'n_neighbors': 3}",0.637168,"[0.6768200270794986, 0.5500853466386555, 0.670...","[0.7802259243555972, 0.4838744588744589, 0.533...","[0.7235498366013073, 0.5136910889909799, 0.586..."
7,"{'metric': 'euclidean', 'n_neighbors': 5}",0.64823,"[0.6878831086391392, 0.5484318555008211, 0.690...","[0.7962882567146585, 0.4645021645021645, 0.583...","[0.7365405048574716, 0.50147115562443, 0.62999..."
8,"{'metric': 'euclidean', 'n_neighbors': 7}",0.662611,"[0.706331511073933, 0.5732108307371904, 0.6790...","[0.8008329800093819, 0.4924675324675324, 0.604...","[0.7486416164116512, 0.5264766791987939, 0.636..."
9,"{'metric': 'euclidean', 'n_neighbors': 9}",0.660398,"[0.6930362400714685, 0.5824703120684893, 0.685...","[0.8053575939089959, 0.46913419913419907, 0.62...","[0.7435230281517913, 0.5139976616587465, 0.648..."


In [39]:
params_lr = {'penalty' : ('l1', 'l2', 'elasticnet', None),
            'C' : (0.1, 1, 10, 100),
            'solver' : ('lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'),
            'max_iter' : [500]}

lr = LogisticRegression()
lr_results_o = test_model(lr, params_lr, Xo_train, yo_train, 4)

Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Skipped {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Skipped {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Skipped {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Skipped {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'max_i

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.96794e-18): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.05928e-16): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=6.49198e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remed

	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'max_iter': 500, 'penalty': None, 'solver': 'newton-cholesky'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'max_iter': 500, 'penalty': None, 'solver': 'sag'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'max_iter': 500, 'penalty': None, 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Skipped {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Skipped {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Skipped {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Testing 

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.96794e-18): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.05928e-16): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=6.49198e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remed

	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'max_iter': 500, 'penalty': None, 'solver': 'sag'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'max_iter': 500, 'penalty': None, 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 10, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Skipped {'C': 10, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Testing {'C': 10, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 10, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Skipped {'C': 10, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Testing {'C': 10, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Skipped {'C': 10, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Testing {'C': 10, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Skipped {'C': 10, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Testing {'C': 

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.96794e-18): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.05928e-16): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=6.49198e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remed

	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 10, 'max_iter': 500, 'penalty': None, 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Skipped {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Skipped {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Skipped {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Skipped {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4




	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'newton-cg'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'newton-cholesky'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'sag'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
Skipped {'C': 100, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
Testing {'C': 100, 'max_iter': 500, 'penalty': 'elastic

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.96794e-18): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.05928e-16): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=6.49198e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remed

In [40]:
lr_results_o

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1
0,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l1', '...",0.723333,"[0.7196672346786471, 0.6655947614280948, 0.794...","[0.7429830242370719, 0.6680654761904762, 0.759...","[0.7306609557443038, 0.6656631526782898, 0.775..."
1,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l1', '...",0.716667,"[0.7203072827534791, 0.6478278665778666, 0.798...","[0.726415815074926, 0.6814583333333333, 0.7420...","[0.722596274855273, 0.6624057773773294, 0.7685..."
2,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.716667,"[0.7219857902723501, 0.6471741347071817, 0.798...","[0.7230824817415926, 0.6845833333333333, 0.742...","[0.721609477124183, 0.6633891470477502, 0.7685..."
3,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.721111,"[0.7196672346786471, 0.6609567901234568, 0.792...","[0.7429830242370719, 0.6680654761904762, 0.752...","[0.7306609557443038, 0.6634241224238318, 0.771..."
4,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.716667,"[0.7219857902723501, 0.6471741347071817, 0.798...","[0.7230824817415926, 0.6845833333333333, 0.742...","[0.721609477124183, 0.6633891470477502, 0.7685..."
5,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.72,"[0.7196672346786471, 0.657461326788636, 0.7946...","[0.7429830242370719, 0.6680654761904762, 0.748...","[0.7306609557443038, 0.6613897545928141, 0.770..."
6,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.716667,"[0.7219857902723501, 0.6471741347071817, 0.798...","[0.7230824817415926, 0.6845833333333333, 0.742...","[0.721609477124183, 0.6633891470477502, 0.7685..."
7,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.716667,"[0.7219857902723501, 0.6471741347071817, 0.798...","[0.7230824817415926, 0.6845833333333333, 0.742...","[0.721609477124183, 0.6633891470477502, 0.7685..."
8,"{'C': 0.1, 'max_iter': 500, 'penalty': None, '...",0.715556,"[0.7219857902723501, 0.6462637705615234, 0.795...","[0.7230824817415926, 0.6814583333333333, 0.742...","[0.721609477124183, 0.6614305365525543, 0.7671..."
9,"{'C': 0.1, 'max_iter': 500, 'penalty': None, '...",0.715556,"[0.7219857902723501, 0.6462637705615234, 0.795...","[0.7230824817415926, 0.6814583333333333, 0.742...","[0.721609477124183, 0.6614305365525543, 0.7671..."


In [41]:
lr_r = LogisticRegression()
lr_results_r = test_model(lr_r, params_lr, Xr_train, yr_train, 4)

Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Skipped {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Skipped {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Skipped {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Skipped {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'max_i

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=5.86668e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=4.40618e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=8.07926e-17): result may not be accurate.


Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Skipped {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Skipped {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Skipped {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'newton-cg'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l2', 'solver': 

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=5.86668e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=4.40618e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=8.07926e-17): result may not be accurate.


	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'newton-cg'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'newton-cholesky'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'sag'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
Skipped {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
Testing {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'so

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=5.86668e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=4.40618e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=8.07926e-17): result may not be accurate.


	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Skipped {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Skipped {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Skipped {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'}
	Fold 1 of 4
	Fold 2 of 4




	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'newton-cg'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'newton-cholesky'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'sag'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
Skipped {'C': 100, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
Testing {'C': 100, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'liblinear'}
Skipped {'C': 100, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'liblinear'}
Testing {'C': 100, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'newton-cg'}
Skipped {'C': 100, 'max_iter': 500, 'penalty': 'elastic

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=5.86668e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=4.40618e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=8.07926e-17): result may not be accurate.


In [42]:
lr_results_r 

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1
0,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l1', '...",0.707965,"[0.7347092290096483, 0.6621836677741881, 0.692...","[0.8369808393172878, 0.5098484848484848, 0.718...","[0.7821596102450363, 0.5746571807600137, 0.704..."
1,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l1', '...",0.705752,"[0.7347092290096483, 0.6559548337151654, 0.688...","[0.8369808393172878, 0.5098484848484848, 0.706...","[0.7821596102450363, 0.5725212967644085, 0.696..."
2,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.706858,"[0.737348021296885, 0.6547321972002433, 0.6903...","[0.8346443907191571, 0.5136363636363637, 0.711...","[0.7826017407189003, 0.5742201712052286, 0.699..."
3,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.706858,"[0.7347092290096483, 0.6590994878032157, 0.690...","[0.8369808393172878, 0.5098484848484848, 0.711...","[0.7821596102450363, 0.5736114130434783, 0.699..."
4,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.706858,"[0.737348021296885, 0.6547321972002433, 0.6903...","[0.8346443907191571, 0.5136363636363637, 0.711...","[0.7826017407189003, 0.5742201712052286, 0.699..."
5,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.706858,"[0.7347092290096483, 0.6590994878032157, 0.690...","[0.8369808393172878, 0.5098484848484848, 0.711...","[0.7821596102450363, 0.5736114130434783, 0.699..."
6,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.706858,"[0.737348021296885, 0.6547321972002433, 0.6903...","[0.8346443907191571, 0.5136363636363637, 0.711...","[0.7826017407189003, 0.5742201712052286, 0.699..."
7,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.706858,"[0.737348021296885, 0.6547321972002433, 0.6903...","[0.8346443907191571, 0.5136363636363637, 0.711...","[0.7826017407189003, 0.5742201712052286, 0.699..."
8,"{'C': 0.1, 'max_iter': 500, 'penalty': None, '...",0.706858,"[0.7385807457916416, 0.6536569283830391, 0.690...","[0.8324122478620142, 0.5174242424242423, 0.711...","[0.7822208626556189, 0.5757702704115777, 0.699..."
9,"{'C': 0.1, 'max_iter': 500, 'penalty': None, '...",0.706858,"[0.7385807457916416, 0.6536569283830391, 0.690...","[0.8324122478620142, 0.5174242424242423, 0.711...","[0.7822208626556189, 0.5757702704115777, 0.699..."
