In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import TextClassificationPipeline

In [2]:
dfo = pd.read_csv('data/obama_cleaned.csv')
dfo = dfo.rename(columns={'tweets' : 'text', 'class' : 'label'})
dfr = pd.read_csv('data/romney_cleaned.csv')
dfr = dfr.rename(columns={'tweets' : 'text', 'class' : 'label'})
df = pd.concat([dfo, dfr], ignore_index = True)
df.info()
pd.set_option('display.max_rows', None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11271 entries, 0 to 11270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11271 non-null  object
 1   label   11271 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 176.2+ KB


In [3]:
df = df.astype({'text' : 'string'})

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11271 entries, 0 to 11270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11271 non-null  string
 1   label   11271 non-null  int64 
dtypes: int64(1), string(1)
memory usage: 176.2 KB


# Pre-trained model: BERTweet

Fine-tuning using our data:

In [5]:
X = dfo['text']
y = dfo['label'].map({1 : 2, 0 : 1, -1 : 0})
X, X_test, y, y_test = train_test_split(X, y, test_size = 0.2)
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size = 0.25)


traindf = pd.concat([X_train,y_train], axis = 1)

evaldf = pd.concat([X_eval,y_eval], axis = 1)

testdf = pd.concat([X_test,y_test], axis = 1)



In [6]:
train = Dataset.from_pandas(traindf, split = 'train')
val = Dataset.from_pandas(evaldf, split = 'eval')
test = Dataset.from_pandas(testdf, split = 'test')


In [7]:
tokenizer = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
    

tokenized_train = train.map(tokenize_function, batched=True)
tokenized_eval = val.map(tokenize_function, batched=True)
tokenized_test = test.map(tokenize_function, batched=True)


Map:   0%|          | 0/3374 [00:00<?, ? examples/s]

Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

In [10]:
model = AutoModelForSequenceClassification.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis", num_labels=3)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(output_dir="checkpoints/test_trainer_unique", evaluation_strategy="epoch", num_train_epochs=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.766345,0.679111
2,0.774800,0.912529,0.695111
3,0.425700,1.171285,0.711111


TrainOutput(global_step=1266, training_loss=0.5343863139220324, metrics={'train_runtime': 491.4506, 'train_samples_per_second': 20.596, 'train_steps_per_second': 2.576, 'total_flos': 665808503560704.0, 'train_loss': 0.5343863139220324, 'epoch': 3.0})

In [11]:
trainer.save_model('models/model_unique')

# Load from checkpoints

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("checkpoints/test_trainer_unique", num_labels=3)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(output_dir="test_trainer_unique", evaluation_strategy="epoch", num_train_epochs=5)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

trainer.train()

# Load finetuned models

In [13]:
model = AutoModelForSequenceClassification.from_pretrained('models/model_unique')

In [14]:
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)

In [19]:
pos = list()
neg = list()
neu = list()
pred = pd.DataFrame()
for t in testdf['text']:
    prediction = pipe(t, top_k=None)
    for l in prediction:
        if l['label'] == 'POS':
            pos.append(l['score'])
        elif l['label'] == 'NEG':
            neg.append(l['score'])
        else: 
            neu.append(l['score'])
            
pred['pos'] = pos
pred['neg'] = neg
pred['neu'] = neu

In [20]:
pred['class'] = list(y_test.map({0 : -1, 1 : 0, 2 : 1}))

# Predict label using maximum probability


In [25]:
def pred_label(df):
    preds = list()
    for idx, row in df.iterrows():
        if row['pos'] >= row['neu'] and row['pos'] >= row['neg']:
            preds.append(1)
        elif row['neg'] >= row['neu'] and row['neg'] > row['pos']:
            preds.append(-1)
        elif row['neu'] > row['pos'] and row['neu'] > row['neg']:
            preds.append(0)
    df['pred'] = preds
    acc = accuracy_score(df['class'], df['pred'])
    prec = precision_score(df['class'], df['pred'], average = None, zero_division = np.nan)
    rec = recall_score(df['class'], df['pred'], average = None)
    f1 = f1_score(df['class'], df['pred'], average = None)
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1:", f1)
    return df

In [27]:
predicted_labels = pred_label(pred)


Accuracy: 0.72
Precision: [0.72817955 0.66240409 0.77777778]
Recall: [0.74489796 0.66925065 0.74855491]
F1: [0.73644388 0.66580977 0.7628866 ]


In [28]:
predicted_labels['pred'].value_counts()

-1    401
 0    391
 1    333
Name: pred, dtype: int64

# Predict label using ML models

In [29]:
def test_model(model, parameters, X, y, n_splits):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state = 27)
    avg_accuracies = list()
    avg_precisions = list()
    avg_recalls = list()
    avg_f1s = list()
    confs = list()
    for conf in ParameterGrid(parameters):
        print('Testing', conf)
        accuracies = list()
        precisions = list()
        recalls = list()
        f1s = list()
        i = 1
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            try:
                model.set_params(**conf)
                model.fit(X_train, y_train)
            except:
                print('Skipped', conf)
                break
            print('\tFold', i, 'of', n_splits)
            y_pred = model.predict(X_test)
            accuracies.append(accuracy_score(y_test, y_pred))
            precisions.append(precision_score(y_test, y_pred, average=None, zero_division = np.nan))
            recalls.append(recall_score(y_test, y_pred, average=None, zero_division = np.nan))
            f1s.append(f1_score(y_test, y_pred, average=None, zero_division = np.nan))
            if i == 1:
                confs.append(conf)
            i = i + 1
            
    
        if len(accuracies) != 0:  
            avg_accuracies.append(sum(accuracies)/len(accuracies))
            avg_precisions.append((sum(precisions)/len(precisions)) if len(precisions) > 0 else np.nan)
            avg_recalls.append(sum(recalls)/len(recalls) if len(recalls) > 0 else np.nan)
            avg_f1s.append(sum(f1s)/len(f1s) if len(f1s) > 0 else np.nan)
        
    results = {'Parameters' : confs,
              'Accuracy' : avg_accuracies,
              'Precision' : avg_precisions,
              'Recall' : avg_recalls,
              'F1' : avg_f1s}
    
    return pd.DataFrame.from_dict(results)
    
        
        

In [30]:
X = predicted_labels[['pos', 'neg', 'neu']]
y = predicted_labels['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)


In [31]:
params_svm = {'C' : (0.1, 1, 10, 100),
             'kernel' : ('rbf', 'poly', 'linear'),
             'degree' : (3, 5, 7),
             'gamma' : ('scale', 'auto')}
svm = SVC()
svm_results = test_model(svm, params_svm, X_train, y_train, 4)

Testing {'C': 0.1, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'degree': 3, 'gamma': 'scale', 'kernel': 'linear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'degree': 3, 'gamma': 'auto', 'kernel': 'rbf'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'degree': 3, 'gamma': 'auto', 'kernel': 'linear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'degree': 5, 'gamma': 'scale', 'kernel': 'rbf'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'degree': 5, 'gamma': 'scale', 'kernel': 'poly'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'degree': 5, 'gamma': '

	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4


In [32]:
svm_results

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1
0,"{'C': 0.1, 'degree': 3, 'gamma': 'scale', 'ker...",0.714444,"[0.7179585058661145, 0.6733597145453516, 0.765...","[0.7389272978050001, 0.6746277371277372, 0.735...","[0.7271664035736014, 0.671676282190197, 0.7495..."
1,"{'C': 0.1, 'degree': 3, 'gamma': 'scale', 'ker...",0.715556,"[0.7346750149075731, 0.6521363162316879, 0.789...","[0.7201313403796232, 0.7209907209907209, 0.707...","[0.7260456492244571, 0.6830583543553068, 0.744..."
2,"{'C': 0.1, 'degree': 3, 'gamma': 'scale', 'ker...",0.716667,"[0.7180053547523427, 0.6805039416262875, 0.763...","[0.7485205780879355, 0.671786828036828, 0.7358...","[0.7321247631202382, 0.6739131685274006, 0.748..."
3,"{'C': 0.1, 'degree': 3, 'gamma': 'auto', 'kern...",0.715556,"[0.7203929539295393, 0.6775123420171967, 0.760...","[0.7485205780879355, 0.6685816998316998, 0.735...","[0.7332559848396953, 0.6705600321744536, 0.746..."
4,"{'C': 0.1, 'degree': 3, 'gamma': 'auto', 'kern...",0.326667,"[nan, nan, nan]","[0.25, 0.75, 0.0]","[nan, nan, nan]"
5,"{'C': 0.1, 'degree': 3, 'gamma': 'auto', 'kern...",0.716667,"[0.7180053547523427, 0.6805039416262875, 0.763...","[0.7485205780879355, 0.671786828036828, 0.7358...","[0.7321247631202382, 0.6739131685274006, 0.748..."
6,"{'C': 0.1, 'degree': 5, 'gamma': 'scale', 'ker...",0.714444,"[0.7179585058661145, 0.6733597145453516, 0.765...","[0.7389272978050001, 0.6746277371277372, 0.735...","[0.7271664035736014, 0.671676282190197, 0.7495..."
7,"{'C': 0.1, 'degree': 5, 'gamma': 'scale', 'ker...",0.716667,"[0.752666531409674, 0.6399592182595338, 0.7950...","[0.7139683034770219, 0.736693767943768, 0.7000...","[0.7318756332873039, 0.6837515185856076, 0.743..."
8,"{'C': 0.1, 'degree': 5, 'gamma': 'scale', 'ker...",0.716667,"[0.7180053547523427, 0.6805039416262875, 0.763...","[0.7485205780879355, 0.671786828036828, 0.7358...","[0.7321247631202382, 0.6739131685274006, 0.748..."
9,"{'C': 0.1, 'degree': 5, 'gamma': 'auto', 'kern...",0.715556,"[0.7203929539295393, 0.6775123420171967, 0.760...","[0.7485205780879355, 0.6685816998316998, 0.735...","[0.7332559848396953, 0.6705600321744536, 0.746..."


In [33]:
params_rf = {'n_estimators' : (50, 100, 150),
            'criterion' : ('entropy', 'gini'),
            'max_features' : (None, 'sqrt')}
rf = RandomForestClassifier()
rf_results = test_model(rf, params_rf, X_train, y_train, 4)

Testing {'criterion': 'entropy', 'max_features': None, 'n_estimators': 50}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': None, 'n_estimators': 100}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': None, 'n_estimators': 150}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 50}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 100}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 150}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'gini', 'max_features': None, 'n_estimators': 50}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'criterion': 'gini', 'max_features': None, 'n_estimators': 100}
	Fold 1 of 4
	Fold 2 of 4
	Fold

In [34]:
rf_results

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1
0,"{'criterion': 'entropy', 'max_features': None,...",0.661111,"[0.6838281886783727, 0.6165138265490377, 0.693...","[0.7284640846567952, 0.589307933057933, 0.6745...","[0.7040719163946114, 0.6002426536879591, 0.680..."
1,"{'criterion': 'entropy', 'max_features': None,...",0.667778,"[0.6910624098124097, 0.6222856401451216, 0.700...","[0.7214786811739418, 0.5982365044865046, 0.692...","[0.7041716316921351, 0.6076235736229976, 0.692..."
2,"{'criterion': 'entropy', 'max_features': None,...",0.662222,"[0.679763515886265, 0.6121859215492871, 0.7088...","[0.7198244936620503, 0.5912471537471538, 0.682...","[0.6976921315102591, 0.5994772067075989, 0.690..."
3,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.672222,"[0.6879324702854114, 0.6326623305839177, 0.710...","[0.7362782607056539, 0.6114035176535175, 0.675...","[0.7105386851985175, 0.6194516457026523, 0.688..."
4,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.666667,"[0.6855041311113529, 0.6214546523855244, 0.703...","[0.7266691274487801, 0.6048022923022922, 0.674...","[0.7043238026918084, 0.610851062610765, 0.6849..."
5,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.66,"[0.6743567406882431, 0.6113338728633043, 0.711...","[0.7106123621682214, 0.5983504108504109, 0.678...","[0.6913800354757844, 0.6017826920916476, 0.690..."
6,"{'criterion': 'gini', 'max_features': None, 'n...",0.666667,"[0.681336781233461, 0.6148702249406395, 0.7206...","[0.7266849804227186, 0.6109976734976735, 0.667...","[0.7024828698222143, 0.6110904963750492, 0.690..."
7,"{'criterion': 'gini', 'max_features': None, 'n...",0.662222,"[0.6840277777777778, 0.6099355128276682, 0.699...","[0.7190210284213707, 0.5901128713628714, 0.685...","[0.6990478145621759, 0.5981789890942335, 0.690..."
8,"{'criterion': 'gini', 'max_features': None, 'n...",0.657778,"[0.6902934419381788, 0.6082963237587594, 0.686...","[0.7120261705871299, 0.594452281952282, 0.6749...","[0.6992935829116473, 0.5987409565363686, 0.677..."
9,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.66,"[0.6614749189149344, 0.6163081210161594, 0.731...","[0.7033801736066319, 0.6137902075402075, 0.671...","[0.6803905584500443, 0.610969387755102, 0.6942..."


In [35]:
params_knn = {'n_neighbors' : (1, 3, 5, 7, 9),
             'metric' : ('minkowski', 'euclidean', 'manhattan', 'cosine')}
knn = KNeighborsClassifier()
knn_results = test_model(knn, params_knn, X_train, y_train, 4)

Testing {'metric': 'minkowski', 'n_neighbors': 1}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'minkowski', 'n_neighbors': 3}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'minkowski', 'n_neighbors': 5}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'minkowski', 'n_neighbors': 7}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'minkowski', 'n_neighbors': 9}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 1}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 3}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 5}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 7}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'metric': 'euclidean', 'n_neighbors': 9}
	Fold 1 of 4
	Fold 2 of 4
	Fold 

In [36]:
knn_results

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1
0,"{'metric': 'minkowski', 'n_neighbors': 1}",0.622222,"[0.626640518214774, 0.598409029330082, 0.64720...","[0.6162736778487197, 0.5855237105237105, 0.678...","[0.6201660022105943, 0.5905291005291006, 0.660..."
1,"{'metric': 'minkowski', 'n_neighbors': 3}",0.653333,"[0.6120287766399402, 0.6249589666802245, 0.757...","[0.7346176672080094, 0.5729951354951355, 0.659...","[0.6667784836095481, 0.5945604693906067, 0.702..."
2,"{'metric': 'minkowski', 'n_neighbors': 5}",0.681111,"[0.6761717223865602, 0.6259078237793224, 0.781...","[0.7438456516757764, 0.6458476770976771, 0.656...","[0.7068516236773519, 0.6326279729172442, 0.709..."
3,"{'metric': 'minkowski', 'n_neighbors': 7}",0.687778,"[0.6904639481042532, 0.6258954678362573, 0.786...","[0.7593521354169428, 0.6352166977166978, 0.671...","[0.7224861551491986, 0.6273474061067463, 0.719..."
4,"{'metric': 'minkowski', 'n_neighbors': 9}",0.697778,"[0.6837064559140968, 0.6480453850789961, 0.807...","[0.7636236540803036, 0.6588830651330652, 0.674...","[0.7203583038737347, 0.6502383898280313, 0.730..."
5,"{'metric': 'euclidean', 'n_neighbors': 1}",0.622222,"[0.626640518214774, 0.598409029330082, 0.64720...","[0.6162736778487197, 0.5855237105237105, 0.678...","[0.6201660022105943, 0.5905291005291006, 0.660..."
6,"{'metric': 'euclidean', 'n_neighbors': 3}",0.653333,"[0.6120287766399402, 0.6249589666802245, 0.757...","[0.7346176672080094, 0.5729951354951355, 0.659...","[0.6667784836095481, 0.5945604693906067, 0.702..."
7,"{'metric': 'euclidean', 'n_neighbors': 5}",0.681111,"[0.6761717223865602, 0.6259078237793224, 0.781...","[0.7438456516757764, 0.6458476770976771, 0.656...","[0.7068516236773519, 0.6326279729172442, 0.709..."
8,"{'metric': 'euclidean', 'n_neighbors': 7}",0.687778,"[0.6904639481042532, 0.6258954678362573, 0.786...","[0.7593521354169428, 0.6352166977166978, 0.671...","[0.7224861551491986, 0.6273474061067463, 0.719..."
9,"{'metric': 'euclidean', 'n_neighbors': 9}",0.697778,"[0.6837064559140968, 0.6480453850789961, 0.807...","[0.7636236540803036, 0.6588830651330652, 0.674...","[0.7203583038737347, 0.6502383898280313, 0.730..."


In [37]:
params_lr = {'penalty' : ('l1', 'l2', 'elasticnet', None),
            'C' : (0.1, 1, 10, 100),
            'solver' : ('lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'),
            'max_iter' : [500]}

lr = LogisticRegression()
lr_results = test_model(lr, params_lr, X_train, y_train, 4)

Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Skipped {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Skipped {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Skipped {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Skipped {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 0.1, 'max_i

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.7129e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=7.13088e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=7.13088e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedi

	Fold 4 of 4
Testing {'C': 0.1, 'max_iter': 500, 'penalty': None, 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Skipped {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'lbfgs'}
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Skipped {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cg'}
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Skipped {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'newton-cholesky'}
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Skipped {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'sag'}
Testing {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 1, 'max_iter': 500,

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.7129e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=7.13088e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=7.13088e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedi

	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
Skipped {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
Testing {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'liblinear'}
Skipped {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'liblinear'}
Testing {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'newton-cg'}
Skipped {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'newton-cg'}
Testing {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'newton-cholesky'}
Skipped {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'newton-cholesky'}
Testing {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'sag'}
Skipped {'C': 10, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'sag'}
Testing {'C': 10,

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.7129e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=7.13088e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=7.13088e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedi

	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'newton-cg'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'newton-cholesky'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'sag'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}
	Fold 1 of 4
	Fold 2 of 4
	Fold 3 of 4
	Fold 4 of 4
Testing {'C': 100, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
Skipped {'C': 100, 'max_iter': 500, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
Testing {'C': 100, 'max_iter'

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.7129e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=7.13088e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=7.13088e-17): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedi

In [38]:
lr_results

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1
0,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l1', '...",0.717778,"[0.7214995937685735, 0.6821010866059414, 0.761...","[0.7523667319340893, 0.6685816998316998, 0.739...","[0.735625265885447, 0.6725306586849574, 0.7489..."
1,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l1', '...",0.716667,"[0.7214995937685735, 0.6797765252024326, 0.760...","[0.7523667319340893, 0.6685816998316998, 0.735...","[0.735625265885447, 0.6715393374220515, 0.7468..."
2,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.715556,"[0.7203929539295393, 0.6775123420171967, 0.760...","[0.7485205780879355, 0.6685816998316998, 0.735...","[0.7332559848396953, 0.6705600321744536, 0.746..."
3,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.717778,"[0.7214995937685735, 0.6821010866059414, 0.761...","[0.7523667319340893, 0.6685816998316998, 0.739...","[0.735625265885447, 0.6725306586849574, 0.7489..."
4,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.715556,"[0.7203929539295393, 0.6775123420171967, 0.760...","[0.7485205780879355, 0.6685816998316998, 0.735...","[0.7332559848396953, 0.6705600321744536, 0.746..."
5,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.717778,"[0.7214995937685735, 0.6821010866059414, 0.761...","[0.7523667319340893, 0.6685816998316998, 0.739...","[0.735625265885447, 0.6725306586849574, 0.7489..."
6,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.715556,"[0.7203929539295393, 0.6775123420171967, 0.760...","[0.7485205780879355, 0.6685816998316998, 0.735...","[0.7332559848396953, 0.6705600321744536, 0.746..."
7,"{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '...",0.715556,"[0.7203929539295393, 0.6775123420171967, 0.760...","[0.7485205780879355, 0.6685816998316998, 0.735...","[0.7332559848396953, 0.6705600321744536, 0.746..."
8,"{'C': 0.1, 'max_iter': 500, 'penalty': None, '...",0.716667,"[0.7203929539295393, 0.6797765252024326, 0.761...","[0.7485205780879355, 0.6685816998316998, 0.739...","[0.7332559848396953, 0.6715393374220515, 0.748..."
9,"{'C': 0.1, 'max_iter': 500, 'penalty': None, '...",0.716667,"[0.7203929539295393, 0.6797765252024326, 0.761...","[0.7485205780879355, 0.6685816998316998, 0.739...","[0.7332559848396953, 0.6715393374220515, 0.748..."
