In [1]:
%load_ext blackcellmagic

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns

from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer

mpl.rcParams["figure.dpi"] = 300
%matplotlib inline
%config InlineBackend.figure_format ='retina'

In [3]:
raw = pd.read_csv("../data/train_raw.csv")

val = pd.read_csv("../data/validate.csv").dropna(subset=["clean_text"])


In [4]:
X = raw["clean_text"]
y = raw["troll_or_not"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2, shuffle=True, stratify=y
)


In [5]:
print(X_train.shape)
print(X_test.shape)

(71958,)
(17990,)


In [6]:
print(y_train.shape)
print(y_test.shape)

(71958,)
(17990,)


# 1. DUMMY BASELINE

In [7]:
dummy = DummyClassifier(strategy='stratified', random_state=42)

In [8]:
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)
print("Accuracy Score:", accuracy_score(y_test, dummy_pred))

Accuracy Score: 0.5013896609227348


# 2. GRID SEARCH FOR OPTIMAL PARAMETERS

Due to the size of the dataset, I kept the number of parameters for grid search down to max and min df. The parameters for the model had been consistent from earlier tests, so I kept them here.

In [9]:
model = Pipeline(
    [
        ("vect", CountVectorizer(stop_words="english")),
        ("tfidf", TfidfTransformer(use_idf=True, norm="l2")),
        (
            "clf",
            LogisticRegression(
                class_weight="balanced", penalty="l2", max_iter=500, n_jobs=-1
            ),
        ),
    ]
)

parameters = {
    "vect__ngram_range": [(1, 1), (1, 2)],
    "vect__max_df": (0.2, 0.5, 0.75, 1.0),
    "vect__min_df": (10, 15, 20, 25),
    "clf__random_state": (16, 42, 66),
    "clf__C": np.logspace(1, 5, 20),
    "clf__solver": ["liblinear", "sag", "saga", "newton-cg"],
}


In [14]:
%%time
gs_lr = GridSearchCV(
    model,
    parameters,
    cv=5,
    n_jobs=-1,
    scoring="accuracy",
    return_train_score=True,
    verbose=1,
)

gs_lr.fit(X_train, y_train)

Fitting 5 folds for each of 7680 candidates, totalling 38400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [None]:
print(gs_lr.best_score_)
print(gs_lr.best_params_)

In [11]:
stop

NameError: name 'stop' is not defined

In [None]:
#joblib.dump(gs_lr.best_estimator_, "../pkl/lr.pkl", compress=1)

In [None]:
#joblib.dump(gs_lr.cv_results_, "../pkl/lr_results.pkl", compress=1)

# 3. LOADING OPTIMISED MODEL

In [None]:
LR = joblib.load("../pkl/lr.pkl")

In [None]:
%%time

LR.fit(X, y)

In [None]:
# Loading up validation set

X_validate = val['clean_text']
y_validate = val['troll_or_not']

In [None]:
pred_validate = LR.predict(X_validate)

In [None]:
cm_validate = confusion_matrix(y_validate, pred_validate)
cm_validate

In [None]:
plt.figure(figsize=(12,12))
sns.heatmap(cm, annot=True, fmt="d", cmap="gist_gray_r", annot_kws={"size": 20})
plt.title("Predictions v Actual", fontsize=20)
plt.ylabel("Actual ", fontsize=20)
plt.xlabel("Predicted ", fontsize=20)
plt.tight_layout()

# NOTE: 
Slight dip in accuracy score. But this is not a good metric to begin with for imbalanced multi-class classification.

Let's look at the precision and recall scores.

# 5. EXAMPLES OF WHERE PREDICTIONS WENT RIGHT/WRONG

In [None]:
text_labels = LR.classes_
text_labels

In [None]:
test_tags = val['troll_or_not']
test_posts = val['clean_text']

In [None]:
for i in range(100,120):
    prediction = LR.predict(np.array([X_validate[i]]))        
    predicted_label = prediction
    print(test_posts.iloc[i][:100], "...")
    print('Actual label:' + test_tags.iloc[i])
    print("Predicted label: " + predicted_label)
    print("#####################################")