In [1]:
%load_ext blackcellmagic

In [2]:
import itertools
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import joblib 

from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

mpl.rcParams["figure.dpi"] = 300
%matplotlib inline
%config InlineBackend.figure_format ='retina'

In [3]:
# Training data is used as-is; validation rows without cleaned text are discarded.
# NOTE(review): `raw` is not filtered for missing `clean_text` the way `val` is --
# confirm the training CSV has no NaN in that column before vectorising it.
raw = pd.read_csv("../data/train_raw.csv")

val_unfiltered = pd.read_csv("../data/validate.csv")
val = val_unfiltered.dropna(subset=["clean_text"])

In [4]:
# Features are the cleaned text; the target is the troll / not-troll label.
X, y = raw["clean_text"], raw["troll_or_not"]

# 80/20 split, shuffled and stratified on the label, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, shuffle=True, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [5]:
# Sanity check: sizes of the train and test feature splits, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(71958,)
(17990,)


In [6]:
# Sanity check: sizes of the train and test label splits, one per line.
print(y_train.shape, y_test.shape, sep="\n")

(71958,)
(17990,)


# 2. DUMMY BASELINE

A baseline model gives the minimum accuracy a real model must beat to be considered useful. Here, a stratified dummy classifier scores about 0.50 on the held-out test split (see the output below).

In [7]:
# Baseline classifier using the "stratified" strategy with a fixed seed.
dummy = DummyClassifier(
    random_state=42,
    strategy="stratified",
)

In [8]:
# Fit the baseline on the training split and score it on the test split.
dummy_pred = dummy.fit(X_train, y_train).predict(X_test)
baseline_accuracy = accuracy_score(y_test, dummy_pred)
print("Accuracy Score:", baseline_accuracy)

Accuracy Score: 0.5013896609227348


# 3. PIPELINE CONSTRUCTION

Scikit Learn's pipeline feature streamlines the process of tuning a model's hyperparameters, an essential step for a tree-based model like XGBoost. Other classic machine learning models were tested, but did not perform better than XGB. In the interests of time, this is the only version included here.

Elements of this pipeline:
* CountVectorizer + TfidfTransformer; the classic combination for a [Bag-of-Words approach](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)

* SMOTE (Synthetic Minority Over-sampling Technique) is intended to address the imbalanced dataset. **Note:** the pipeline defined in section 3.1 below does not currently contain a SMOTE step.

* XGBClassifier. Due to the use of SMOTE, I did not use the OneVsRest approach, which is often the default for multi-class text classification problems. (The classifier below is configured with a binary objective, `binary:logistic`.)

## 3.1 PIPELINE + PARAMETERS TUNING

In [9]:
# Bag-of-words features: raw token counts (English stop words removed) ...
vectorizer = CountVectorizer(stop_words="english")
# ... re-weighted by TF-IDF with L2 normalisation.
tfidf = TfidfTransformer(norm="l2", use_idf=True)

# Gradient-boosted trees with a binary logistic objective. `reg_lambda` is also
# listed in the `parameters` search space defined later in this cell.
xgb_settings = dict(
    base_score=0.5,
    objective="binary:logistic",
    n_estimators=1000,
    reg_lambda=1,
    n_jobs=-1,
    verbosity=1,
)
classifier = XGBClassifier(**xgb_settings)

# Step names ("vect", "tfidf", "clf") are the prefixes used by the search space keys.
model = Pipeline(
    steps=[
        ("vect", vectorizer),
        ("tfidf", tfidf),
        ("clf", classifier),
    ]
)

# Search space for RandomizedSearchCV. Keys are "<pipeline step>__<parameter>".
#
# CountVectorizer raises ValueError when max_df corresponds to fewer documents than
# min_df. Both used to be drawn from 0.01-0.04, so any draw with max_df < min_df could
# not be fitted, and the remaining draws threw away every term that appears in more
# than 1-4% of documents. max_df now only trims very common terms (50-100% of
# documents), which keeps it above every min_df candidate.
parameters = {
    "vect__max_df": [i / 10 for i in range(5, 11)],  # 0.5 ... 1.0
    "vect__min_df": [i / 100 for i in range(1, 5)],  # 0.01 ... 0.04
    "clf__learning_rate": [0.0001, 0.001, 0.01, 0.1],
    # NOTE(review): searching over the seed treats run-to-run noise as a
    # hyperparameter -- consider fixing a single seed on the classifier instead.
    "clf__seed": [16, 42, 66],
    "clf__max_depth": list(range(5, 25)),
    "clf__reg_lambda": list(range(1, 10)),
    "clf__gamma": [i / 10.0 for i in range(1, 10)],
    "clf__reg_alpha": [1e-5, 1e-2, 0.1, 1, 10],
}


In [10]:
# RandomizedSearchCV is used rather than GridSearchCV because of how many parameters
# are being tuned: it samples `n_iter` candidates instead of trying every combination,
# finishing far sooner at the cost of a slight dent in performance.
# See https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html

gs_clf_xgb = RandomizedSearchCV(
    estimator=model,
    param_distributions=parameters,
    n_iter=10,
    cv=10,
    scoring="accuracy",
    random_state=42,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

# fit() returns the search object itself, so no re-assignment is needed.
gs_clf_xgb.fit(X_train, y_train)

print("Best Score:", gs_clf_xgb.best_score_)
print("Best Parameters:", gs_clf_xgb.best_params_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so running this
# cell raises NameError -- presumably a deliberate tripwire that halts "Run All" here.
# TODO: replace with a cache check around the search so the notebook runs end to end.
stop

## 3.2 PICKLING THE OPTIMISED MODEL

Saving the fitted model is essential for later steps such as deployment: it avoids having to re-run the hyperparameter search every time the model is needed.

In [None]:
# Saves the best pipeline found by the search. Left commented out -- presumably so a
# re-run does not overwrite the existing pickle; it needs a completed search to work.
#joblib.dump(gs_clf_xgb.best_estimator_, "../pkl/xgb_rdcv.pkl", compress=1)

In [None]:
# Saves the per-candidate cross-validation results of the search. Left commented out
# -- presumably for the same reason as the model dump; needs a completed search.
#joblib.dump(gs_clf_xgb.cv_results_, "../pkl/xgb_results.pkl", compress=1)

## 3.3 LOADING OPTIMISED MODEL

In [None]:
# Load the previously saved pipeline. joblib.load unpickles the file and can execute
# arbitrary code while doing so -- only load pickles produced by this project.
XGB = joblib.load("../pkl/xgb_rdcv.pkl")

In [None]:
# Refit the loaded pipeline on the full training data-set (X, y cover every row of
# `raw`, i.e. the train and test splits together) ahead of testing it on unseen data.
# This step takes a long time on a laptop (the original note attributes it to the file size).

XGB.fit(X, y)

# 4. PITTING MODEL AGAINST VALIDATION SETS

Let's see how the model performs against the unseen validation set, which was kept aside and never used for training or tuning.

In [None]:
# Loading up validation set

# Features and labels of the held-out validation frame.
X_validate, y_validate = val["clean_text"], val["troll_or_not"]

In [None]:
# Predict with the loaded XGBoost pipeline. The original referenced `LR`, which is
# never defined in this notebook and raised NameError on a fresh kernel.
pred_validate = XGB.predict(X_validate)


In [None]:
# Confusion matrix of validation labels (rows) against predictions (columns).
cm_validate = confusion_matrix(y_true=y_validate, y_pred=pred_validate)
cm_validate

In [None]:
# Heatmap of the validation confusion matrix. The original passed `cm`, a name that is
# never defined in this notebook; the matrix computed for the validation set is
# `cm_validate`.
plt.figure(figsize=(12, 12))
sns.heatmap(
    cm_validate, annot=True, fmt="d", cmap="gist_gray_r", annot_kws={"size": 20}
)
plt.title("Predictions v Actual", fontsize=20)
plt.ylabel("Actual ", fontsize=20)
plt.xlabel("Predicted ", fontsize=20)
plt.tight_layout()

# 5. EXAMPLES OF WHERE PREDICTIONS WENT RIGHT/WRONG

In [None]:
# Class labels exposed by the fitted pipeline, displayed as the cell output.
text_labels = XGB.classes_
text_labels

In [None]:
# Validation labels and texts, under the names used by the example loop.
test_tags, test_posts = val["troll_or_not"], val["clean_text"]

In [None]:
# Print every validation post next to its actual and predicted label.
#
# Fixes relative to the original loop:
# * `test_posts[i]` was a label-based lookup; after `dropna` the index has gaps, so it
#   raised KeyError (or fetched a different row than `.iloc[i]`). Iterating the Series
#   directly walks the rows by position.
# * Labels are converted with str() before concatenation, so non-string labels and
#   numpy values no longer raise TypeError.
# * The model predicts once for the whole set instead of once per row.
# NOTE(review): this prints one entry per validation row -- consider slicing to a
# handful of examples before sharing the notebook.
predicted_labels = XGB.predict(test_posts)

for post, actual_label, predicted_label in zip(test_posts, test_tags, predicted_labels):
    print(post, "...")
    print("Actual label:" + str(actual_label))
    print("Predicted label: " + str(predicted_label))
    print("#####################################")