# Test Logistic Regression model

- **Trained on**: SemEval (2019)
- **Tested on**: Waseem and Hovy (2016)

First we need to install the required packages.

In [10]:
!pip install tweet-preprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
from google.colab import drive

# Mount Google Drive at /content/drive; the datasets, the saved model and the
# exported predictions are all read from / written to paths under it.
drive.mount('/content/drive')

import sys
# Make modules stored in the notebooks folder importable (the `reader` module
# imported below is presumably kept there — verify). The path is relative, so
# it only resolves while the working directory is /content.
sys.path.append("drive/MyDrive/Colab Notebooks")

from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from google.colab import drive
import preprocessor as p
import html
import pickle
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from joblib import dump, load
import numpy as np
from reader import Reader


# Upper bound of the character n-gram range passed to the CountVectorizer.
MAX_N_GRAM = 4
# Dataset files on the mounted Drive, given relative to the working directory.
# WASEEM_FILENAME is the evaluation corpus, SEMEVAL_FILENAME the training corpus.
WASEEM_FILENAME = "drive/MyDrive/Colab Notebooks/data/twitter_data.pkl"
SEMEVAL_FILENAME = "drive/MyDrive/Colab Notebooks/data/sem_eval_all.pkl"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Split and tokenize the datasets

In [12]:
def preprocess(data):
    """Return the texts in ``data`` with URLs and @-mentions stripped.

    Each text is HTML-unescaped first and then passed through the
    tweet-preprocessor cleaner, which is configured to remove only URLs and
    mentions because these carry no useful signal for classification.

    Args:
        data: iterable of raw text strings.

    Returns:
        A new list with one cleaned string per input text, in input order.
    """
    # Restrict the cleaner to URLs and mentions; everything else is kept.
    p.set_options(p.OPT.URL, p.OPT.MENTION)

    # Lazily unescape so each text is unescaped and cleaned in turn.
    unescaped_texts = (html.unescape(raw_text) for raw_text in data)
    return [p.clean(text) for text in unescaped_texts]


In [13]:
# Load the SemEval corpus: the texts and their string labels.
reader = Reader(filename=SEMEVAL_FILENAME)
X, y = reader.load()

# Binarise the labels ('hate' -> 1, 'none' -> 0); any other label raises
# KeyError. Then strip URLs and mentions from the texts.
mapping = {'hate': 1, 'none': 0}
y = [mapping[b] for b in y]
X = preprocess(X)

# Keep only the 90% training portion (stratified on the label, fixed seed);
# the held-out 10% of SemEval is discarded in this notebook.
X_train, _, y_train, _ = train_test_split(X, y, random_state=10, stratify=y, test_size=0.10)

In [14]:
# Character n-gram counts (1 to MAX_N_GRAM characters); the vocabulary is
# learned from the SemEval training texts only.
# NOTE(review): the saved model loaded further down presumably expects exactly
# this vocabulary and column order — confirm the training notebook used the
# same preprocessing and the same split (random_state=10, test_size=0.10).
vectorizer = CountVectorizer(ngram_range=(1, MAX_N_GRAM), analyzer="char")
vectorizer.fit(X_train)

CountVectorizer(analyzer='char', ngram_range=(1, 4))

In [15]:
# Load the Waseem and Hovy corpus. This rebinds `reader`, `X`, `y` and
# `mapping`, so the SemEval values from the earlier cell are no longer
# available under those names.
reader = Reader(filename=WASEEM_FILENAME)
X, y = reader.load()

# Collapse 'racism' and 'sexism' into the single positive class 1 so the
# labels line up with the binary hate/none scheme used for SemEval.
mapping = {'racism': 1, 'sexism': 1, 'none': 0}
y = [mapping[b] for b in y]
X = preprocess(X)

# Evaluate on the stratified 10% test portion only (fixed seed); the other
# 90% of the Waseem data is not used in this notebook.
_, X_test, _, y_test = train_test_split(X, y, random_state=10, stratify=y, test_size=0.10)

In [16]:
# Encode the Waseem test tweets with the vocabulary fitted on SemEval;
# character n-grams that were not seen during fitting are ignored by transform.
X_test_transformed = vectorizer.transform(X_test)

## Load existing model

In [17]:
# Load the previously saved logistic-regression model (SemEval-trained, per
# the file name).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
model = load('/content/drive/MyDrive/Colab Notebooks/output/logistic-regression-semeval.joblib') 

## Run model on the test dataset

In [18]:
# Per-class probabilities for every test tweet; the predicted class is taken
# as the column with the highest probability.
# NOTE(review): argmax yields column indices, which coincide with the class
# labels only if model.classes_ is [0, 1] — confirm.
probabilities = model.predict_proba(X_test_transformed)
predictions = np.argmax(probabilities, axis=1)
print(metrics.accuracy_score(y_test, predictions))

0.692355500310752


In [19]:
# Per-class precision, recall and F1, where class 1 is racism/sexism and
# class 0 is none. print() is needed because the report is a formatted string.
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.70      0.98      0.81      1104
           1       0.58      0.07      0.13       505

    accuracy                           0.69      1609
   macro avg       0.64      0.52      0.47      1609
weighted avg       0.66      0.69      0.60      1609



## Export predictions

In [20]:
def to_predictions(predictions, labels):
    """Build one exportable record per sample from class probabilities.

    Args:
        predictions: 2-D array of per-class scores, one row per sample.
        labels: iterable of true class indices, aligned with the rows.

    Returns:
        A list of dicts with the predicted class name, the actual class name,
        the score of the predicted class as a plain Python number, and a
        ``'text'`` slot that is always ``None``.
    """
    predicted_classes = np.argmax(predictions, axis=1)

    records = []
    # zip stops at the shortest input, just as a multi-iterable map does.
    for scores, predicted, actual in zip(predictions, predicted_classes, labels):
        records.append({
            'predicted_class': class_name(predicted),
            'actual_class': class_name(actual),
            # .item() converts the numpy scalar into a native Python value.
            'predicted_value': scores[predicted].item(),
            'text': None,
        })
    return records
def class_name(index):
    """Translate a binary class index into its display name.

    Returns ``"None"`` for 0 and ``"Hate"`` for 1. Any other value yields
    ``None`` rather than raising.
    """
    # Compare with == (not a dict lookup) so numpy integers, bools and
    # unhashable values are handled the same way as an if/elif chain would.
    for known_index, label in ((0, "None"), (1, "Hate")):
        if index == known_index:
            return label
    return None

# Turn the predicted probabilities and the true labels into per-tweet records.
predictions_info = to_predictions(probabilities, y_test)

# Write the records inside a context manager so the file is flushed and closed
# deterministically, instead of leaving an anonymous open handle for the
# garbage collector to clean up.
with open("drive/MyDrive/Colab Notebooks/output/lr-semeval-waseem.p", "wb") as predictions_file:
    pickle.dump(predictions_info, predictions_file)