# Test Logistic Regression model (train: Waseem and Hovy 2016, test: Waseem and Hovy 2016)

First, we need to install the required packages.

In [None]:
!pip install tweet-preprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [None]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from google.colab import drive
import preprocessor as p
import html
import pickle
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from joblib import dump, load
import numpy as np

# Mount Google Drive so the dataset and the saved model are reachable
# under /content/drive.
drive.mount('/content/drive')
import sys
# Put the notebook folder first on the module search path.
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/')

# Upper bound of the character n-gram range passed to CountVectorizer.
MAX_N_GRAM = 4
# Pickled dataset. The path is absolute (matching the mount point above) so
# loading it does not depend on the current working directory.
FILENAME = "/content/drive/MyDrive/Colab Notebooks/data/twitter_data.pkl"

Mounted at /content/drive


## Split and tokenize the datasets

In [None]:
def preprocess(data):
    """Clean every text in ``data`` and return the results as a new list.

    URLs and mentions are removed because they carry no useful signal for
    the classifier. Each text is HTML-unescaped before it is cleaned, and
    the order of ``data`` is preserved.
    """
    # Restrict tweet-preprocessor to URL and MENTION cleaning.
    p.set_options(p.OPT.URL, p.OPT.MENTION)

    return [p.clean(html.unescape(text)) for text in data]

def convert_waseem_data(data):
    """Split the Waseem and Hovy records into texts and binary labels.

    ``data`` is read by position (0 .. len(data) - 1) and every record must
    provide a 'text' and a 'label' entry. Records labelled 'racism' or
    'sexism' map to 1; every other label maps to 0.

    Returns a tuple ``(X, y)`` of two parallel lists: the texts and the
    binary labels.
    """
    hateful_labels = ('racism', 'sexism')
    records = [data[idx] for idx in range(len(data))]
    X = [record['text'] for record in records]
    y = [1 if record['label'] in hateful_labels else 0 for record in records]
    return X, y

In [None]:
# NOTE: pickle.load can execute arbitrary code, so only load dataset files
# that come from a trusted source. The context manager closes the file
# handle as soon as the data has been read.
with open(FILENAME, 'rb') as dataset_file:
    data = pickle.load(dataset_file)
X, y = convert_waseem_data(data)
X = preprocess(X)

# Stratified split into train (90%) and test (10%) sets; the fixed
# random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, stratify=y, test_size=0.10)

In [None]:
# Learn the character n-gram vocabulary from the training texts only, then
# encode the test texts with that vocabulary.
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, MAX_N_GRAM)).fit(X_train)
X_test_transformed = vectorizer.transform(X_test)

## Load existing model

In [None]:
# Location of the previously saved logistic regression model.
# NOTE(review): joblib files are pickle-based; only load models from a trusted source.
MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/output/logistic-regression-waseem.joblib'
model = load(MODEL_PATH)

## Run model on the test dataset

In [None]:
# Per-class probabilities for every test sample.
probabilities = model.predict_proba(X_test_transformed)
# The column index of the highest probability is used as the predicted label
# (assumes the model's classes are ordered 0, 1 -- TODO confirm).
predictions = probabilities.argmax(axis=1)
accuracy = metrics.accuracy_score(y_test, predictions)
print(accuracy)

0.853325046612803


In [None]:
# Per-class precision, recall and F1 on the test split.
report = metrics.classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.92      0.90      1104
           1       0.81      0.70      0.75       505

    accuracy                           0.85      1609
   macro avg       0.84      0.81      0.82      1609
weighted avg       0.85      0.85      0.85      1609



## Export predictions

In [None]:
def to_predictions(predictions, labels):
    """Build one summary record per sample from per-class scores.

    ``predictions`` is indexed per sample and then per class; the class with
    the highest score in each row is taken as the predicted class.
    ``labels`` holds the true class index of each sample.

    Returns a list of dicts with the keys 'predicted_class' and
    'actual_class' (names from ``class_name``), 'predicted_value' (the score
    of the predicted class as a plain Python number) and 'text' (always
    None). If the inputs differ in length, the result stops at the shortest.
    """
    predicted_classes = np.argmax(predictions, axis=1)
    records = []
    for scores, predicted, actual in zip(predictions, predicted_classes, labels):
        records.append({
            'predicted_class': class_name(predicted),
            'actual_class': class_name(actual),
            'predicted_value': scores[predicted].item(),
            'text': None,
        })
    return records
def class_name(index):
    """Return the display name of a class index.

    0 maps to "None" and 1 maps to "Hate"; any other value yields None.
    """
    for known_index, name in ((0, "None"), (1, "Hate")):
        if index == known_index:
            return name
    return None

predictions_info = to_predictions(probabilities, y_test)
# Write through a context manager so the file is flushed and closed as soon
# as the dump finishes. The path is absolute so the export does not depend
# on the current working directory.
with open("/content/drive/MyDrive/Colab Notebooks/output/lr-waseem-waseem.p", "wb") as output_file:
    pickle.dump(predictions_info, output_file)