# Test Logistic Regression model

- **Trained on**: Waseem and Hovy (2016)
- **Tested on**: SemEval (2019)

First we need to install the required packages.

In [None]:
!pip install tweet-preprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Standard library
import html
import pickle
import sys

# Third-party
import numpy as np
import preprocessor as p
from google.colab import drive
from joblib import dump, load
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Mount drive for loading the datasets
drive.mount('/content/drive')

# The notebook folder has to be on sys.path *before* the project-local import
# below runs; otherwise a fresh kernel (Restart & Run All) cannot resolve it.
# NOTE(review): presumably reader.py lives in this Drive folder — confirm.
COLAB_NOTEBOOKS_DIR = '/content/drive/MyDrive/Colab Notebooks/'
if COLAB_NOTEBOOKS_DIR not in sys.path:
    # Guarded so that re-running the cell does not stack duplicate entries.
    sys.path.insert(0, COLAB_NOTEBOOKS_DIR)

# Project-local
from reader import Reader

# Upper bound of the character n-gram range used by the vectorizer.
MAX_N_GRAM = 4
# Dataset locations, relative to the working directory.
WASEEM_FILENAME = "drive/MyDrive/Colab Notebooks/data/twitter_data.pkl"
SEMEVAL_FILENAME = "drive/MyDrive/Colab Notebooks/data/sem_eval_all.pkl"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Split and tokenize the datasets

In [None]:
def convert_semeval_data(data):
    """Split SemEval records into parallel lists of texts and binary labels.

    Args:
        data: An indexable collection (positions ``0 .. len(data) - 1``) whose
            items expose a ``'text'`` and a ``'label'`` key.

    Returns:
        A ``(texts, labels)`` tuple. ``labels`` holds ``1`` where the record's
        label is exactly ``'hate'`` and ``0`` for every other value.
    """
    texts, binary_labels = [], []
    for position in range(len(data)):
        texts.append(data[position]['text'])
        # Anything that is not literally 'hate' counts as the negative class.
        binary_labels.append(1 if data[position]['label'] == 'hate' else 0)
    return texts, binary_labels

In [None]:
# Load the Waseem & Hovy data and run the project's preprocessing on the texts.
X, y = Reader.load(WASEEM_FILENAME)
X = Reader.preprocess(X)

# Collapse the labels to binary: 'racism' and 'sexism' -> 1, 'none' -> 0.
mapping = {
    'racism': 1,
    'sexism': 1,
    'none': 0,
}
y = [mapping[label] for label in y]

# Stratified 90/10 split with a fixed seed. Only the 90% training portion is
# kept (it is what the vectorizer is fitted on); the 10% hold-out is discarded.
X_train, _, y_train, _ = train_test_split(
    X, y, test_size=0.10, stratify=y, random_state=10
)

In [None]:
# Character-level n-gram counts (1 .. MAX_N_GRAM), with the vocabulary learned
# from the Waseem training split only.
# NOTE(review): the vocabulary is rebuilt here rather than loaded from disk;
# presumably it matches the one the saved model was trained with — confirm.
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, MAX_N_GRAM))
vectorizer.fit(X_train)

CountVectorizer(analyzer='char', ngram_range=(1, 4))

In [None]:
# Load the SemEval data and run the project's preprocessing on the texts.
X, y = Reader.load(SEMEVAL_FILENAME)
X = Reader.preprocess(X)

# Binary labels: 'hate' -> 1, 'none' -> 0.
mapping = {
    'hate': 1,
    'none': 0,
}
y = [mapping[label] for label in y]

# Stratified 90/10 split with a fixed seed. Only the 10% test portion is kept
# for evaluation; the remaining 90% is discarded.
_, X_test, _, y_test = train_test_split(
    X, y, test_size=0.10, stratify=y, random_state=10
)

In [None]:
# Encode the SemEval test texts with the vocabulary fitted on the Waseem
# training split (transform only — the vectorizer is not refitted here).
X_test_transformed = vectorizer.transform(X_test)

## Load existing model

In [None]:
# Restore the previously saved classifier from Drive.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
model = load('/content/drive/MyDrive/Colab Notebooks/output/logistic-regression-waseem.joblib')

## Run model on the test dataset

In [None]:
# Per-class probabilities for every test sample.
probabilities = model.predict_proba(X_test_transformed)
# The predicted class is the column index with the highest probability.
# NOTE(review): assumes the model's class order is [0, 1], so that the column
# index equals the label — confirm against model.classes_.
predictions = probabilities.argmax(axis=1)
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.6308333333333334


In [None]:
# Per-class precision / recall / F1 on the SemEval test split.
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.67      0.71      0.69       696
           1       0.57      0.52      0.54       504

    accuracy                           0.63      1200
   macro avg       0.62      0.62      0.62      1200
weighted avg       0.63      0.63      0.63      1200



## Export predictions

In [None]:
def to_predictions(predictions, labels):
    """Build one export record per sample from class probabilities and labels.

    Args:
        predictions: 2-D NumPy array of per-class scores, one row per sample.
        labels: Iterable of true class indices, aligned with the rows.

    Returns:
        A list of dicts with the keys ``'predicted_class'`` and
        ``'actual_class'`` (names from ``class_name``), ``'predicted_value'``
        (the score of the winning class as a Python scalar) and ``'text'``
        (always ``None``).
    """
    winning_indices = np.argmax(predictions, axis=1)
    records = []
    # zip stops at the shortest input, just like a multi-iterable map.
    for scores, winner, truth in zip(predictions, winning_indices, labels):
        records.append({
            'predicted_class': class_name(winner),
            'actual_class': class_name(truth),
            'predicted_value': scores[winner].item(),
            'text': None,
        })
    return records
def class_name(index):
    """Return the display name for a class index.

    Args:
        index: Class index; ``0`` maps to ``"None"`` and ``1`` to ``"Hate"``.

    Returns:
        The class name, or ``None`` when the index matches neither class.
    """
    for known_index, name in ((0, "None"), (1, "Hate")):
        if index == known_index:
            return name
    return None

# Turn the probabilities and true labels into export records.
predictions_info = to_predictions(probabilities, y_test)

# Write the records to Drive. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
with open("drive/MyDrive/Colab Notebooks/output/lr-waseem-semeval.p", "wb") as predictions_file:
    pickle.dump(predictions_info, predictions_file)