# Test Logistic Regression model

- **Trained on**: Waseem and Hovy (2016)
- **Tested on**: SemEval (2019)

First we need to install the required packages.

In [1]:
!pip install tweet-preprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [2]:
!pip install wordsegment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wordsegment
  Downloading wordsegment-1.3.1-py2.py3-none-any.whl (4.8 MB)
     |████████████████████████████████| 4.8 MB 12.6 MB/s 
Installing collected packages: wordsegment
Successfully installed wordsegment-1.3.1


In [3]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from google.colab import drive
import preprocessor as p
import html
import pickle
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from joblib import dump, load
import numpy as np

# Mount Google Drive at /content/drive. The datasets and the saved model used
# further down are read from paths inside the mounted Drive.
drive.mount('/content/drive')
import sys
# Put the notebooks folder first on the import path so that the
# `from reader import Reader` below can be resolved
# (presumably reader.py lives in that folder — confirm).
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/')

from reader import Reader

# Largest character n-gram length passed to the CountVectorizer.
MAX_N_GRAM = 4

# Absolute paths under the Drive mount point ('/content/drive'), so that loading
# the datasets does not depend on the notebook's current working directory.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/data"
WASEEM_FILENAME = DATA_DIR + "/twitter_data.pkl"
SEMEVAL_FILENAME = DATA_DIR + "/sem_eval_all.pkl"

Mounted at /content/drive


## Split and tokenize the datasets

In [4]:
def convert_semeval_data(data):
    """Split SemEval records into parallel text and binary-label lists.

    Records are read by position (``data[0]`` .. ``data[len(data) - 1]``) and
    each one must provide the ``'text'`` and ``'label'`` keys.

    Args:
        data: Indexable collection of records.

    Returns:
        A ``(X, y)`` tuple: the list of texts and the list of labels, where
        the label ``'hate'`` becomes 1 and every other label becomes 0.
    """
    pairs = [
        (data[idx]['text'], 1 if data[idx]['label'] == 'hate' else 0)
        for idx in range(len(data))
    ]
    texts = [text for text, _ in pairs]
    binary_labels = [flag for _, flag in pairs]
    return texts, binary_labels

In [5]:
# Load the Waseem and Hovy data and preprocess the texts.
raw_texts, raw_labels = Reader.load(WASEEM_FILENAME)
X = Reader.preprocess(raw_texts)

# Binarise the labels: 'racism' and 'sexism' both map to the positive class.
mapping = dict(racism=1, sexism=1, none=0)
y = [mapping[label] for label in raw_labels]

# Split into train and test parts; the vectorizer below is fitted on X_train.
X_train, X_test, y_train, y_test = Reader.split(X, y)


In [6]:
# Character n-grams of length 1..MAX_N_GRAM; the vocabulary is learned from
# X_train, which at this point holds the Waseem and Hovy training texts.
# NOTE(review): the model loaded later presumably expects exactly this
# vocabulary; if Reader.split is not deterministic, the features built here may
# not line up with the ones the model was trained on — confirm.
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, MAX_N_GRAM))
vectorizer.fit(X_train)

CountVectorizer(analyzer='char', ngram_range=(1, 4))

In [7]:
# Load the SemEval data and apply the same preprocessing as for the training data.
X, y = Reader.load(SEMEVAL_FILENAME)
X = Reader.preprocess(X)

# Binary labels: 'hate' is the positive class.
mapping = {'hate': 1, 'none': 0}
y = [mapping[b] for b in y]

# Only the SemEval test part is evaluated below. The train part gets its own
# names so that X_train / y_train keep pointing at the Waseem and Hovy split;
# otherwise re-running the vectorizer cell after this one would silently fit
# the vocabulary on SemEval data.
semeval_X_train, X_test, semeval_y_train, y_test = Reader.split(X, y)

In [8]:
# Encode the SemEval test texts with the already fitted vectorizer (transform
# only, so the vocabulary is not re-fitted on the test data).
X_test_transformed = vectorizer.transform(X_test)

## Load existing model

In [9]:
# Load the previously saved model from Drive (a logistic-regression model
# trained on the Waseem data, going by the file name).
# NOTE: joblib.load is pickle-based and can execute arbitrary code while
# loading, so only load model files from a trusted source.
model = load('/content/drive/MyDrive/Colab Notebooks/output/logistic-regression-waseem.joblib') 

## Run model on the test dataset

In [10]:
# Class probabilities for every test sample; the prediction is the index of the
# most probable column (this equals the label as long as the model's classes
# are 0 and 1 in that order — presumably the case here, confirm).
probabilities = model.predict_proba(X_test_transformed)
predictions = probabilities.argmax(axis=1)

# Fraction of test samples whose predicted label matches the true label.
accuracy = metrics.accuracy_score(y_test, predictions)
print(accuracy)

0.6404166666666666


In [11]:
# Per-class precision, recall and F1 on the SemEval test split
# (0 = 'none', 1 = 'hate', as defined by the label mapping above).
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.67      0.76      0.71      1391
           1       0.59      0.48      0.53      1009

    accuracy                           0.64      2400
   macro avg       0.63      0.62      0.62      2400
weighted avg       0.63      0.64      0.63      2400



## Export predictions

In [12]:
def to_predictions(predictions, labels):
    """Build one export record per sample from class probabilities and labels.

    Args:
        predictions: 2-D array-like with one row of class probabilities per
            sample. Nested lists are accepted as well as numpy arrays.
        labels: Iterable with the true class index of every sample.

    Returns:
        A list of dicts with the keys ``'predicted_class'`` and
        ``'actual_class'`` (names produced by ``class_name``),
        ``'predicted_value'`` (probability of the predicted class as a plain
        Python number) and ``'text'`` (always ``None``).

    Raises:
        ValueError: If ``predictions`` and ``labels`` differ in length.
    """
    # Converting up front makes `.item()` below work for plain nested lists too.
    probs = np.asarray(predictions)
    labels = list(labels)

    # Pairing the inputs element-wise would otherwise stop at the shorter one
    # and silently drop samples from the export.
    if len(probs) != len(labels):
        raise ValueError(
            "predictions and labels must have the same length, got "
            "%d and %d" % (len(probs), len(labels)))

    # Nothing to export; also avoids argmax over axis 1 of an empty 1-D array.
    if not labels:
        return []

    predicted_classes = np.argmax(probs, axis=1)
    return [
        {'predicted_class': class_name(predicted_class),
         'actual_class': class_name(actual_class),
         'predicted_value': row[predicted_class].item(),
         'text': None}
        for row, predicted_class, actual_class
        in zip(probs, predicted_classes, labels)
    ]
def class_name(index):
    """Return the display name of a binary class index.

    Args:
        index: Class index; 0 is the non-hate class, 1 the hate class.

    Returns:
        ``"None"`` for 0, ``"Hate"`` for 1, and ``None`` for any other value.
    """
    for code, name in ((0, "None"), (1, "Hate")):
        if index == code:
            return name
    return None

# Build one record per test sample and write the list to Drive.
predictions_info = to_predictions(probabilities, y_test)

# The context manager flushes and closes the file even if pickling fails.
with open("drive/MyDrive/Colab Notebooks/output/lr-waseem-semeval.p", "wb") as output_file:
    pickle.dump(predictions_info, output_file)