# Test Logistic Regression model

- **Trained on**: SemEval (2019)
- **Tested on**: SemEval (2019)

First we need to install the required packages.

In [None]:
!pip install tweet-preprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [None]:
# Standard library
import html
import pickle
import sys

# Third-party
import numpy as np
import preprocessor as p
from google.colab import drive
from joblib import dump, load
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split

# Mount Google Drive; the dataset and the saved model are read from it.
drive.mount('/content/drive')

# Absolute path under the mount point, so nothing depends on the current
# working directory of the runtime.
NOTEBOOK_DIR = "/content/drive/MyDrive/Colab Notebooks"

# Make the notebook folder importable. This must happen before the local
# `reader` import below (presumably reader.py lives in this folder - confirm).
sys.path.append(NOTEBOOK_DIR)

# Local module
from reader import Reader

# Upper bound of the character n-gram range used by the vectorizer.
MAX_N_GRAM = 4
# Pickled dataset that is loaded through `Reader`.
FILENAME = "/content/drive/MyDrive/Colab Notebooks/data/sem_eval_all.pkl"

Mounted at /content/drive


## Split and tokenize the datasets

In [None]:
def preprocess(data):
    """Clean every text in ``data`` and return the results as a list.

    Each text is first passed through ``html.unescape`` and then through the
    tweet preprocessor, which is configured to remove only URLs and mentions
    (these carry no valuable information for the classifier).

    Args:
        data: iterable of raw text strings.

    Returns:
        list with one cleaned string per input text, in the same order.
    """
    # Limit the cleaner to URLs and mentions; this sets module-wide options
    # on the `preprocessor` package.
    p.set_options(p.OPT.URL, p.OPT.MENTION)

    cleaned_texts = [p.clean(html.unescape(text)) for text in data]
    return cleaned_texts

In [None]:
# Read the dataset from FILENAME; the result of `load()` is unpacked into the
# texts (X) and their labels (y).
reader = Reader(filename=FILENAME)
X, y = reader.load()

# Encode the string labels as integers: 'hate' -> 1, 'none' -> 0.
# Any other label raises a KeyError.
mapping = {'hate': 1, 'none': 0}
y = [mapping[label] for label in y]

# Remove URLs and mentions from every text.
X = preprocess(X)

# Hold out a stratified 10% of the samples as the test set. The fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    stratify=y,
    random_state=10,
)

In [None]:
# Character n-gram counts for n = 1..MAX_N_GRAM, with the vocabulary learned
# from the training split only.
# NOTE(review): the vectorizer is re-fitted here instead of being loaded next
# to the model. The features only line up with the saved model if training
# used the same data, preprocessing and split - confirm.
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, MAX_N_GRAM)).fit(X_train)

# Project the held-out texts onto the learned vocabulary.
X_test_transformed = vectorizer.transform(X_test)

## Load existing model

In [None]:
# Saved classifier on the mounted Drive. joblib.load unpickles the file, so
# only point this at artifacts from a trusted source.
MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/output/logistic-regression-semeval.joblib'
model = load(MODEL_PATH)

## Run model on the test dataset

In [None]:
# Per-class probabilities for every test sample.
probabilities = model.predict_proba(X_test_transformed)

# The predicted class is the column index with the highest probability.
predictions = probabilities.argmax(axis=1)

accuracy = metrics.accuracy_score(y_test, predictions)
print(accuracy)

0.7341666666666666


In [None]:
# Precision, recall and F1 per class on the held-out test set.
report = metrics.classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.75      0.81      0.78       696
           1       0.71      0.63      0.67       504

    accuracy                           0.73      1200
   macro avg       0.73      0.72      0.72      1200
weighted avg       0.73      0.73      0.73      1200



## Export predictions

In [None]:
def to_predictions(predictions, labels):
    """Build one export record per sample from class scores and true labels.

    Args:
        predictions: 2-D array-like of per-class scores, one row per sample.
            The predicted class of a row is its argmax along axis 1. Rows
            must support indexing by that class and the selected element must
            provide ``.item()`` (e.g. rows of a NumPy array).
        labels: iterable of actual class indices, aligned with the rows of
            ``predictions``.

    Returns:
        list of dicts with the keys ``predicted_class`` and ``actual_class``
        (names from ``class_name``), ``predicted_value`` (score of the
        predicted class as a plain Python scalar) and ``text`` (always None).
        Iteration stops at the shortest of the inputs.
    """
    predicted_classes = np.argmax(predictions, axis=1)

    records = []
    for scores, predicted, actual in zip(predictions, predicted_classes, labels):
        records.append({
            'predicted_class': class_name(predicted),
            'actual_class': class_name(actual),
            'predicted_value': scores[predicted].item(),
            'text': None,
        })
    return records
def class_name(index):
    """Translate a numeric class index into its display name.

    Args:
        index: class index; 0 is the negative class, 1 the hate class.

    Returns:
        "None" for 0, "Hate" for 1, and the value None for anything else.
    """
    if index == 0:
        return "None"
    return "Hate" if index == 1 else None

# One record per test sample: predicted class, actual class and the
# probability of the predicted class.
predictions_info = to_predictions(probabilities, y_test)

# Write through a context manager so the file handle is flushed and closed
# even if pickling fails; the original left the handle open.
with open("drive/MyDrive/Colab Notebooks/output/lr-semeval-semeval.p", "wb") as output_file:
    pickle.dump(predictions_info, output_file)