In [None]:
import pickle as pk
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas.plotting as pdplt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [None]:
# --- Reproducibility ---
# Seed handed to train_test_split so the train/test partition is repeatable.
SEED = 1928

# --- Input data ---
DATASET_PATH = "datasets/ruddit_with_text.csv"

# --- Preprocessing ---
# Stop-word list name passed to CountVectorizer.
STOPWORDS_LANGUAGE = "english"
# Quantile of the score distribution that is computed and drawn on the histogram.
THRESHOLD_QUANTILE = 0.5

# --- Output artifacts ---
SVM_STATS_PATH = "stats/svm_stats.png"
NB_STATS_PATH = "stats/nb_stats.png"
SVM_MODEL_PATH = "models/svm_model.pickle"
NB_MODEL_PATH = "models/nb_model.pickle"

# Data setup

In [None]:
# Columns of the loaded CSV that the notebook refers to.
COMMENT_COLUMN = "txt"  # comment text used as the model input
SCORE_COLUMN = "offensiveness_score"  # numeric score turned into a class label
UNUSED_COLUMNS = ["post_id", "comment_id", "url"]  # dropped right after loading

# String class labels assigned to each comment.
NOT_OFFENSIVE_LABEL = "not_offensive"
OFFENSIVE_LABEL = "offensive"

In [None]:
# Load the CSV into a DataFrame and preview its first rows.
dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH)
dataset.head(n=5)

In [None]:
# drops unused columns
# Reassigning (instead of dropping in place) together with errors="ignore"
# keeps this cell re-runnable: executing it a second time no longer raises
# KeyError because the columns are already gone.
dataset = dataset.drop(columns=UNUSED_COLUMNS, errors="ignore")

In [None]:
# drops deleted comments
# Rows whose text is exactly "[deleted]" or missing carry no usable comment.
# The filter is applied to the frame itself: calling an inplace method on
# `dataset[COMMENT_COLUMN]` acts on a temporary Series, which pandas warns
# about and which leaves `dataset` unchanged when Copy-on-Write is enabled.
is_deleted = dataset[COMMENT_COLUMN].eq("[deleted]")
dataset = dataset.loc[~is_deleted].dropna(subset=[COMMENT_COLUMN])
dataset.head()

In [None]:
# Score value sitting at the THRESHOLD_QUANTILE quantile of the distribution.
score_values = dataset[SCORE_COLUMN]
middle = np.quantile(score_values, THRESHOLD_QUANTILE)
print("middle:", middle)

In [None]:
# Histogram of the scores with the `middle` quantile value marked in black.
score_fig, score_ax = plt.subplots()
score_ax.hist(dataset[SCORE_COLUMN], bins="auto")
score_ax.axvline(middle, color="k")
# assigning to `_` keeps the returned Text object out of the cell output
_ = score_ax.set_title("Offensiveness Score Distribution")

# Training

In [None]:
from sklearn.preprocessing import binarize

# Features are the raw comment strings; targets are string class labels.
x = dataset[COMMENT_COLUMN]
# y = binarize(dataset[SCORE_COLUMN].to_numpy().reshape(-1, 1)).ravel()
# NOTE(review): the cut is fixed at 0 here, while the histogram cell marks the
# THRESHOLD_QUANTILE value (`middle`) - confirm which threshold is intended.
is_offensive = dataset[SCORE_COLUMN] > 0
y = is_offensive.map({True: OFFENSIVE_LABEL, False: NOT_OFFENSIVE_LABEL})
# 80/20 split, seeded so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=SEED,
)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.preprocessing import StandardScaler

# Token counts (stop words removed) feeding an SVC left at its default settings.
svm_vectorizer = CountVectorizer(stop_words=STOPWORDS_LANGUAGE)
svm_classifier = make_pipeline(svm_vectorizer, SVC())
svm_classifier.fit(x_train, y_train)

In [None]:
# Token counts (stop words removed) feeding a multinomial Naive Bayes model.
nb_vectorizer = CountVectorizer(stop_words=STOPWORDS_LANGUAGE)
nb_classifier = make_pipeline(nb_vectorizer, MultinomialNB())
nb_classifier.fit(x_train, y_train)

# Results

In [None]:
# Evaluates the Naive Bayes pipeline on the held-out split.
classes = nb_classifier.classes_
y_pred = nb_classifier.predict(x_test)
# per-class precision/recall/F1 as a table (one row per class or average)
nb_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
# `labels=classes` pins the matrix ordering to the axis names used below
# (rows: true label, columns: predicted label). The axes are set in the
# constructor because `set_axis(..., inplace=True)` is rejected by pandas >= 2.0.
nb_confusion = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=classes),
    index=classes,
    columns=classes,
)

In [None]:
# Evaluates the SVM pipeline on the held-out split.
svm_classes = svm_classifier.classes_
y_pred = svm_classifier.predict(x_test)
# per-class precision/recall/F1 as a table (one row per class or average)
svm_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
# `labels=svm_classes` pins the matrix ordering to the axis names used below
# (rows: true label, columns: predicted label). The axes are set in the
# constructor because `set_axis(..., inplace=True)` is rejected by pandas >= 2.0.
svm_confusion = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=svm_classes),
    index=svm_classes,
    columns=svm_classes,
)

In [None]:
# Renders the SVM classification report and confusion matrix as two stacked
# tables and saves the figure to SVM_STATS_PATH.
fig, (report_axis, confusion_axis) = plt.subplots(2, 1)
for table_axis in (report_axis, confusion_axis):
    # each axes only hosts a table, so the axis decorations are switched off
    table_axis.axis("tight")
    table_axis.axis("off")
pdplt.table(report_axis, svm_report, loc="center")
report_axis.set_title("Estatísticas do Modelo SVM")  # spelling fixed (was "Estastisticas")
pdplt.table(confusion_axis, svm_confusion, loc="center")
confusion_axis.set_title("Matriz de Confusão")
fig.tight_layout()
# create the output folder if needed so savefig does not fail on a fresh checkout
Path(SVM_STATS_PATH).parent.mkdir(parents=True, exist_ok=True)
fig.savefig(SVM_STATS_PATH, bbox_inches="tight", dpi=240)

In [None]:
# Renders the Naive Bayes classification report and confusion matrix as two
# stacked tables and saves the figure to NB_STATS_PATH.
fig, (report_axis, confusion_axis) = plt.subplots(2, 1)
for table_axis in (report_axis, confusion_axis):
    # each axes only hosts a table, so the axis decorations are switched off
    table_axis.axis("tight")
    table_axis.axis("off")
pdplt.table(report_axis, nb_report, loc="center")
report_axis.set_title("Estatísticas do Modelo NB")  # spelling fixed (was "Estastisticas")
pdplt.table(confusion_axis, nb_confusion, loc="center")
confusion_axis.set_title("Matriz de Confusão")
fig.tight_layout()
# create the output folder if needed so savefig does not fail on a fresh checkout
Path(NB_STATS_PATH).parent.mkdir(parents=True, exist_ok=True)
fig.savefig(NB_STATS_PATH, bbox_inches="tight", dpi=240)

In [None]:
# Scores one hand-written comment with the Naive Bayes pipeline.
comment = "welcome to narnia"
prediction = nb_classifier.predict_proba([comment])
# predict_proba columns follow `classes_`; look the offensive column up by label
# instead of relying on it happening to be at position 1.
offensive_index = list(nb_classifier.classes_).index(OFFENSIVE_LABEL)
print(f"offensiveness: {prediction[0][offensive_index]}")

# Saving the models

In [None]:
# Pickles both fitted pipelines to their configured paths.
# Each model is written in its own `with` block so that a failure on one file
# does not leave the other one truncated and empty.
for model_path, fitted_model in (
    (SVM_MODEL_PATH, svm_classifier),
    (NB_MODEL_PATH, nb_classifier),
):
    # create the target folder if needed so open() does not fail on a fresh checkout
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    with open(model_path, "wb") as model_file:
        pk.dump(fitted_model, model_file)
