In [1]:
import re
import joblib
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Loading the Dataset

# Keep only annotations with no context (num_contexts == 0) whose label is one
# of the two classes of interest. `.copy()` detaches the filtered frame so the
# column assignment below writes to an independent DataFrame rather than a
# possible view (avoids SettingWithCopyWarning).
df = pd.read_csv('annotations_metadata.csv')
df = df[(df.num_contexts == 0) & df.label.isin(['hate', 'noHate'])].copy()


def _read_post(file_id):
    """Return the full contents of the text file belonging to ``file_id``."""
    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
    # An explicit encoding keeps the load independent of the platform default.
    with open('all_files/' + file_id + '.txt', encoding='utf-8') as fh:
        return fh.read()


# Build the whole column in one pass instead of assigning cell by cell with
# positional `iloc`, which also depended on `text` being the last column.
df['text'] = [_read_post(file_id) for file_id in df.file_id]

df = df[['text', 'label']].reset_index(drop=True)

In [3]:
# Preprocessing and train/test split

def clean_text(text):
    """Remove digit-bearing tokens and collapse non-word characters.

    Args:
        text: The raw string to normalise.

    Returns:
        The input with every run of word characters containing at least one
        digit deleted, and every run of non-word characters replaced by a
        single space. Leading/trailing spaces are not stripped.
    """
    # Raw string literals: `\w`, `\d` and `\W` are not valid Python string
    # escapes, so non-raw literals trigger invalid-escape warnings.
    text = re.sub(r'\w*\d+\w*', '', text)
    text = re.sub(r'\W+', ' ', text)
    return text

# Clean every post and encode the labels as integers: 'hate' -> 1, anything else -> 0.
X = df['text'].map(clean_text)
y = df['label'].map(lambda label: int(label == 'hate'))

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [4]:
# Training

# Two-stage pipeline: TF-IDF vectorizer (max_features=1000) followed by an
# RBF-kernel support vector classifier.
tfidf_step = ('vectorizer', TfidfVectorizer(max_features=1000))
svm_step = ('SVM', SVC(C=8000, kernel='rbf'))
clf = Pipeline(steps=[tfidf_step, svm_step])

clf.fit(X_train, y_train)

# Persist the fitted pipeline to disk. Kept as the cell's last expression so
# the notebook echoes the list of written file names.
joblib.dump(clf, 'svm_model')

['svm_model']

In [5]:
# Testing

# Run the pipeline over the test set once and reuse the predictions for every
# metric, instead of predicting once inside `clf.score` and twice more below.
y_pred = clf.predict(X_test)

# Fraction of correctly predicted labels, i.e. the accuracy that `clf.score`
# reports for a classifier.
accuracy = (y_test == y_pred).mean()

print(f'Test Accuracy: {accuracy * 100}%')
print(f'Test F1 Score: {f1_score(y_test, y_pred) * 100}%')
print(confusion_matrix(y_test, y_pred))

Test Accuracy: 90.3242408646423%
Test F1 Score: 27.131782945736433%
[[1720   30]
 [ 158   35]]


In [6]:
# Classify one hand-picked sentence with the pipeline saved to disk.
sample_text = "I wonder why the Jewish Media always pretends that all the blood thirsty third world savages are victims and it is all Whitey 's fault ?"

# Reload the persisted pipeline and apply the same cleaning used on the training text.
clf = joblib.load('svm_model')
cleaned_sample_text = clean_text(sample_text)

# `predict` takes a sequence of documents; take the single returned label.
predicted_label = clf.predict([cleaned_sample_text])[0]
print('Hate' if predicted_label == 1 else 'No Hate')

Hate
