In [1]:
# Write an NLP algorithm that can classify if a tweet is hate speech or not
# using the twitter dataset provided in the labeled_data.csv file
# Use an SVC classifier

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

# Importing the dataset
dataset = pd.read_csv('labeled_data.csv', encoding = 'latin-1')

# Cleaning the texts
dataset['tweet'] = dataset['tweet'].apply(lambda x: x.lower())
dataset['tweet'] = dataset['tweet'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
dataset['tweet'] = dataset['tweet'].apply(lambda x: x.split())
ps = PorterStemmer()
dataset['tweet'] = dataset['tweet'].apply(lambda x: [ps.stem(word) for word in x if not word in set(stopwords.words('english'))])
dataset['tweet'] = dataset['tweet'].apply(lambda x: ' '.join(x))

# Creating the Bag of Words model
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(dataset['tweet']).toarray()
y = dataset['class']

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size = 0.2, random_state = 0)

print('TRAINING')
# Fitting SVC to the Training set
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

print('PREDICTING')

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

# Predicting a new result
new_tweet = 'I hate you'
new_tweet = new_tweet.lower()
new_tweet = re.sub('[^a-zA-Z]', ' ', new_tweet)
new_tweet = new_tweet.split()
new_tweet = [ps.stem(word) for word in new_tweet if not word in set(stopwords.words('english'))]
new_tweet = ' '.join(new_tweet)
new_tweet = cv.transform([new_tweet]).toarray()
new_pred = classifier.predict(new_tweet)
print(new_pred)

TRAINING
PREDICTING
[[  74  172   33]
 [  79 3675   98]
 [  27   78  721]]
0.9017550938067379
[2]


In [3]:
# save the model to a pkl file
import pickle

with open('sentiment_model.pkl', 'wb') as file:
    pickle.dump(classifier, file)