Hate Speech Detection Model

In [27]:
#Import data
import pandas as pd
import numpy as np

# Load the training set and report its shape and row count.
# The original wrote `"Training Set:" % train.columns`; the string has no
# format placeholder, so the `%` silently discarded the columns. The label
# is now passed as a plain argument, which prints exactly the same text.
train = pd.read_csv('train.csv')
print("Training Set:", train.shape, len(train))
print(train)

# Load the test set and report it the same way.
test = pd.read_csv('test.csv')
print("Testing Set:", test.shape, len(test))
print(test)

Training Set: (31962, 3) 31962
          id  label                                              tweet
0          1      0   @user when a father is dysfunctional and is s...
1          2      0  @user @user thanks for #lyft credit i can't us...
2          3      0                                bihday your majesty
3          4      0  #model   i love u take with u all the time in ...
4          5      0             factsguide: society now    #motivation
...      ...    ...                                                ...
31957  31958      0  ate @user isz that youuu?ðððððð...
31958  31959      0    to see nina turner on the airwaves trying to...
31959  31960      0  listening to sad songs on a monday morning otw...
31960  31961      1  @user #sikh #temple vandalised in in #calgary,...
31961  31962      0                   thank you @user for you follow  

[31962 rows x 3 columns]
Testing Set: (17197, 2) 17197
          id                                              twe

In [25]:
#Data cleaning
import re


#Check for duplicate and empty data
# `.sum` has to be called to yield a count, and because these statements are
# not the last expression in the cell their values must be printed explicitly
# or the notebook shows nothing.
print("Duplicate rows:", train.duplicated().sum())
print("Missing values per column:")
print(train.isnull().sum())

#Text Data Handling
def clean_data(data, text_field):
    """Lower-case the text column of ``data`` and strip noise from it.

    The removal pattern deletes, in order of precedence at each position:
    ``@`` followed by ASCII letters/digits, any character that is not an
    ASCII letter, digit, space or tab, ``scheme://...`` URLs, a leading
    ``rt``, and ``http`` plus the one character that follows it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    text_field : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``text_field`` replaced by the
        cleaned text. Missing values are left as missing.
    """
    # Compiled so pandas always applies Python's `re` semantics, whatever
    # string backend the column uses.
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")

    # Read from `data`, not the global `train`: the original overwrote any
    # other frame (e.g. the test set) with index-aligned training tweets.
    lowered = data[text_field].str.lower()

    # `.str.replace` skips NaN entries, whereas `re.sub` inside `.apply`
    # raised TypeError on them.
    data[text_field] = lowered.str.replace(noise, "", regex=True)
    return data

# Run the cleaner over the "tweet" column of the training and test frames.
training_data, testing_data = (
    clean_data(frame, "tweet") for frame in (train, test)
)


In [None]:
#Handling data imbalance --> Replicating the number of hate comments using upsampling method
from sklearn.utils import resample
# Split the data by label; label 1 is the minority class and is the one that gets upsampled
# Separate the rows of each class.
train_label_1 = training_data.loc[training_data["label"] == 1]
train_label_0 = training_data.loc[training_data["label"] == 0]

# Draw label-1 rows with replacement until there are as many of them as
# label-0 rows; the fixed seed keeps the draw repeatable.
resampled_label_1 = resample(
    train_label_1,
    replace=True,
    n_samples=train_label_0.shape[0],
    random_state=123,
)

# Stack the resampled label-1 rows on top of the untouched label-0 rows.
training_data_upsampled = pd.concat([resampled_label_1, train_label_0])


In [38]:
#Creating a data pipeline: train the model and save it so predictions can later be made on data from the user

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
import pickle


x = np.array(training_data_upsampled["tweet"])
y = np.array(training_data_upsampled["label"])

# Split the raw text BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training part only. The original fitted the
# vectorizer on every row, leaking held-out tweets into the features.
# NOTE(review): the rows were upsampled before this split, so copies of the
# same label-1 tweet can land on both sides and inflate the score below.
# Fixing that means splitting first and upsampling only the training part.
x_train_text, x_test_text, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

cv = TfidfVectorizer()
X_train = cv.fit_transform(x_train_text)  # learn vocabulary + IDF on train only
X_test = cv.transform(x_test_text)        # reuse the fitted vocabulary

# Seeded so repeated runs build the same tree, matching the seeded split.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Printed explicitly: a bare `model.score(...)` in the middle of a cell is
# evaluated and then thrown away.
print("Held-out accuracy:", model.score(X_test, y_test))

# Persist the fitted model and the matching vectorizer for later inference.
# Only unpickle these files from a trusted location; pickle can run code.
with open('decision_tree_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('vectorizer','wb') as f:
    pickle.dump(cv, f)