# Machine Learning Model for Hate Speech Detection

Import relevant libraries

In [58]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [59]:
import nltk
import re 
from nltk.corpus import stopwords

# Module-level NLTK helpers: the English stop-word list held as a set, and an
# English Snowball stemmer.
# NOTE(review): clean_tweet builds its own stop-word set and uses a
# PorterStemmer, so nothing visible in this notebook reads these two names.
stopword = {*stopwords.words('english')}
stemmer = nltk.SnowballStemmer(language='english')

Read the data and preview the first few rows

In [60]:
# Read the labelled tweets from train.csv (resolved relative to the current
# working directory) and print the first five rows as a quick sanity check.
data = pd.read_csv(filepath_or_buffer='train.csv')
print(data.head(n=5))

   count  hate_speech_count  offensive_language_count  neither_count  class  \
0      3                  0                         0              3      2   
1      3                  0                         3              0      1   
2      3                  0                         3              0      1   
3      3                  0                         2              1      1   
4      6                  0                         6              0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


Map the numeric class codes to readable labels and keep only the tweet and label columns

In [61]:
# Human-readable names for the numeric codes stored in the `class` column.
# Any code missing from this mapping becomes NaN in `labels`.
CLASS_LABELS = {0: 'Hate Speech', 1: 'Offensive Speech', 2: 'No Hate No Offense'}

# Build the two-column modelling frame in a single expression:
#   1. add `labels` by translating `class` through CLASS_LABELS,
#   2. keep only the text and its label,
#   3. take an explicit copy so `data` is an independent frame. A later cell
#      assigns into data['tweet']; without the copy, pandas versions that do
#      not use copy-on-write emit SettingWithCopyWarning there.
data = (
    data
    .assign(labels=data['class'].map(CLASS_LABELS))
    [['tweet', 'labels']]
    .copy()
)

In [62]:
# Show the first five rows to confirm only `tweet` and `labels` remain.
data.head(n=5)

Unnamed: 0,tweet,labels
0,!!! RT @mayasolovely: As a woman you shouldn't...,No Hate No Offense
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Speech
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive Speech
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive Speech
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive Speech


In [63]:
# Show the last five rows of the same frame.
data.tail(n=5)

Unnamed: 0,tweet,labels
24778,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,Offensive Speech
24779,"you've gone and broke the wrong heart baby, an...",No Hate No Offense
24780,young buck wanna eat!!.. dat nigguh like I ain...,Offensive Speech
24781,youu got wild bitches tellin you lies,Offensive Speech
24782,~~Ruffled | Ntac Eileen Dahlia - Beautiful col...,No Hate No Offense


Preprocessing

In [64]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _tweet_cleaning_resources():
    """Build the stop-word set and the stemmer once, then reuse them.

    clean_tweet is applied to every row of the dataset; loading the stop-word
    corpus and constructing a stemmer on each call is loop-invariant work, so
    it is done here a single time and cached.

    Returns:
        tuple: ``(stop_words, stemmer)`` where ``stop_words`` is a frozenset of
        NLTK's English stop words and ``stemmer`` is a ``PorterStemmer``.
    """
    # Function-scope import, as in the original cell, so the notebook's import
    # cells do not need to change.
    from nltk.stem import PorterStemmer

    return frozenset(stopwords.words('english')), PorterStemmer()


def clean_tweet(text):
    """Normalise one tweet for bag-of-words vectorisation.

    Steps, in order: lower-case the text, delete every character that is
    neither a word character nor whitespace, split on whitespace, drop English
    stop words, Porter-stem the remaining tokens and join them with single
    spaces.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            NaN pandas uses for a blank CSV field) is treated as an empty
            tweet instead of raising ``AttributeError``.

    Returns:
        str: The cleaned tweet; ``''`` when nothing is left after cleaning.

    NOTE(review): punctuation is deleted, not replaced by a space, so
    "cold...tyga" becomes "coldtyga" and "&#8220;@user" becomes "8220user".
    Punctuation is also removed before the stop-word filter, so "shouldn't"
    survives as "shouldnt" (presumably because the stop-word list stores the
    apostrophised form). The tokenisation is left unchanged here because
    altering it changes the model's features.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)

    tokens = text.split()
    if not tokens:
        # Nothing to filter or stem; skip loading the NLTK resources.
        return ''

    stop_words, stemmer = _tweet_cleaning_resources()
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)




In [65]:

# Replace every raw tweet with its cleaned form (clean_tweet applied
# element-wise), then print the first five rows to check the result.
data['tweet'] = data['tweet'].map(clean_tweet)
print(data.head(n=5))

                                               tweet              labels
0  rt mayasolov woman shouldnt complain clean hou...  No Hate No Offense
1  rt mleew17 boy dat coldtyga dwn bad cuffin dat...    Offensive Speech
2  rt urkindofbrand dawg rt 80sbaby4lif ever fuck...    Offensive Speech
3          rt c_g_anderson viva_bas look like tranni    Offensive Speech
4  rt shenikarobert shit hear might true might fa...    Offensive Speech


In [66]:
# Show the last five rows now that the `tweet` column holds cleaned text.
data.tail(n=5)

Unnamed: 0,tweet,labels
24778,you muthafin lie 8220lifeask 20_pearl corey_em...,Offensive Speech
24779,youv gone broke wrong heart babi drove redneck...,No Hate No Offense
24780,young buck wanna eat dat nigguh like aint fuck...,Offensive Speech
24781,youu got wild bitch tellin lie,Offensive Speech
24782,ruffl ntac eileen dahlia beauti color combin p...,No Hate No Offense


Create a numpy array from the tweets and labels

In [67]:
# Pull the cleaned tweets (model input) and their labels (target) out of the
# frame as two separate NumPy arrays.
x, y = (np.array(data[column]) for column in ('tweet', 'labels'))

Tokenize: convert the tweets into a matrix of token counts

In [68]:
# Bag-of-words vectorizer with default settings; it is fitted in the next cell
# and reused later to encode new text.
cv = CountVectorizer()

Fit it to the text data and transform the tweets into a matrix of token counts

In [69]:
# Learn the vocabulary from every cleaned tweet and rebind `x` to the
# resulting document-term count matrix.
# NOTE(review): this runs before the train/test split, so the vocabulary also
# contains words that occur only in rows that later land in the test set.
x =cv.fit_transform(x)

Split the dataset

In [70]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

Initialize the algorithm

In [71]:
# Decision tree with default hyper-parameters, except for a pinned seed.
# scikit-learn permutes the candidate features at each split, so without a
# fixed random_state two runs can grow different trees and report a different
# accuracy. 42 matches the seed already used for train_test_split.
model = DecisionTreeClassifier(random_state=42)

Fit the model on the training set

In [72]:
# Train the tree on the training portion of the split.
model.fit(x_train, y_train)

Predict on the test data

In [73]:
# Predict a label for every held-out row.
y_pred = model.predict(x_test)

Check the accuracy of the model

In [74]:
# Fraction of held-out tweets whose predicted label equals the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.8752903777967966


Try predicting the label of a new piece of text

In [75]:
# Classify an ad-hoc piece of text.
sample_text = 'I love you'

# Run the sample through clean_tweet first: the vectorizer's vocabulary was
# learned from cleaned tweets (lower-cased, stop words removed, stemmed), so
# raw text would not be tokenised the same way as the training data.
# `i` is kept as the name of the dense feature row so that anything else that
# reads it still finds the same kind of object as before.
i = cv.transform([clean_tweet(sample_text)]).toarray()
print(model.predict(i))

['No Hate No Offense']
