In [1]:
# Import relevant libraries
import pandas as pd
import numpy as np
import re
import nltk
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from nltk.util import ngrams
nltk.download('stopwords')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lizis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# loading data, previews
raw_text = pd.read_csv("raw_data_TDAVIDSON.csv")
raw_text.head()


Unnamed: 0.1,Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
# function for cleaning tweet
def clean_tweet(tweet):
    tweet = tweet.lower() #Lowercase
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)     # Remove URLs
    tweet = re.sub(r'\d+', '', tweet)  # Remove numbers
    tweet = re.sub(r'[^\w\s\']', '', tweet)  # Remove special characters except apostrophes
    tweet = re.sub(r'\s+', ' ', tweet).strip()     # Remove extra whitespace
    tweet = re.sub(r'rt', ' ', tweet).strip() # Remove "rt"
    tweet = re.sub(r'@[^\s]+', '', tweet)  # Remove user handles
    tweet_tokens = tweet.split()  # Split tweet into words
    tweet = ' '.join([word for word in tweet_tokens if word not in stopwords.words('english')])
    return tweet


In [4]:
#Apply cleaning function to function to tweets
cleaned_data = raw_text
cleaned_data["tweet"] = cleaned_data["tweet"].apply(clean_tweet)

In [5]:
#Create Vectorizer
only_text = cleaned_data["tweet"]
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(only_text)
tfidf = tfidf.toarray()
tfidf.shape
tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
#Bag of Words
word_bag = vectorizer.get_feature_names_out()

In [7]:
#visualize the TF-IDF Values
tfidf_df = pd.DataFrame(tfidf, columns=word_bag)
tfidf_df.head(10)
# We have 24783 tweets and the column have words that appear somewhere in these tweets.
# If a zero is present, implies the word is not in that tweet.

Unnamed: 0,__,___,____,_____,_______,________,__________,_____________,_________________________,________o_____,...,zulu,zvckslvtr,zwaaad,zwengersierra,zwhite,zwithr,zzachbarness,zzzentropy,zzzquil,zzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#Buld a Logistic Regression Model
cleaned_data['hate_speech_binary'] = (cleaned_data['hate_speech_count'] > 0).astype(int)
Y = cleaned_data['hate_speech_binary']
X = tfidf
x_train, x_test, y_train, y_test = train_test_split(X, Y)
log_model = LogisticRegression().fit(x_train, y_train)
log_model.predict_proba(x_test)

array([[0.89466958, 0.10533042],
       [0.39906218, 0.60093782],
       [0.37848761, 0.62151239],
       ...,
       [0.89199254, 0.10800746],
       [0.82780145, 0.17219855],
       [0.81931099, 0.18068901]])

In [10]:
#Highest Coefficients converted into Words
coef = log_model.coef_.reshape(-1)
key_word = np.argmax(coef)
word_bag[key_word]
idx = np.argsort(coef)[-10:]
word_bag[idx]

array(['retard', 'queer', 'niggers', 'white', 'faggots', 'fag', 'niggas',
       'nigga', 'nigger', 'faggot'], dtype=object)

In [11]:
# Model Score
log_model.score(x_test, y_test)

0.8253712072304713

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(
    cleaned_data,
    cleaned_data['hate_speech_binary'],
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=cleaned_data['hate_speech_binary']
)

# Random Forest Algorithm with TF-IDF Vectorizer
from sklearn.ensemble import RandomForestClassifier

#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),
     ('Random_Forest',RandomForestClassifier())
])

#2. fit with X_train and y_train
rf_model = clf.fit(x_train_rf, y_train_rf)


#3. get the predictions for X_test and store it in y_pred
y_pred_rf = clf.predict(x_test_rf)

ValueError: Found input variables with inconsistent numbers of samples: [8, 19826]