In [1]:
import pandas as pd

raw_tweets = pd.read_csv('TwitterHate.csv')

In [2]:
raw_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


#### Cleaning the text by lower casing all words and removing characters like #@!.? and URL elements:


In [3]:
import re

def  clean_text(df, text):
    df[text] = df[text].str.lower()
    df[text] = df[text].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    return df

tweets_clean = clean_text(raw_tweets, 'tweet')

In [4]:
tweets_clean.head()

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so sel...
1,2,0,thanks for lyft credit i cant use cause they...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in u...
4,5,0,factsguide society now motivation


In [5]:
tweet_list = tweets_clean.tweet.values
print(tweet_list[0])

  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction   run


#### Removing stop words and redundant terms:

In [6]:
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

token = TweetTokenizer()
tweets_clean_tkn = [token.tokenize(elem) for elem in tweet_list]
print(tweets_clean_tkn[0])

['when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', 'run']


In [7]:
stops = stopwords.words('english')
stop_redundant = ['rt','amp']
total_stops = stops + stop_redundant

In [8]:
def remove_stops(sent):
    return [re.sub('#','',term) for term in sent if ((term not in total_stops) & (len(term)>1))]

tweets_clean_final = [remove_stops(tweet) for tweet in tweets_clean_tkn]
print(tweets_clean_final[0])

['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']


#### Top 10 Terms in tweets:

In [9]:
from collections import Counter

words = []
for tweet in tweets_clean_final:
    words.extend(tweet)
top = Counter(words)
top.most_common(10)

[('love', 2728),
 ('day', 2277),
 ('happy', 1691),
 ('im', 1157),
 ('life', 1127),
 ('time', 1126),
 ('like', 1105),
 ('today', 1007),
 ('new', 987),
 ('positive', 934)]

#### Reformatting:

In [10]:
tweets_clean_final = [''.join(tweet) for tweet in tweets_clean_final]

In [11]:
#define input:
x = tweets_clean_final
#define output:
y = tweets_clean.label.values

In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.40) #unspecified test size so used 40/60

#### TF-IDF:

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(max_features = 5000)

x_train_bow = vect.fit_transform(x_train)
x_test_bow = vect.transform(x_test)

#### Model Building:

using ordinary logistic regression*

In [14]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(x_train_bow, y_train)

y_train_pred = logreg.predict(x_train_bow)
y_test_pred = logreg.predict(x_test_bow)

In [15]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_train, y_train_pred)
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.97     17817
           1       1.00      0.08      0.15      1360

    accuracy                           0.93     19177
   macro avg       0.97      0.54      0.56     19177
weighted avg       0.94      0.93      0.91     19177



#### Looking at class imbalance:

In [16]:
tweets_clean.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [17]:
#recall 0 = non-hate and 1 = hate
percent_hate = 2242/(29720+2242)
print(f'Percent hate speech in this data is: ', percent_hate)

Percent hate speech in this data is:  0.07014579813528565


In [18]:
logreg = LogisticRegression(class_weight="balanced")
logreg.fit(x_train_bow, y_train)
y_train_pred = logreg.predict(x_train_bow)
y_test_pred = logreg.predict(x_test_bow)

accuracy_score(y_train, y_train_pred)
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98     17817
           1       1.00      0.38      0.55      1360

    accuracy                           0.96     19177
   macro avg       0.98      0.69      0.76     19177
weighted avg       0.96      0.96      0.95     19177



somewhat improved - could try upsampling

#### Regularization, hyperparameter tuning, and cross-val:

In [19]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

param_grid = {'C': [0.01,0.1,1,10,100], 'penalty': ["l1","l2"]}

logreg = LogisticRegression(class_weight="balanced")
grid_search = GridSearchCV(estimator = logreg, param_grid = param_grid, 
                          cv = StratifiedKFold(4), n_jobs = -1, verbose = 1, scoring = "recall" )
grid_search.fit(x_train_bow, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


        nan 0.16764706        nan 0.16764706]


GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
             estimator=LogisticRegression(class_weight='balanced'), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
             scoring='recall', verbose=1)

In [20]:
grid_search.best_estimator_

LogisticRegression(C=0.01, class_weight='balanced')

#### Using the best estimator:

In [21]:
y_test_pred = grid_search.best_estimator_.predict(x_test_bow)
y_train_pred = grid_search.best_estimator_.predict(x_train_bow)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     11903
           1       1.00      0.14      0.24       882

    accuracy                           0.94     12785
   macro avg       0.97      0.57      0.61     12785
weighted avg       0.94      0.94      0.92     12785



#### Notes:
F1 score has halfed, despire precision being high and accuracy being 0.94

I would retry the train / test split or remove data for cross validation.

We could manually fix the class imbalance with up or down sampling.