In [1]:
import re
from tqdm import tqdm
from collections import Counter

import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

### 1. Load the tweets file from pandas package

In [2]:
data = pd.read_csv('./Input/TwitterHate.csv')

In [3]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
data.shape

(31962, 3)

### 2. Get the tweets into a list for easy text cleanup and manipulation

In [5]:
tweets = list(data['tweet'])

In [6]:
tweets[4000]

"i'm still in a little bit of disbelief that in a couple of months i'm going to be staing my year abroad in the us  "

### 3. To cleanup

#### 3.1. Normalize casing

In [7]:
tweets = [tweet.lower() for tweet in tqdm(tweets)]

100%|██████████| 31962/31962 [00:00<00:00, 2982985.35it/s]


#### 3.2. Using regular expressions, remove user handles. These begin with '@’.

In [8]:
tweets = [re.sub(r'@user', '', tweet) for tweet in tqdm(tweets)]

100%|██████████| 31962/31962 [00:00<00:00, 1823873.42it/s]


In [9]:
tweets[0]

'  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

#### 3.3. Using regular expressions, remove URLs.

In [10]:
tweets = [re.sub(r'http\S+|www\S+', '', tweet) for tweet in tqdm(tweets)]

100%|██████████| 31962/31962 [00:00<00:00, 1044515.52it/s]


#### 3.4 Using TweetTokenizer from NLTK, tokenize the tweets into individual terms.

In [11]:
tokenizer = TweetTokenizer()

tokens = [tokenizer.tokenize(tweet) for tweet in tqdm(tweets)]

100%|██████████| 31962/31962 [00:01<00:00, 30916.20it/s]


In [12]:
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [[lemmatizer.lemmatize(word)for word in token] for token in tqdm(tokens)]

100%|██████████| 31962/31962 [00:01<00:00, 22974.49it/s]


#### 3.5  Remove stop words.

In [13]:
sw = stopwords.words('english')

In [14]:
tokens = [[word for word in t if word not in sw] for t in tqdm(lemmatized_tokens)] 


100%|██████████| 31962/31962 [00:00<00:00, 61300.10it/s]


#### 3.6 Remove redundant terms like ‘amp’, ‘rt’, etc.

In [15]:
matches = []
for i, token in enumerate(tokens):
    for word in token:
        if re.search(r'&', word) or re.search(r'\b(retweet)\b', word) or re.search(r'\b(rt)\b', word):
            matches.append(word)

top_matches = Counter(matches).most_common(10)

for match, count in top_matches:
    print(f'{match}: {count}')

&: 1774
#retweet: 85
retweet: 23


In [16]:
tokens[17]

['retweet', 'agree', '!']

In [17]:
tokens[22]

['product',
 'day',
 ':',
 'happy',
 'man',
 '#wine',
 'tool',
 "who's",
 '#weekend',
 '?',
 'time',
 'open',
 '&',
 'drink',
 '!']

In [18]:
# remove amp from tokens and print the index of that token
for token in (tokens):
    for i, word in enumerate(token):
        if re.search(r'&', word):
            token[i] = re.sub(r'&', '', word)
        if re.search(r'retweet', word):
            token[i] = re.sub(r'retweet', '', word)
  

In [19]:
tokens[22]

['product',
 'day',
 ':',
 'happy',
 'man',
 '#wine',
 'tool',
 "who's",
 '#weekend',
 '?',
 'time',
 'open',
 '',
 'drink',
 '!']

In [20]:
tokens[17]

['', 'agree', '!']

#### 3.7 Remove ‘#’ symbols from the tweet while retaining the term.

In [21]:
tokens = [[re.sub(r'#', '', word) for word in token] for token in tqdm(tokens)]

100%|██████████| 31962/31962 [00:00<00:00, 229982.39it/s]


### 4. Extra cleanup by removing terms with a length of 1.

In [22]:
tokens = [[word for word in token if len(word) > 1] for token in tqdm(tokens)]

100%|██████████| 31962/31962 [00:00<00:00, 1330089.04it/s]


### 5. Check out the top terms in the tweets

#### 5.1 First, get all the tokenized terms into one large list

In [23]:
def list_tokens(tokens):
  tokens_list = []
  for token in tokens:
      for word in token:
          tokens_list.append(word)
  return tokens_list

#### 5.2 Use the counter and find the 10 most common terms.

In [24]:
counter = Counter(list_tokens(tokens))
counter.most_common(10)

[('...', 2808),
 ('love', 2793),
 ('day', 2744),
 ('happy', 1684),
 ('time', 1237),
 ('life', 1205),
 ('like', 1073),
 ('today', 1024),
 ("i'm", 1018),
 ('new', 994)]

In [25]:
# removing '...' from tokens
for token in (tokens):
    for i, word in enumerate(token):
        if re.search(r'\.{3}', word):
            token[i] = re.sub(r'\.{3}', '', word)
        if re.search(r'\.\.', word):
            token[i] = re.sub(r'\.\.', '', word)
        if re.search(r'\.', word):
            token[i] = re.sub(r'\.', '', word)


In [26]:
counter = Counter(list_tokens(tokens))
counter.most_common(10)

[('', 3604),
 ('love', 2793),
 ('day', 2744),
 ('happy', 1690),
 ('time', 1237),
 ('life', 1205),
 ('like', 1073),
 ('today', 1024),
 ("i'm", 1018),
 ('new', 994)]

In [27]:
# removing the empty strings
tokens = [[word.strip() for word in token if word.strip()] for token in tqdm(tokens)]

100%|██████████| 31962/31962 [00:00<00:00, 1152902.45it/s]


In [28]:
counter = Counter(list_tokens(tokens))
counter.most_common(10)

[('love', 2793),
 ('day', 2744),
 ('happy', 1690),
 ('time', 1237),
 ('life', 1205),
 ('like', 1073),
 ('today', 1024),
 ("i'm", 1018),
 ('new', 994),
 ('get', 992)]

### 6. Data formatting for predictive modeling:

#### 6.1 Join the tokens back to form strings. This will be required for the vectorizers.

In [29]:
# Join the tokens back to form strings
tweets = [' '.join(token) for token in tqdm(tokens)]

100%|██████████| 31962/31962 [00:00<00:00, 2862353.89it/s]


#### 6.2 assign x and y.

In [30]:
# assign x and y
X = tweets
y = data['label']

#### 6.3 Perform train_test_split using sklearn

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=49)

### 7.  We’ll use TF-IDF values for the terms as a feature to get into a vector space model.

#### 7.1 Import TF-IDF  vectorizer from sklearn.

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### 7.2 Instantiate with a maximum of 5000 terms in your vocabulary.

In [33]:
vectorizer = TfidfVectorizer(max_features=5000)

#### 7.3 Fit and apply on the train set.

In [34]:
# FIT AND apply on the train set
X_train = vectorizer.fit_transform(X_train)

#### 7.4  Apply on the test set.

In [35]:
X_test = vectorizer.transform(X_test)

### 8. Model building: Ordinary Logistic Regression

#### 8.1 Instantiate Logistic Regression from sklearn with default paramters

In [36]:
from sklearn.linear_model import LogisticRegression

#### 8.2 Fit into the train data

In [37]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

#### 8.3 Make predictions for the train and test set.

In [38]:
train_pred = lr_model.predict(X_train)
test_pred = lr_model.predict(X_test)

### 9. Model evaluation: Accuracy, recall, and f_1 score.

In [39]:
from sklearn.metrics import accuracy_score, classification_report

#### 9.1 Report the accuracy on the train set.

In [40]:
print('Train accuracy:', accuracy_score(y_train, train_pred))

Train accuracy: 0.9551801009034377


#### 9.2 Report the recall on the train set: decent, high, or low.

In [41]:
print(classification_report(y_train, train_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     23735
           1       0.95      0.40      0.56      1834

    accuracy                           0.96     25569
   macro avg       0.95      0.70      0.77     25569
weighted avg       0.95      0.96      0.95     25569



The model has a high recall (i.e., it can identify most of the positive samples) for class 0, but a recall of 0.39 for class 1. This indicates that the model may not be performing well for the class 1 samples, and additional steps may be necessary to improve its performance on these samples. 

#### 9.3 Get the f1 score on the train set.

the F1-score for class 0 is 0.98, which is high and indicates good performance. This means that the model is good at correctly identifying class 0 instances, which is the majority class with a support of 23,783.

On the other hand, the F1-score for class 1 is 0.39, which is low and indicates poor performance. This means that the model does not perform well at identifying class 1 instances, which is the minority class with a support of 1,786.

Overall, the macro-average F1-score of 0.95 indicates good performance in terms of correctly identifying both classes. 


### 10. Class imbalance adjustment (model focussing on the 0s)

#### 10.1 Adjust the appropriate class in the LogisticRegression model

In [42]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.under_sampling import TomekLinks

def oversample(X_train, y_train):
    """
    Apply different oversampling techniques to X_train and y_train
    and return the resampled data.
    """
    
    oversamplers = [
        RandomOverSampler(),
        SMOTE(),
        ADASYN(),
        SMOTETomek(),
        SMOTEENN(),
        TomekLinks()
    ]
    X_resampled_list, y_resampled_list = [], []
    for oversampler in oversamplers:
        X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
        X_resampled_list.append(X_resampled)
        y_resampled_list.append(y_resampled)
    
    return X_resampled_list, y_resampled_list

#### Validating the oversampling data using cross validation

In [43]:
from sklearn.model_selection import cross_val_score

X_resampled_list, y_resampled_list = oversample(X_train, y_train)

for i in range(len(X_resampled_list)):
    X_resampled, y_resampled = X_resampled_list[i], y_resampled_list[i]

    lr = LogisticRegression(max_iter=500)

    # Cross-validation set
    scores = cross_val_score(lr, X_resampled, y_resampled, cv=5)
    print(f"Resampled dataset {i+1} mean CV score: {scores.mean()}")


Resampled dataset 1 mean CV score: 0.9543290499262692
Resampled dataset 2 mean CV score: 0.9367811249210029
Resampled dataset 3 mean CV score: 0.9099545117951973
Resampled dataset 4 mean CV score: 0.9375816305034759
Resampled dataset 5 mean CV score: 0.9616169353787697
Resampled dataset 6 mean CV score: 0.948282620478107


### 11. Train again with the adjustment and evaluate.

#### 11.1 Train the model on the train set.

In [44]:
# RandomOverSampler gives the best score and will continue with that
Random_oversampler = RandomOverSampler()
X_train_resampled, y_train_resampled = Random_oversampler.fit_resample(X_train, y_train)

In [45]:
lr_model.fit(X_train_resampled, y_train_resampled)
train_pred = lr_model.predict(X_train_resampled)

#### 11.2 Evaluate the predictions on the train set: accuracy, recall, and f_1 score.

In [46]:
# Evaluate the predictions on the train set: accuracy, recall, and f_1 score.
print('Train accuracy:', accuracy_score(y_train_resampled, train_pred))
print(classification_report(y_train_resampled, train_pred, zero_division=0))

Train accuracy: 0.966399831472509
              precision    recall  f1-score   support

           0       0.98      0.95      0.97     23735
           1       0.95      0.98      0.97     23735

    accuracy                           0.97     47470
   macro avg       0.97      0.97      0.97     47470
weighted avg       0.97      0.97      0.97     47470



The model's accuracy has slightly increased

The model's recall for class is 0.95 and for 1 has increased to 0.98. the model has improved significantly.

The f1 socre for class 0 is 0.97 and has risen to 0.97 for class 0

### 12. Regularization and Hyperparameter tuning

#### 12.1 Import GridSearch and StratifiedKFold because of class imbalance.

In [47]:
# Import GridSearch and StratifiedKFold because of class imbalance.
from sklearn.model_selection import GridSearchCV, StratifiedKFold

#### 12.2  Provide the parameter grid to choose for ‘C’ and ‘penalty’ parameters. 

In [48]:
# Provide the parameter grid to choose for ‘C’ and ‘penalty’ parameters.
param_grid = {'C': [1, 0.1, 0.01],
              'penalty': ['l1', 'l2'],
              'max_iter': [100, 200, 300, 400, 500],
              }

#### 12.3 Use a balanced class weight while instantiating the logistic regression.

In [49]:
lr = LogisticRegression(solver='liblinear', class_weight='balanced')

### 13. Find the parameters with the best recall in cross-validation.

#### 13.1 Choose 'recall' as the metric for scoring.

In [50]:
from sklearn.metrics import make_scorer, recall_score
scorer = make_scorer(recall_score, pos_label=1)

### 13.2 Choose a stratified 4 fold cross-validation sheme

In [51]:
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=49)
grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, cv=skf, scoring=scorer, n_jobs=-1, error_score='raise')

#### 13.3 Fit into the train set.

In [52]:
# fint into the train set
grid_search.fit(X_train_resampled, y_train_resampled)

### 14 What are the best parameters?

In [53]:
print(grid_search.best_params_)

{'C': 1, 'max_iter': 100, 'penalty': 'l1'}


### 15. Predict and evaluate using the best estimator.

#### 15.1 Use the best estimator from the grid search to make predictions on the test set.

In [54]:
# Use the best estimator from the grid search to make predictions on the test set.
print(grid_search.best_score_, grid_search.best_estimator_)
test_pred = grid_search.best_estimator_.predict(X_test)

0.9737939217452998 LogisticRegression(C=1, class_weight='balanced', penalty='l1',
                   solver='liblinear')


#### 15.2 What is the recall on the test set for the toxic comments?

In [55]:
# What is the recall on the test set for the toxic comments?
print(classification_report(y_test, test_pred))


              precision    recall  f1-score   support

           0       0.99      0.95      0.97      5985
           1       0.50      0.80      0.62       408

    accuracy                           0.94      6393
   macro avg       0.74      0.87      0.79      6393
weighted avg       0.95      0.94      0.94      6393



the recall is 0.81 meaning the model correctly identified 81% of class 1

#### 15.3 What is the f_1 score?

the f1 score is 0.62

## Conclusion

After trying hyperparam tuning, various oversampling techniques the scores that I currently have for the 'hate speech' label are the best I could get. 

I think that there needs to be more data for hate speech for the model to perform better