# Tweets Abuse Classification

### By: Soorya Parthiban

## Problem Statement: Build a machine learning model to classify the abuse tweets.

## Importing Library & Dataset

In [1]:
import numpy as np
import pandas as pd 

import warnings
warnings.filterwarnings("ignore")

import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
!ls '/content/drive/'

MyDrive


In [11]:
!ls '/content/drive/MyDrive/'

'Brain Dataset'				      twitter_data_test.xls
 Brain-Tumour-Dataset			      twitter_data_train.csv
'Cats&Dogs'				      Xception.h5
'Colab Notebooks'			      x_test.pkl
'Copy of it7510-iter2-xception-model.ipynb'   x_train.pkl
 InceptionResNetV2.h5			      y_test.pkl
 it7510-iter2-xception-model.ipynb	      y_train.pkl


In [12]:
twitter_df = pd.read_csv('/content/drive/MyDrive/twitter_data_train.csv')

In [13]:
twitter_df.head()

Unnamed: 0,tweet,label
0,start your day w your daily here,3
1,even a perfect life doesn’t feel perfect in so...,3
2,great ready for next week s q amp a with white...,3
3,5 ways to protect your mental health during th...,3
4,i m officially an occupational therapist passe...,3


In [14]:
twitter_df.tail()

Unnamed: 0,tweet,label
2345,discussion tonight tune in fb live,3
2346,as current events are showing change can be su...,3
2347,beyond grateful to join in an open and honest ...,3
2348,to some degree we are all feeling if you are f...,3
2349,‘you’re in charge’ sessions on amp 💚#teampatie...,3


In [15]:
twitter_df['label'].describe().T

count    2350.000000
mean        2.895319
std         0.381706
min         1.000000
25%         3.000000
50%         3.000000
75%         3.000000
max         3.000000
Name: label, dtype: float64

In [16]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2350 entries, 0 to 2349
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   2349 non-null   object
 1   label   2350 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 36.8+ KB


In [17]:
twitter_df.isnull().sum()

tweet    1
label    0
dtype: int64

In [18]:
twitter_df.dropna(inplace=True)

In [31]:
twitter_df.isnull().sum()

tweet    0
label    0
dtype: int64

In [32]:
twitter_df['label'].value_counts()

3    2164
2     124
1      61
Name: label, dtype: int64

## Data Pre-Processing

In [20]:
twitter_df['tweet'] = twitter_df['tweet'].str.lower()


def remove_special_characters(text, remove_digits=True):
  pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
  text = re.sub(pattern, '', str(text))
  return text

twitter_df['tweet'] = twitter_df['tweet'].apply(remove_special_characters, remove_digits=False)

def f(r):
  wnl = WordNetLemmatizer()
  words = nltk.word_tokenize(r)
  lemmatized_words = [wnl.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
  return " ".join(lemmatized_words)

twitter_df['tweet'] = twitter_df['tweet'].apply(f)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
data_X = cv.fit_transform(twitter_df['tweet']).toarray()
data_X = pd.DataFrame(data_X, columns=cv.get_feature_names())

In [24]:
X = data_X
y = twitter_df.label

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)

In [26]:
print(X_train.shape, X_val.shape)

(1644, 5999) (705, 5999)


## Building the ML Models

In [58]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

### 1. Decision Tree  

In [27]:
from sklearn.tree import DecisionTreeClassifier

In [28]:
model1 = DecisionTreeClassifier()

In [29]:
model1.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [33]:
y_preds_model1 = model1.predict(X_val)

In [59]:
print("Decision Tree model's accuracy: ", accuracy_score(y_val, y_preds_model1))

Decision Tree model's accuracy:  0.8680851063829788


In [60]:
print("Decision Tree model's F1 Score: ", f1_score(y_val, y_preds_model1, average='weighted')) 

Decision Tree model's F1 Score:  0.8541723784489001


In [36]:
print("Decision Tree model's Classification Report: \n", classification_report(y_val, y_preds_model1))

Decision Tree model's Classification Report: 
               precision    recall  f1-score   support

           1       0.10      0.06      0.07        17
           2       0.10      0.07      0.08        46
           3       0.91      0.95      0.93       642

    accuracy                           0.87       705
   macro avg       0.37      0.36      0.36       705
weighted avg       0.84      0.87      0.85       705



### 2. Random Forest  

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
model2 = RandomForestClassifier(n_estimators=3000)

In [39]:
model2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=3000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [40]:
y_preds_model2 = model2.predict(X_val)

In [61]:
print("Random Forest model's accuracy: ", accuracy_score(y_val, y_preds_model2))

Random Forest model's accuracy:  0.9106382978723404


In [62]:
print("Random Forest model's F1 Score: ", f1_score(y_val, y_preds_model2, average='weighted')) 

Random Forest model's F1 Score:  0.8706631396088013


In [42]:
print("Random Forest model's Classification Report: \n", classification_report(y_val, y_preds_model2))

Random Forest model's Classification Report: 
               precision    recall  f1-score   support

           1       1.00      0.06      0.11        17
           2       0.00      0.00      0.00        46
           3       0.91      1.00      0.95       642

    accuracy                           0.91       705
   macro avg       0.64      0.35      0.35       705
weighted avg       0.85      0.91      0.87       705



### 3. Extra Tree  

In [44]:
from sklearn.ensemble import ExtraTreesClassifier

In [45]:
model3 = ExtraTreesClassifier(n_estimators=3000)

In [46]:
model3.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=3000,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [47]:
y_preds_model3 = model3.predict(X_val)

In [63]:
print("Extra Tree model's accuracy: ", accuracy_score(y_val, y_preds_model3))

Extra Tree model's accuracy:  0.9063829787234042


In [64]:
print("Extra Tree model's F1 Score: ", f1_score(y_val, y_preds_model3, average='weighted')) 

Extra Tree model's F1 Score:  0.8685320828327455


In [49]:
print("Extra Tree model's Classification Report: \n", classification_report(y_val, y_preds_model3))

Extra Tree model's Classification Report: 
               precision    recall  f1-score   support

           1       1.00      0.06      0.11        17
           2       0.00      0.00      0.00        46
           3       0.91      0.99      0.95       642

    accuracy                           0.91       705
   macro avg       0.64      0.35      0.35       705
weighted avg       0.85      0.91      0.87       705



### 4. XGB Classifier  

In [50]:
from xgboost import XGBClassifier

In [53]:
model4 = XGBClassifier(n_estimators=500)

In [54]:
model4.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [55]:
y_preds_model4 = model4.predict(X_val)

In [65]:
print("XGBoost model's accuracy: ", accuracy_score(y_val, y_preds_model4))

XGBoost model's accuracy:  0.9078014184397163


In [66]:
print("XGBoost model's F1 Score: ", f1_score(y_val, y_preds_model4, average='weighted')) 

XGBoost model's F1 Score:  0.8740743105128185


In [57]:
print("XGBoost model's Classification Report: \n", classification_report(y_val, y_preds_model4))

XGBoost model's Classification Report: 
               precision    recall  f1-score   support

           1       0.40      0.12      0.18        17
           2       0.25      0.02      0.04        46
           3       0.92      0.99      0.95       642

    accuracy                           0.91       705
   macro avg       0.52      0.38      0.39       705
weighted avg       0.86      0.91      0.87       705



## Predicting the Output For Testing Dataset

In [69]:
test_data = pd.read_csv('/content/drive/MyDrive/twitter_data_test.csv')

In [70]:
test_data.head()

Unnamed: 0,tweet
0,thank you cadre we must remove any and all bar...
1,sweden is now operating a fleet of ambulances ...
2,how do working hours and job strain relate to ...
3,end panic you can have a future free of
4,our team understand that mental health workers...


In [71]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 784 entries, 0 to 783
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   784 non-null    object
dtypes: object(1)
memory usage: 6.2+ KB


In [72]:
test_data['tweet'] = test_data['tweet'].str.lower()


def remove_special_characters(text, remove_digits=True):
  pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
  text = re.sub(pattern, '', str(text))
  return text

test_data['tweet'] = test_data['tweet'].apply(remove_special_characters, remove_digits=False)

def f(r):
  wnl = WordNetLemmatizer()
  words = nltk.word_tokenize(r)
  lemmatized_words = [wnl.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
  return " ".join(lemmatized_words)

test_data['tweet'] = test_data['tweet'].apply(f)

In [73]:
test = cv.transform(test_data['tweet']).toarray()
test_data = pd.DataFrame(test, columns=cv.get_feature_names())

In [74]:
target = model2.predict(test_data)

In [75]:
# To create Dataframe of predicted value with particular respective index
res = pd.DataFrame(target) # target are nothing but the final predictions of your model on input features of your new unseen test data
res.columns = ["prediction"]

# To download the csv file locally
from google.colab import files
res.to_csv('submission.csv', index = False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>