In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import demoji
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from wordcloud import WordCloud
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
import xgboost as xgb
from xgboost import XGBClassifier

Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m987.1 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0


In [None]:
# Upload helper from the google.colab package; this cell needs that package to be importable.
from google.colab import files

# Result of the upload call. The next cell looks up 'cyberbullying_tweets.csv' in it
# and wraps the value in io.BytesIO, so it is indexed by file name.
uploaded = files.upload()


In [None]:
import io

# Wrap the uploaded CSV bytes in an in-memory, file-like buffer so pandas can parse them.
# (When the file is already on disk, pd.read_csv('cyberbullying_tweets.csv') is the alternative.)
csv_buffer = io.BytesIO(uploaded['cyberbullying_tweets.csv'])
data = pd.read_csv(csv_buffer)
data.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [None]:
# Number of tweets per label, most frequent first.
data['cyberbullying_type'].value_counts()

religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: cyberbullying_type, dtype: int64

In [None]:
# Count the tweets per class once and reuse the same Series for the bars and the hover text.
type_counts = data['cyberbullying_type'].value_counts()

fig = px.bar(
    type_counts,
    color_discrete_sequence=px.colors.qualitative.Pastel1,
    custom_data=[type_counts],
)
fig.update_traces(hovertemplate='<br><b>Total: </b>%{customdata[0]}')
fig.update_layout(
    title='Cyberbullying types',
    template='simple_white',
    hovermode='x unified',
    xaxis={'title': 'Type'},
    yaxis={'title': 'Count'},
    showlegend=False,
)
fig.show()

In [None]:
# Missing values per column (isna is the canonical spelling of isnull).
data.isna().sum()

tweet_text            0
cyberbullying_type    0
dtype: int64

In [None]:
# Resources used by clean_text: the NLTK stop-word list and the WordNet data
# that the lemmatizer reads.
import nltk
nltk.download('stopwords')
# WordNetLemmatizer loads the 'wordnet' corpus lazily on the first lemmatize() call;
# on a fresh runtime that call raises LookupError unless the corpus was downloaded.
nltk.download('wordnet')
# Some NLTK releases additionally need the Open Multilingual Wordnet data to load WordNet.
nltk.download('omw-1.4')

lemma = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))
# Extra tokens to discard on top of the NLTK list
# (presumably artefacts of the cleaning steps, e.g. 'wa' — TODO confirm).
STOPWORDS.update(['im', 'wa', 'p', 't', 's', 'o', 'e', 'like'])

# Compiled once at definition time instead of on every call.
# Handles and hashtags may contain underscores, and a leading "RT " is removed
# together with the mention it introduces (previously "RT @" swallowed the '@'
# and the handle itself stayed in the text).
_TWITTER_NOISE = re.compile(
    r"(#[A-Za-z0-9_]+|(?:RT\s+)?@[A-Za-z0-9_]+|https?://\S+|www\.\S+|\S+\.[a-z]+|RT @)"
)
_PUNCTUATION = re.compile(r"[%s]" % re.escape(string.punctuation))


def clean_text(text):
    """Normalise a raw tweet into a lowercase, space-separated token string.

    Steps, in order:
      1. remove hashtags, @mentions (including a leading "RT"), URLs and
         ``name.tld``-style tokens, then collapse runs of whitespace;
      2. lowercase the text;
      3. strip ASCII punctuation;
      4. lemmatize every token with the module-level ``lemma``;
      5. drop tokens contained in the module-level ``STOPWORDS``;
      6. replace every emoji reported by ``demoji.findall`` with its
         description, the words of the description joined by underscores.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    # Remove Hashtag, Mention, https, www.asdfd, dsfadsf.com
    text = _TWITTER_NOISE.sub('', text)
    text = " ".join(text.split())

    # Make all text lowercase
    text = text.lower()

    # Remove punctuation *before* lemmatizing: a token such as "cats," is
    # otherwise handed to the lemmatizer with the comma attached and comes
    # back unchanged.
    text = _PUNCTUATION.sub('', text)

    # Lemmatize word
    text = " ".join(lemma.lemmatize(word) for word in text.split())

    # Remove stopwords
    text = " ".join(word for word in text.split() if word not in STOPWORDS)

    # Convert emoji to word. Runs after punctuation stripping so the joining
    # underscores survive. Plain string replacement avoids treating the emoji
    # as a regex, and longest-first ordering keeps a multi-codepoint emoji from
    # being broken up by the replacement of its shorter base emoji.
    found = demoji.findall(text)
    for emot in sorted(found, key=len, reverse=True):
        text = text.replace(emot, "_".join(found[emot].split()))

    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Distinct label values, in order of first appearance.
data['cyberbullying_type'].unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [None]:
# Integer code for each class label; a label's code is its position in this list.
ENCODE_DICT = {
    label: code
    for code, label in enumerate([
        'not_cyberbullying',
        'gender',
        'religion',
        'other_cyberbullying',
        'age',
        'ethnicity',
    ])
}
# Swap the string labels for their integer codes in the label column.
data['cyberbullying_type'] = data['cyberbullying_type'].replace(ENCODE_DICT)
print(data['cyberbullying_type'].unique())
data.sample(10)

[0 1 2 3 4 5]


Unnamed: 0,tweet_text,cyberbullying_type
23492,@Ceff00 @JosephIsVegan @SumbelinaZ @IronmanL1 ...,2
30591,My late sister and I used to be troublemakers ...,3
16639,@jukes303 It's basically an empty excuse that ...,2
31174,Soo FAT need to loose some weight then i might...,3
15136,RT @victorymonk: @MGTOWKnight @highwiregirl oh...,1
43020,I said household you dumb nigger now get lost....,5
7426,@LifeInKhilafah Media lies? What media lies. ...,0
20881,"Wow, what a shame. A corporation decides to su...",2
16412,What illiterate idiots they are. I am astounde...,2
6715,There will probably be another round of dinner...,0


In [None]:
# Ten random tweets whose encoded label is 1 ('gender' in ENCODE_DICT).
data.loc[data['cyberbullying_type'] == 1, 'tweet_text'].sample(10)

12254    Question - since when is calling a female coll...
9666     Also - Kat is a completely rank cow but by God...
9337     RT @Ben_Creasey86: #MKR "everyone under estima...
11824    Yes they are NOT racist or sexist, “Gay donkey...
15325    #MileyCyrus Miley Cyrus Makes Gay Date Rape Jo...
13233    First thing I would do if #thePurge was real i...
8342     I remember when I used to say youre gay as an ...
14731    Ummmmmm sorry “bitches” isn’t pc we call them ...
14853    #MKR never met a promo girl that we wars a ful...
12112                        @BristolBen Not aimed at you.
Name: tweet_text, dtype: object

In [None]:
import nltk
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' resource for word_tokenize instead of
# 'punkt'; without it tokenisation raises LookupError. On older releases this
# download just reports that the package is unknown and carries on.
nltk.download('punkt_tab')

# TF-IDF features: tokens come from NLTK's word_tokenize; terms appearing in fewer
# than 0.05% or more than 80% of the tweets are dropped from the vocabulary.
# NOTE(review): this vectorises the raw tweet_text — clean_text is not applied in
# this cell; confirm that is intended.
# NOTE(review): the vectoriser is fitted on every row before the train/test split,
# so document frequencies include the future test tweets — confirm acceptable.
tfidf = TfidfVectorizer(tokenizer=word_tokenize, min_df=.0005, max_df=.8)
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array.
X = tfidf.fit_transform(data.tweet_text).toarray()
y = data.cyberbullying_type.values

print(X.shape, y.shape)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


(47692, 3293) (47692,)


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=555,
)

# Report the shape of each split together with its labels.
for _split, _features, _labels in (("train", X_train, y_train), ("test", X_test, y_test)):
    print(f"X {_split} data has shape {_features.shape} and their label's shape {_labels.shape}")

X train data has shape (38153, 3293) and their label's shape (38153,)
X test data has shape (9539, 3293) and their label's shape (9539,)


In [None]:
# Native XGBoost API: wrap the feature matrices and their labels in DMatrix objects.
# (enable_categorical was dropped: the inputs are plain numeric arrays, so there are
# no categorical columns for it to act on.)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# 'n_estimators' is a scikit-learn-wrapper argument that xgb.train ignores (it only
# produced an "are not used" warning); the number of boosting rounds is num_round.
# 'merror' is the built-in multiclass error rate, i.e. 1 - accuracy. It replaces the
# former custom metric, which reported the share of predictions >= 0.5 and never
# looked at the labels.
params = {
    'objective': 'multi:softmax',
    'max_depth': 4,
    'learning_rate': 0.1,
    'num_class': 6,
    'eval_metric': ['mlogloss', 'merror'],
}
watchlist = [(dtest, 'test'), (dtrain, 'train')]
num_round = 50

bst = xgb.train(params, dtrain, num_boost_round=num_round, evals=watchlist)

# Evaluate XGBoost model on test set.
# With multi:softmax, predict() returns the predicted class index for every row
# (as floats), so accuracy is the share of rows where it equals the true label.
ypred = bst.predict(dtest)
accuracy = accuracy_score(y_test, ypred.astype(int))
print(f'Test accuracy: {accuracy}')
enh = accuracy
print(enh)

Parameters: { "n_estimators" } are not used.

[0]	test-mlogloss:1.62712	test-accuracy:0.20247	train-mlogloss:1.62598	train-accuracy:0.20490
[1]	test-mlogloss:1.50154	test-accuracy:0.20640	train-mlogloss:1.50022	train-accuracy:0.20877
[2]	test-mlogloss:1.40053	test-accuracy:0.21019	train-mlogloss:1.39891	train-accuracy:0.21245
[3]	test-mlogloss:1.31606	test-accuracy:0.21253	train-mlogloss:1.31413	train-accuracy:0.21475
[4]	test-mlogloss:1.24393	test-accuracy:0.21552	train-mlogloss:1.24198	train-accuracy:0.21709
[5]	test-mlogloss:1.18149	test-accuracy:0.21582	train-mlogloss:1.17891	train-accuracy:0.21771
[6]	test-mlogloss:1.12678	test-accuracy:0.21388	train-mlogloss:1.12421	train-accuracy:0.21578
[7]	test-mlogloss:1.07830	test-accuracy:0.24038	train-mlogloss:1.07560	train-accuracy:0.24117
[8]	test-mlogloss:1.03537	test-accuracy:0.23886	train-mlogloss:1.03209	train-accuracy:0.24009
[9]	test-mlogloss:0.99590	test-accuracy:0.23706	train-mlogloss:0.99266	train-accuracy:0.23838
[10]	test-mlog