# Importing Libraries

In [1]:
pip install plotly==5.14.1

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install emoji

Note: you may need to restart the kernel to use updated packages.


In [3]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import emoji
import string
import nltk
from PIL import Image
from collections import Counter
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
import pickle
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\86177\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Getting data

In [4]:
data = pd.read_csv('cyberbullying_tweets.csv')

## Renaming columns

In [5]:
data = data.rename(columns={'tweet_text': 'text', 'cyberbullying_type': 'sentiment'})

### Adding Encoded column for sentiments

In [6]:
data["sentiment_encoded"] = data['sentiment'].replace({"religion": 1, "age": 2, "ethnicity": 3, "gender": 4, "other_cyberbullying": 5,"not_cyberbullying": 6})

In [7]:
stop_words = set(stopwords.words('english'))

# Preprocessing of Text

## Function to Remove Emojis

In [8]:
def strip_emoji(text):
    return emoji.replace_emoji(text,replace="")

## Fucntion to Convert text to lowercase, remove (/r, /n  characters), URLs, non-utf characters, Numbers, punctuations,stopwords

In [9]:
def strip_all_entities(text): 
    text = text.replace('\r', '').replace('\n', ' ').lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = re.sub(r'[^\x00-\x7f]',r'', text)
    text = re.sub(r'(.)1+', r'1', text)
    text = re.sub('[0-9]+', '', text)
    stopchars= string.punctuation
    table = str.maketrans('', '', stopchars)
    text = text.translate(table)
    text = [word for word in text.split() if word not in stop_words]
    text = ' '.join(text)
    return text

## Function to remove contractions

In [10]:
def decontract(text):
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

## Function to Clean Hashtags

In [11]:
def clean_hashtags(tweet):
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet))
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet))
    return new_tweet2

## Function to Filter Special Characters such as $, &

In [12]:
def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

## Function to remove mutiple sequence spaces

In [13]:
def remove_mult_spaces(text):
    return re.sub("\s\s+" , " ", text)


## Function to apply stemming to words

In [14]:
def stemmer(text):
    tokenized = nltk.word_tokenize(text)
    ps = PorterStemmer()
    return ' '.join([ps.stem(words) for words in tokenized])

## Function to apply lemmatization to words

In [15]:
def lemmatize(text):
    tokenized = nltk.word_tokenize(text)
    lm = WordNetLemmatizer()
    return ' '.join([lm.lemmatize(words) for words in tokenized])

## Function to Preprocess the text by applying all above functions

In [16]:
def preprocess(text):
    text = strip_emoji(text)
    text = decontract(text)
    text = strip_all_entities(text)
    text = clean_hashtags(text)
    text = filter_chars(text)
    text = remove_mult_spaces(text)
    text = stemmer(text)
    text = lemmatize(text)
    return text

In [43]:
def preprocess(df1):
    df1['cleaned_text'] = df1['text'].apply(strip_emoji)
    df1['cleaned_text'] = df1['text'].apply(decontract)
    df1['cleaned_text'] = df1['text'].apply(strip_all_entities)
    df1['cleaned_text'] = df1['text'].apply(clean_hashtags)
    df1['cleaned_text'] = df1['text'].apply(filter_chars)
    df1['cleaned_text'] = df1['text'].apply(remove_mult_spaces)
    df1['cleaned_text'] = df1['text'].apply(stemmer)
    df1['cleaned_text'] = df1['text'].apply(lemmatize)
    return df1['cleaned_text']

In [17]:
userinput = "pussy pussy #$"

In [44]:
data11 = preprocess(data)
data11

0        In other word # katandandre , your food wa cra...
1        Why is # aussietv so white ? # MKR # theblock ...
2        @ XochitlSuckkks a classy whore ? Or more red ...
3        @ Jason_Gio meh . : P thanks for the head up ,...
4        @ RudhoeEnglish This is an ISIS account preten...
                               ...                        
47687    Black ppl are n't expected to do anything , de...
47688    Turner did not withhold his disappointment . T...
47689    I swear to God . This dumb nigger bitch . I ha...
47690    Yea fuck you RT @ therealexel : IF YOURE A NIG...
47691    Bro . U got ta chill RT @ CHILLShrammy : Dog F...
Name: cleaned_text, Length: 44650, dtype: object

In [19]:
data['cleaned_text'] = data['text'].apply(preprocess)
data.head()

Unnamed: 0,text,sentiment,sentiment_encoded,cleaned_text
0,"In other words #katandandre, your food was cra...",not_cyberbullying,6,word katandandr food crapilici mkr
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,6,aussietv white mkr theblock imacelebrityau tod...
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,6,classi whore red velvet cupcak
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,6,meh p thank head concern anoth angri dude twitter
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,6,isi account pretend kurdish account like islam...


## Dealing with Duplicates

In [20]:
data.drop_duplicates("cleaned_text", inplace=True)

# Tokenization

In [21]:
data['tweet_list'] = data['cleaned_text'].apply(word_tokenize)
data.head()

Unnamed: 0,text,sentiment,sentiment_encoded,cleaned_text,tweet_list
0,"In other words #katandandre, your food was cra...",not_cyberbullying,6,word katandandr food crapilici mkr,"[word, katandandr, food, crapilici, mkr]"
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,6,aussietv white mkr theblock imacelebrityau tod...,"[aussietv, white, mkr, theblock, imacelebritya..."
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,6,classi whore red velvet cupcak,"[classi, whore, red, velvet, cupcak]"
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,6,meh p thank head concern anoth angri dude twitter,"[meh, p, thank, head, concern, anoth, angri, d..."
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,6,isi account pretend kurdish account like islam...,"[isi, account, pretend, kurdish, account, like..."


# Checking length of various tweet texts

In [22]:
text_len = []
for text in data.tweet_list:
    tweet_len = len(text)
    text_len.append(tweet_len)
data['text_len'] = text_len

# Removing text without words

In [23]:
data = data[data['text_len']!=0]

In [24]:
data.shape

(44650, 6)

In [25]:
data.head()

Unnamed: 0,text,sentiment,sentiment_encoded,cleaned_text,tweet_list,text_len
0,"In other words #katandandre, your food was cra...",not_cyberbullying,6,word katandandr food crapilici mkr,"[word, katandandr, food, crapilici, mkr]",5
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,6,aussietv white mkr theblock imacelebrityau tod...,"[aussietv, white, mkr, theblock, imacelebritya...",11
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,6,classi whore red velvet cupcak,"[classi, whore, red, velvet, cupcak]",5
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,6,meh p thank head concern anoth angri dude twitter,"[meh, p, thank, head, concern, anoth, angri, d...",9
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,6,isi account pretend kurdish account like islam...,"[isi, account, pretend, kurdish, account, like...",8


In [26]:
sentiments = ["religion", "age", "ethnicity", "gender", "other_cyberbullying","not_cyberbullying"]

# Splitting Data into Train and Test Sets

In [27]:
X,Y = data['cleaned_text'],data['sentiment_encoded']

In [28]:
X

0                       word katandandr food crapilici mkr
1        aussietv white mkr theblock imacelebrityau tod...
2                           classi whore red velvet cupcak
3        meh p thank head concern anoth angri dude twitter
4        isi account pretend kurdish account like islam...
                               ...                        
47687    black ppl expect anyth depend anyth yet free p...
47688    turner withhold disappoint turner call court a...
47689    swear god dumb nigger bitch got bleach hair re...
47690    yea fuck rt your nigger fuck unfollow fuck dum...
47691    bro u got ta chill rt dog fuck kp dumb nigger ...
Name: cleaned_text, Length: 44650, dtype: object

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, stratify =Y, random_state = 42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(31255,) (31255,) (13395,) (13395,)


In [30]:
X_test.head()

41596    realli whoever made pictur honestli go die dum...
6263                                well support lost mine
17071    look like daeshbag complet broken flee koban area
21191                   vote prevent muslim genocid israel
38088    exampl sunset shimmer villain mlp equestria gi...
Name: cleaned_text, dtype: object

## tf-idf Vectorization

In [31]:
from sklearn2pmml import PMMLPipeline, sklearn2pmml

In [33]:
XX = data[['cleaned_text']]

XX.columns

Index(['cleaned_text'], dtype='object')

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn2pmml.feature_extraction.text import Splitter
vectorizer = TfidfVectorizer(analyzer = "word", token_pattern = None, tokenizer = Splitter())

In [130]:
# pip install --upgrade nyoka

In [36]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.svm import LinearSVC
from nyoka import skl_to_pmml

In [139]:
X_train

780                           uu le hacemo bull laurita uu
17323    u r spread venom hindu first check ur islam te...
39189    girl use bulli middl school h show dream last ...
19066    christian woman gay friend support sexual oppr...
44698    personnel broadway sacramento california queer...
                               ...                        
42704    fuck nigger yourst date hope last forev supah ...
32187    share bulli quirkynerd high school obviou bull...
14041    worri bitch live mother even get place name go...
33441    how girl gon na say bulli high school wasnt li...
39289                    cancel she high school fuck bulli
Name: cleaned_text, Length: 31255, dtype: object

In [37]:
mod = LinearSVC(C=1, loss='hinge')
# here you can use the key classifier, if suitable
pipeline = Pipeline([("vect", vectorizer),("model", mod) ])
#training the model
pipeline.fit(X_train,y_train)
# exporting the model
skl_to_pmml(pipeline,XX.columns,"sentiment_encoded","lsvc_tfidf.pmml")


In [45]:
mod = LinearSVC(C=1, loss='hinge')
# here you can use the key classifier, if suitable
pipeline = Pipeline([('preprocess', preprocess),("vect", vectorizer),("model", mod) ])
#training the model
pipeline.fit(X_train,y_train)
# exporting the model
skl_to_pmml(pipeline,XX.columns,"sentiment_encoded","lsvc_tfidf.pmml")


TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' '<function preprocess at 0x00000217A5E9F700>' (type <class 'function'>) doesn't

In [128]:

# tf_idf = TfidfVectorizer()
# pipeline = PMMLPipeline([ ("vect", tf_idf) ])
# #training the model
# pipeline.fit(X_train)
# # exporting the model
# sklearn2pmml(pipeline, 'tfidf.pmml', with_repr = True)

In [86]:
tf_idf = TfidfVectorizer()
X_train_tf = tf_idf.fit_transform(X_train)
X_test_tf = tf_idf.transform(X_test)
print(X_train_tf.shape)
print(X_test_tf.shape)

(31255, 29111)
(13395, 29111)


In [129]:
# print(X_train_tf)

## Support Vector Classifier

In [87]:
lin_svc = LinearSVC(C=1, loss='hinge')

# Model use

In [88]:
# Train
lin_svc.fit(X_train_tf,y_train)
y_pred = lin_svc.predict(X_test_tf)

# Input

In [97]:
data1 = pd.read_csv('test.csv')
data1 = data1.rename(columns={'tweet_text': 'text'})
data1['cleaned_text'] = data1['text'].apply(preprocess)
data1.head()

Unnamed: 0,text,cleaned_text
0,your are a faggot pussy pussy..,faggot pussi pussi


In [98]:
test11 = data1['cleaned_text']
test11_tf = tf_idf.transform(test11)

In [99]:
test11

0    faggot pussi pussi
Name: cleaned_text, dtype: object

In [100]:
print(test11_tf)

  (0, 20451)	0.8815727952282691
  (0, 8353)	0.47204809788136637


In [101]:
y_pred1 = lin_svc.predict(test11_tf)

In [102]:
print(y_pred1)

[4]
