In [69]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

### Data loading

In [2]:
header_list = ['tweet_id', 'user_id', 'bullying_traces', 'type', 'form', 'teasing', 'author_role', 'emotion']
csv = pd.read_csv("data.csv", names=header_list)

### Examination of dataset

In [3]:
csv.shape

(7321, 8)

In [4]:
# prikaz skupa za treniranje
csv.head()

Unnamed: 0,tweet_id,user_id,bullying_traces,type,form,teasing,author_role,emotion
0,105730486382497793,322329899,y,self-disclosure,other,n,victim,none
1,107688644067856384,185389094,y,self-disclosure,other,n,defender,empathy
2,108654820042354688,361869056,n,,,,,
3,102206417217392640,226320672,n,,,,,
4,102779484725448704,297557032,n,,,,,


### Feature description

* **tweet_id** - Tweet identification number
* **user_id** - User identification number
* **bullying_traces** - Boolean value that says if tweet id a bullying trace
* **type** - Type of bullying(accusation, cyberbullying, denial, report, self-disclosure, NA)
* **form** - Form of bullying(cyberbullying, physical, property damage, relational, verbal, other, NA)
* **teasing** - Boolean that says if teasing is involved
* **author_role** - Author role in a tweet(accuser, assistant, bully, defender, reinforcer, reporter, victim, other, NA)
* **emotion** - Emotion expressed in a tweet(anger, embarrassment, empathy, fear, none, other, pride, relief, sadness, NA)

In [5]:
display(csv.isna().sum())

tweet_id              0
user_id               0
bullying_traces       0
type               5219
form               5219
teasing            5219
author_role        5219
emotion            5219
dtype: int64

In [6]:
csv.describe()

Unnamed: 0,tweet_id,user_id
count,7321.0,7321.0
mean,1.03314e+17,170971100.0
std,3121057000000000.0,106288500.0
min,9.970695e+16,779451.0
25%,1.000126e+17,70455500.0
50%,1.029122e+17,164413500.0
75%,1.06125e+17,264606700.0
max,1.091281e+17,365202800.0


In [7]:
csv.nunique()

tweet_id           7298
user_id            7043
bullying_traces       2
type                  5
form                  6
teasing               2
author_role           8
emotion               9
dtype: int64

In [8]:
csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7321 entries, 0 to 7320
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tweet_id         7321 non-null   int64 
 1   user_id          7321 non-null   int64 
 2   bullying_traces  7321 non-null   object
 3   type             2102 non-null   object
 4   form             2102 non-null   object
 5   teasing          2102 non-null   object
 6   author_role      2102 non-null   object
 7   emotion          2102 non-null   object
dtypes: int64(2), object(6)
memory usage: 457.7+ KB


In [9]:
csv

Unnamed: 0,tweet_id,user_id,bullying_traces,type,form,teasing,author_role,emotion
0,105730486382497793,322329899,y,self-disclosure,other,n,victim,none
1,107688644067856384,185389094,y,self-disclosure,other,n,defender,empathy
2,108654820042354688,361869056,n,,,,,
3,102206417217392640,226320672,n,,,,,
4,102779484725448704,297557032,n,,,,,
...,...,...,...,...,...,...,...,...
7316,99912247060611074,162589166,y,self-disclosure,other,n,defender,empathy
7317,99878062220517376,221054960,y,accusation,other,n,accuser,none
7318,100049839949221889,110032215,n,,,,,
7319,100006640492875776,171842492,y,denial,other,n,bully,none


### Tweet overview

In [10]:
import json


f = open('tweet.json')
tweets = json.load(f)

counter = 0
for tweet in tweets:
    counter += 1
    print(tweet['text'])
    
    if counter == 5:
        break

@Underwalt @MsShandraRae @kateplusmy8 mr. walt, stop being a bully.
【自動post】BULLY（いじめ）とSUICIDE（自殺）で、BULLYCIDE（いじめ自殺）。もっとも、自殺するのは憎い相手を殺した後。死なばもろとも…ってところかしら
Paula, Kiersten and LJ's song about bullying. Great job! (Uploading more videos now.) http://fb.me/ASIm1gw1
@katie_ogden @boybandslut @StephWall27 @BAMitsSHAZ @xstephhh_ I was not bullying her! She's trying to steal Jean Martyn from meee! :'( xx
cara3: bully iam gara2 uda di ucpin tpi gk bls


In [11]:
tweets[0].keys()

dict_keys(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'lang'])

### Maybe important features: 
* **text** 
* **symbols** 
* **in_reply_to_status_id**
* **in_reply_to_user_id**
* **description**
* **retweet_count**
* **retweeted**
* **favorite_count**
* **favorited**
* **lang**


In [12]:
eng_tweets = []
eng_tweet_ids = []
for tweet in tweets:
    if tweet['lang'] == 'en':
        eng_tweets.append(tweet)
        eng_tweet_ids.append(tweet['id'])
        
        
print(f'Number of tweets:{len(tweets)}')        
print(f'Number of english tweets:{len(eng_tweets)}')
    

Number of tweets:3992
Number of english tweets:2522


In [13]:
# here we filter out non english tweets
eng_tweets_csv = csv.loc[csv["tweet_id"].isin(eng_tweet_ids)]
eng_tweets_csv

Unnamed: 0,tweet_id,user_id,bullying_traces,type,form,teasing,author_role,emotion
1,107688644067856384,185389094,y,self-disclosure,other,n,defender,empathy
4,102779484725448704,297557032,n,,,,,
9,109034091743154176,177913822,n,,,,,
10,102533497637437441,70412906,y,report,other,n,reporter,none
14,103679008855691264,11363462,n,,,,,
...,...,...,...,...,...,...,...,...
7306,99823454777393152,48203749,n,,,,,
7307,100063708629319680,305258003,n,,,,,
7314,99886032048226304,21235814,n,,,,,
7319,100006640492875776,171842492,y,denial,other,n,bully,none


In [34]:
eng_tweets_csv['bullying_traces'].value_counts()

n    1495
y    1027
Name: bullying_traces, dtype: int64

#### Tweet normalization

In [35]:
from emoji import demojize
from nltk.tokenize import TweetTokenizer


tokenizer = TweetTokenizer()


def normalizeToken(token):
    lowercased_token = token.lower()
    if token.startswith("@"):
        return "@USER"
    elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
        return "HTTPURL"
    elif len(token) == 1:
        return demojize(token)
    else:
        if token == "’":
            return "'"
        elif token == "…":
            return "..."
        else:
            return token


def normalizeTweet(tweet):
    tokens = tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
    normTweet = " ".join([normalizeToken(token) for token in tokens])

    normTweet = (
        normTweet.replace("cannot ", "can not ")
        .replace("n't ", " n't ")
        .replace("n 't ", " n't ")
        .replace("ca n't", "can't")
        .replace("ai n't", "ain't")
    )
    normTweet = (
        normTweet.replace("'m ", " 'm ")
        .replace("'re ", " 're ")
        .replace("'s ", " 's ")
        .replace("'ll ", " 'll ")
        .replace("'d ", " 'd ")
        .replace("'ve ", " 've ")
    )
    normTweet = (
        normTweet.replace(" p . m .", "  p.m.")
        .replace(" p . m ", " p.m ")
        .replace(" a . m .", " a.m.")
        .replace(" a . m ", " a.m ")
    )

    return " ".join(normTweet.split())



print(
    normalizeTweet(
        "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
    )
)

SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL ... via @USER


In [36]:
print(normalizeTweet("DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier 😢"))

DHEC confirms HTTPURL ... via @USER :crying_face:


In [37]:
tweet_number = 0

print(eng_tweets[tweet_number]['text'])
print(normalizeTweet(eng_tweets[tweet_number]['text']))

@Underwalt @MsShandraRae @kateplusmy8 mr. walt, stop being a bully.
@USER @USER @USER mr . walt , stop being a bully .


#### BERT feature extraction

In [21]:
import torch
from transformers import AutoModel, AutoTokenizer 

bertweet = AutoModel.from_pretrained("vinai/bertweet-large")

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")

# INPUT TWEET IS ALREADY NORMALIZED!
line = "DHEC confirms HTTPURL via @USER :crying_face:"

input_ids = torch.tensor([tokenizer.encode(line)])

with torch.no_grad():
    features = bertweet(input_ids)  # Models outputs are now tuples
    
## With TensorFlow 2.0+:
# from transformers import TFAutoModel
# bertweet = TFAutoModel.from_pretrained("vinai/bertweet-large")

Some weights of the model checkpoint at vinai/bertweet-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

In [20]:
print('vector size for each word:',len(features[0][0][0]))

vector size for each word: 1024


#### Preparing the training data

In [38]:
# Normalizing the tweets
import copy
normalized_tweets = copy.deepcopy(eng_tweets)
for i in range(len(eng_tweets)):
    normalized_tweets[i]['text'] = normalizeTweet(normalized_tweets[i]['text'])

counter = 0
for tweet in normalized_tweets:
    counter += 1
    print(tweet['text'])
    
    if counter == 5:
        break    

@USER @USER @USER mr . walt , stop being a bully .
Paula , Kiersten and LJ 's song about bullying . Great job ! ( Uploading more videos now . ) HTTPURL
@USER @USER @USER @USER @USER I was not bullying her ! She 's trying to steal Jean Martyn from meee ! :'( xx
This is actually bullying , I want Wenger to watch this and fucking hold every piece of criticism that comes ! #RAGE
If I 'm not happy with my classes this week , I 'm gonna take the role as a bully and be an asshole . Watch out lower-classmen . It will be ugly .


In [23]:
# getting label for tweet
eng_tweets_csv.loc[eng_tweets_csv["tweet_id"]==eng_tweets[1]['id']]['bullying_traces'].values[0]

'n'

In [57]:
# x = []
# y = []

# for tweet in normalized_tweets:
#     y.append(eng_tweets_csv.loc[eng_tweets_csv["tweet_id"]==tweet['id']]['bullying_traces'].values[0])
#     input_ids = torch.tensor([tokenizer.encode(tweet['text'])])

#     with torch.no_grad():
#         features = bertweet(input_ids)
#         x.append(features)

# # saving x i y
# with open('x_instances.pkl', 'wb') as f:
#     pickle.dump(x, f)
# with open('y_instances.pkl', 'wb') as f:
#     pickle.dump(y, f)

In [12]:
# loading x and y
with open('x_instances.pkl', 'rb') as f:
    x = pickle.load(f)
with open('y_instances.pkl', 'rb') as f:
    y = pickle.load(f)

In [53]:
# max_length = 0
# for item in x:
#     if len(item) > max_length:
#         max_length = len(item)
        
# for i in range(len(x)):
#     if len(x[i]) < max_length:
#         for j in range(max_length - len(x[i])):
#             x[i] = torch.cat((x[i], torch.zeros(1,1024)), 0)

In [13]:
# for i in range(len(x)):
#     x[i] =  x[i].last_hidden_state[0]

In [58]:
# split the data in train and test
train_test_ratio = 0.8
train_size = int(0.8 * len(x))

x_train = x[0:train_size]
x_test = x[train_size:]

y_train = y[0:train_size]
y_test = y[train_size:]

In [73]:
len(x) - train_size

505

#### Baseline models

In [59]:
# split the data in train and test for baseline
train_test_ratio = 0.8
train_size = int(0.8 * len(x))

x_train_baseline = []
for i in range(train_size):
    x_train_baseline.append((x[i].reshape(1,-1)[0].detach().numpy()))
x_train_baseline = np.array(x_train_baseline)
    
x_test_baseline = []
for i in range(train_size,len(x)):
    x_test_baseline.append(x[i].reshape(1,-1)[0].detach().numpy())
x_test_baseline = np.array(x_test_baseline)    

y_train_baseline = []
for i in range(train_size):
    if y[i] == 'y':
        y_train_baseline.append(1)
    else:
        y_train_baseline.append(0)
y_train_baseline = np.array(y_train_baseline)
    
y_test_baseline = []
for i in range(train_size,len(y)):
    if y[i] == 'y':
        y_test_baseline.append(1)
    else:
        y_test_baseline.append(0)
y_test_baseline = np.array(y_test_baseline)   

##### Logistic regression

In [74]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(x_train_baseline, y_train_baseline)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [75]:
y_pred_baseline_logReg = clf.predict(x_test_baseline)
print(classification_report(y_test_baseline, y_pred_baseline_logReg))
print(accuracy_score(y_test_baseline, y_pred_baseline_logReg))
print(confusion_matrix(y_test_baseline, y_pred_baseline_logReg))

              precision    recall  f1-score   support

           0       0.75      0.81      0.78       280
           1       0.74      0.67      0.71       225

    accuracy                           0.75       505
   macro avg       0.75      0.74      0.74       505
weighted avg       0.75      0.75      0.75       505

0.7504950495049505
[[228  52]
 [ 74 151]]


##### Naive Bayes

In [67]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb_trained = gnb.fit(x_train_baseline, y_train_baseline)

In [71]:
y_pred_baseline_bayes = gnb_trained.predict(x_test_baseline)
print(classification_report(y_test_baseline, y_pred_baseline_bayes))
print(accuracy_score(y_test_baseline, y_pred_baseline_bayes))
print(confusion_matrix(y_test_baseline, y_pred_baseline_bayes))

              precision    recall  f1-score   support

           0       0.50      0.00      0.01       280
           1       0.45      1.00      0.62       225

    accuracy                           0.45       505
   macro avg       0.47      0.50      0.31       505
weighted avg       0.48      0.45      0.28       505

0.44554455445544555
[[  1 279]
 [  1 224]]


#### Custom torch model