In [1]:
import numpy as np
import pandas as pd

### Data loading

In [2]:
# Column names for data.csv, supplied explicitly through `names=`.
header_list = [
    'tweet_id',
    'user_id',
    'bullying_traces',
    'type',
    'form',
    'teasing',
    'author_role',
    'emotion',
]
csv = pd.read_csv("data.csv", names=header_list)

### Examination of dataset

In [3]:
# Dimensions of the label table as (rows, columns).
csv.shape

(7321, 8)

In [4]:
# Preview the first rows of the training set.
csv.head()

Unnamed: 0,tweet_id,user_id,bullying_traces,type,form,teasing,author_role,emotion
0,105730486382497793,322329899,y,self-disclosure,other,n,victim,none
1,107688644067856384,185389094,y,self-disclosure,other,n,defender,empathy
2,108654820042354688,361869056,n,,,,,
3,102206417217392640,226320672,n,,,,,
4,102779484725448704,297557032,n,,,,,


### Feature description

* **tweet_id** - Tweet identification number
* **user_id** - User identification number
* **bullying_traces** - Flag (`y`/`n`) indicating whether the tweet is a bullying trace
* **type** - Type of bullying (accusation, cyberbullying, denial, report, self-disclosure, NA)
* **form** - Form of bullying (cyberbullying, physical, property damage, relational, verbal, other, NA)
* **teasing** - Flag (`y`/`n`) indicating whether teasing is involved
* **author_role** - Author role in the tweet (accuser, assistant, bully, defender, reinforcer, reporter, victim, other, NA)
* **emotion** - Emotion expressed in the tweet (anger, embarrassment, empathy, fear, none, other, pride, relief, sadness, NA)

In [5]:
# Count the missing values in each column.
display(csv.isna().sum())

tweet_id              0
user_id               0
bullying_traces       0
type               5219
form               5219
teasing            5219
author_role        5219
emotion            5219
dtype: int64

In [6]:
# Summary statistics of the numeric columns. Here those are only the two id
# columns, so the figures describe identifiers rather than measurements.
csv.describe()

Unnamed: 0,tweet_id,user_id
count,7321.0,7321.0
mean,1.03314e+17,170971100.0
std,3121057000000000.0,106288500.0
min,9.970695e+16,779451.0
25%,1.000126e+17,70455500.0
50%,1.029122e+17,164413500.0
75%,1.06125e+17,264606700.0
max,1.091281e+17,365202800.0


In [7]:
# Number of distinct values in each column.
csv.nunique()

tweet_id           7298
user_id            7043
bullying_traces       2
type                  5
form                  6
teasing               2
author_role           8
emotion               9
dtype: int64

In [8]:
# Column dtypes and non-null counts.
csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7321 entries, 0 to 7320
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tweet_id         7321 non-null   int64 
 1   user_id          7321 non-null   int64 
 2   bullying_traces  7321 non-null   object
 3   type             2102 non-null   object
 4   form             2102 non-null   object
 5   teasing          2102 non-null   object
 6   author_role      2102 non-null   object
 7   emotion          2102 non-null   object
dtypes: int64(2), object(6)
memory usage: 457.7+ KB


In [9]:
# Render the whole label table (the notebook truncates the middle rows).
csv

Unnamed: 0,tweet_id,user_id,bullying_traces,type,form,teasing,author_role,emotion
0,105730486382497793,322329899,y,self-disclosure,other,n,victim,none
1,107688644067856384,185389094,y,self-disclosure,other,n,defender,empathy
2,108654820042354688,361869056,n,,,,,
3,102206417217392640,226320672,n,,,,,
4,102779484725448704,297557032,n,,,,,
...,...,...,...,...,...,...,...,...
7316,99912247060611074,162589166,y,self-disclosure,other,n,defender,empathy
7317,99878062220517376,221054960,y,accusation,other,n,accuser,none
7318,100049839949221889,110032215,n,,,,,
7319,100006640492875776,171842492,y,denial,other,n,bully,none


### Tweet overview

In [10]:
import json


# Load the raw tweet objects. The `with` block closes the file handle as soon
# as parsing finishes, and the explicit UTF-8 encoding keeps non-ASCII tweet
# text readable regardless of the platform's default encoding.
with open('tweet.json', encoding='utf-8') as f:
    tweets = json.load(f)

# Preview the text of the first five tweets.
for tweet in tweets[:5]:
    print(tweet['text'])

@Underwalt @MsShandraRae @kateplusmy8 mr. walt, stop being a bully.
【自動post】BULLY（いじめ）とSUICIDE（自殺）で、BULLYCIDE（いじめ自殺）。もっとも、自殺するのは憎い相手を殺した後。死なばもろとも…ってところかしら
Paula, Kiersten and LJ's song about bullying. Great job! (Uploading more videos now.) http://fb.me/ASIm1gw1
@katie_ogden @boybandslut @StephWall27 @BAMitsSHAZ @xstephhh_ I was not bullying her! She's trying to steal Jean Martyn from meee! :'( xx
cara3: bully iam gara2 uda di ucpin tpi gk bls


In [11]:
# List the top-level fields available on a tweet object.
tweets[0].keys()

dict_keys(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'lang'])

### Potentially important features
* **text** 
* **symbols** 
* **in_reply_to_status_id**
* **in_reply_to_user_id**
* **description**
* **retweet_count**
* **retweeted**
* **favorite_count**
* **favorited**
* **lang**


In [12]:
# Collect the tweets whose `lang` field is 'en', keeping the full tweet
# objects and their ids in two parallel lists.
eng_tweets = []
eng_tweet_ids = []
for tweet in tweets:
    if tweet['lang'] != 'en':
        continue
    eng_tweets.append(tweet)
    eng_tweet_ids.append(tweet['id'])

print(f'Number of tweets:{len(tweets)}')
print(f'Number of english tweets:{len(eng_tweets)}')
    

Number of tweets:3992
Number of english tweets:2522


In [13]:
# Keep only the label rows whose tweet_id belongs to one of the English
# tweets collected above; rows for non-English tweets are dropped.
eng_tweets_csv = csv.loc[csv["tweet_id"].isin(eng_tweet_ids)]
eng_tweets_csv

Unnamed: 0,tweet_id,user_id,bullying_traces,type,form,teasing,author_role,emotion
1,107688644067856384,185389094,y,self-disclosure,other,n,defender,empathy
4,102779484725448704,297557032,n,,,,,
9,109034091743154176,177913822,n,,,,,
10,102533497637437441,70412906,y,report,other,n,reporter,none
14,103679008855691264,11363462,n,,,,,
...,...,...,...,...,...,...,...,...
7306,99823454777393152,48203749,n,,,,,
7307,100063708629319680,305258003,n,,,,,
7314,99886032048226304,21235814,n,,,,,
7319,100006640492875776,171842492,y,denial,other,n,bully,none


In [14]:
# Class balance of the bullying_traces label among the English tweets.
eng_tweets_csv['bullying_traces'].value_counts()

n    1495
y    1027
Name: bullying_traces, dtype: int64

#### Tweet normalization

In [54]:
from emoji import demojize
from nltk.tokenize import TweetTokenizer


# NLTK tweet tokenizer instance.
# NOTE(review): the BERT feature-extraction cell further down assigns an
# AutoTokenizer to this same global name.
tokenizer = TweetTokenizer()


def normalizeToken(token):
    """Map a single tweet token to its normalized form.

    Rules, checked in order:
      * the typographic apostrophe "’" becomes "'" and the ellipsis
        character "…" becomes "..."
      * a token starting with "@" becomes "@USER"
      * a token starting with "http" or "www" (case-insensitive) becomes
        "HTTPURL"
      * any other single-character token is passed through ``demojize``
      * everything else is returned unchanged

    Args:
        token: One token string.

    Returns:
        The normalized token string.
    """
    # Both of these are single-character tokens, so they must be handled
    # before the length-1 branch below; otherwise that branch catches them
    # first and the substitution never runs.
    punctuation = {"’": "'", "…": "..."}
    if token in punctuation:
        return punctuation[token]

    if token.startswith("@"):
        return "@USER"

    lowercased_token = token.lower()
    if lowercased_token.startswith(("http", "www")):
        return "HTTPURL"

    if len(token) == 1:
        return demojize(token)

    return token


# Lazily created TweetTokenizer reserved for normalizeTweet. It is kept under a
# private name so that rebinding the global `tokenizer` elsewhere in the
# notebook (the BERT cell assigns an AutoTokenizer to it) cannot change how
# tweets are normalized.
_TWEET_TOKENIZER = None


def _get_tweet_tokenizer():
    """Return the TweetTokenizer used by normalizeTweet, creating it on first use."""
    global _TWEET_TOKENIZER
    if _TWEET_TOKENIZER is None:
        _TWEET_TOKENIZER = TweetTokenizer()
    return _TWEET_TOKENIZER


def normalizeTweet(tweet):
    """Normalize the raw text of a tweet.

    Steps:
      1. Replace "’" with "'" and "…" with "...", then tokenize the text
         with a TweetTokenizer.
      2. Normalize every token with ``normalizeToken`` and join the results
         with single spaces.
      3. Apply string fix-ups for contractions and for a.m./p.m.
      4. Collapse any run of whitespace into one space.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalized tweet as a single space-separated string.
    """
    tokens = _get_tweet_tokenizer().tokenize(tweet.replace("’", "'").replace("…", "..."))
    normTweet = " ".join([normalizeToken(token) for token in tokens])

    # Detach "n't" from its verb ("don't " -> "do n't "), then put "can't"
    # and "ain't" back together.
    normTweet = (
        normTweet.replace("cannot ", "can not ")
        .replace("n't ", " n't ")
        .replace("n 't ", " n't ")
        .replace("ca n't", "can't")
        .replace("ai n't", "ain't")
    )
    # Detach the remaining clitics ('m, 're, 's, 'll, 'd, 've) from the
    # preceding word.
    normTweet = (
        normTweet.replace("'m ", " 'm ")
        .replace("'re ", " 're ")
        .replace("'s ", " 's ")
        .replace("'ll ", " 'll ")
        .replace("'d ", " 'd ")
        .replace("'ve ", " 've ")
    )
    # Re-join a.m./p.m. that were split into separate tokens. The doubled
    # space in the first replacement is harmless: whitespace is collapsed
    # on return.
    normTweet = (
        normTweet.replace(" p . m .", "  p.m.")
        .replace(" p . m ", " p.m ")
        .replace(" a . m .", " a.m.")
        .replace(" a . m ", " a.m ")
    )

    return " ".join(normTweet.split())



# Example: the URL and the @mention are replaced by placeholders and the
# trailing ellipsis character becomes "...".
print(
    normalizeTweet(
        "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
    )
)

SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL ... via @USER


In [68]:
# Example with a trailing emoji, which is rendered as its textual alias.
print(normalizeTweet("DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier 😢"))

DHEC confirms HTTPURL ... via @USER :crying_face:


In [64]:
# Show one English tweet before and after normalization.
tweet_number = 125  # index of the example tweet within eng_tweets

print(eng_tweets[tweet_number]['text'])
print(normalizeTweet(eng_tweets[tweet_number]['text']))

@owillis @rootless_e don't know their plan so weak leaders they r like follwers on school payground they go after the kid the bully go after
@USER @USER do n't know their plan so weak leaders they r like follwers on school payground they go after the kid the bully go after


#### BERT feature extraction

In [18]:
import torch
from transformers import AutoModel, AutoTokenizer 

# Load the pretrained BERTweet-large encoder (the weights are downloaded on first use).
bertweet = AutoModel.from_pretrained("vinai/bertweet-large")

# NOTE(review): this rebinds the global `tokenizer`, which the tweet-normalization
# cell assigned to an NLTK TweetTokenizer.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")

# INPUT TWEET IS ALREADY NORMALIZED!
line = "DHEC confirms HTTPURL via @USER :crying_face:"

# Encode the line to token ids and wrap them in a list to form a batch of one.
input_ids = torch.tensor([tokenizer.encode(line)])

# Forward pass without gradient tracking; this cell only extracts features.
with torch.no_grad():
    features = bertweet(input_ids)  # a model-output object with named fields (see its repr in the next cell)
    
## With TensorFlow 2.0+:
# from transformers import TFAutoModel
# bertweet = TFAutoModel.from_pretrained("vinai/bertweet-large")

Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [19]:
# Inspect the model output for the example line.
features

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0862,  0.0521, -0.1026,  ...,  0.0147,  0.1891, -0.0736],
         [ 0.1007, -0.0159, -0.6580,  ...,  0.1054,  0.2050,  0.0699],
         [ 0.2216,  0.0395, -0.5482,  ..., -0.1893,  0.3768,  0.2154],
         ...,
         [ 0.0523,  0.2308, -0.8912,  ...,  0.3122, -0.0351, -0.0617],
         [-0.0252, -0.1803, -0.0527,  ...,  0.3899, -0.1398, -0.2169],
         [ 0.0499,  0.0034, -0.1559,  ..., -0.0495,  0.1139, -0.0543]]]), pooler_output=tensor([[-0.2438, -0.0983, -0.2807,  ...,  0.2079,  0.2894, -0.6219]]), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)