## Environment Setting

In [None]:
import datetime
import csv
import re
import numpy as np
import pickle
import os
import keras
from nltk import word_tokenize
from keras.utils.np_utils import to_categorical
from keras.layers import *
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from keras.optimizers import *
# Expose only the GPU with index 1 to CUDA-based libraries used by this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

## Hyperparameter Setting

In [None]:
# Number of token ids kept per sentence (longer sentences are truncated, shorter ones zero-padded).
MAX_SENT_LENGTH=30
# Number of sentences (owner post plus comments) kept per session.
MAX_SENTS=140
# Number of token ids kept per user history.
HISTORY_LENGTH=100
# Training epochs per run.
MAX_EPOCHS=7
# Number of independent shuffle/train/evaluate runs.
MAX_TIMES=20
# Mini-batch size passed to model.fit.
BATCH_SIZE=16
# Fraction of the shuffled sessions used for training; the remainder is used for evaluation.
train_valtest_split=0.8
# Directory holding the Vine data set from Rahat Ibn Rafiq et al. 2015,
# "Careful what you share in six seconds: Detecting cyberbullying instances in Vine".
Vine_data_path='data/vine/'
# Pretrained word2vec vocabulary and vectors. The values below are download links, not loadable
# files: generate_indices_embedding passes them to np.load, so download the files and replace
# both values with the local paths before running.
w2vVocab_file='https://drive.google.com/file/d/1PqHeFg8QvXX_vGv54kgjRZK89XO6Gde0/view?usp=sharing'
w2vVector_file='https://drive.google.com/file/d/1onEQw6yFAslUNlZeuviWp6_UyVyoB3KM/view?usp=sharing'

## Preprocess Data

In [None]:
# Convert time to hours
def timeconvert(timestr,start_time):
    """Return the number of hours elapsed from ``start_time`` to ``timestr``.

    Both arguments are strings in ``"%Y-%m-%d %H:%M:%S"`` format. When
    ``start_time`` does not parse, it is retried with a leading ``'2'``
    prepended (so a truncated ``'014-...'`` year becomes ``'2014-...'``).

    The result is negative when ``timestr`` precedes ``start_time``.

    Raises:
        ValueError: if ``timestr``, or ``start_time`` even after the repair
            attempt, does not match the expected format.
    """
    time_format="%Y-%m-%d %H:%M:%S"
    ntp=datetime.datetime.strptime(timestr, time_format)
    try:
        otp=datetime.datetime.strptime(start_time, time_format)
    except ValueError:
        # Only a parse failure triggers the repair; anything else propagates.
        otp=datetime.datetime.strptime('2'+start_time, time_format)
    delta=ntp-otp
    # timedelta normalizes to (days, 0 <= seconds < 86400), so this is signed-correct.
    return delta.days*24+delta.seconds/3600

In [None]:
# Map pretrained word embedding
def generate_indices_embedding(tokenized_text):
    """Build the vocabulary index and the initial embedding matrix.

    Args:
        tokenized_text: iterable of sessions, each an iterable of sentences,
            each an iterable of word tokens.

    Returns:
        (embedding_index, lister) where ``embedding_index`` maps each kept
        word to its row in ``lister`` and ``lister`` is a float32 array with
        400 columns. Row 0 is the all-zero ``'PADDING'`` row. Words found in
        the pretrained vocabulary come next (in order of first appearance)
        and copy their pretrained vector. Words missing from the pretrained
        vocabulary are kept only when they occur more than 3 times; they are
        initialized by sampling from a Gaussian fitted to the pretrained rows.

    Reads the module-level ``w2vVocab_file`` and ``w2vVector_file`` paths.
    """
    # NOTE(review): allow_pickle=True unpickles the vocabulary file; only load trusted files.
    w2vVocab=np.load(w2vVocab_file,allow_pickle=True).item()
    w2vVector=np.load(w2vVector_file)

    # Occurrence counts; dict insertion order keeps words in first-appearance order.
    word_counts={}
    for sessions in tokenized_text:
        for sentence in sessions:
            for word in sentence:
                if word!='PADDING':  # reserved for row 0, never a regular word
                    word_counts[word]=word_counts.get(word,0)+1

    pretrained=[word for word in word_counts if word in w2vVocab]
    frequent_oov=[word for word,seen in word_counts.items()
                  if seen>3 and word not in w2vVocab]

    # One row for PADDING plus one per kept word.
    lister=np.zeros((1+len(pretrained)+len(frequent_oov),400),dtype='float32')
    embedding_index={'PADDING':0}

    for word in pretrained:
        embedding_index[word]=len(embedding_index)
        lister[embedding_index[word]]=w2vVector[w2vVocab[word]]

    if frequent_oov:
        # Fit the Gaussian on the pretrained rows only: rows 1..len(pretrained).
        # (The slice previously started at row 0, which pulled in the all-zero
        # PADDING row and left out the last pretrained vector.)
        reference=lister[1:len(embedding_index)]
        mu=np.mean(reference, axis=0)
        Sigma=np.cov(reference.T)
        # Draw every sample in one call so the covariance is decomposed once
        # instead of once per out-of-vocabulary word.
        samples=np.random.multivariate_normal(mu, Sigma, len(frequent_oov))
        for word,vector in zip(frequent_oov,samples):
            embedding_index[word]=len(embedding_index)
            lister[embedding_index[word]]=vector

    return embedding_index,lister
def generate_word_index(session_tokens,embedding_index):
    """Encode every sentence of every session as a fixed-size list of word ids.

    Words missing from ``embedding_index`` get the ``'PADDING'`` id. Each
    sentence is cut or zero-padded to ``MAX_SENT_LENGTH`` ids and each session
    is cut or padded with all-zero sentences to ``MAX_SENTS`` sentences.

    Returns a nested list indexed as [session][sentence][position].
    """
    encoded_sessions=[]
    for session in session_tokens:
        encoded_sentences=[]
        for sentence in session:
            ids=[
                embedding_index[token] if token in embedding_index else embedding_index['PADDING']
                for token in sentence
            ]
            ids=ids[:MAX_SENT_LENGTH]
            ids+=[0]*(MAX_SENT_LENGTH-len(ids))
            encoded_sentences.append(ids)
        # A non-positive count multiplies to an empty list, so long sessions get no padding.
        missing_sentences=MAX_SENTS-len(encoded_sentences)
        encoded_sentences=encoded_sentences[:MAX_SENTS]+[[0]*MAX_SENT_LENGTH]*missing_sentences
        encoded_sessions.append(encoded_sentences)
    return encoded_sessions
def generate_history_word_index(session_tokens,embedding_index):
    """Encode each user's history in every session as a fixed-size list of word ids.

    The input is iterated as session -> user -> sentence -> word. All words of
    one user are flattened into a single id list; words missing from
    ``embedding_index`` get the ``'PADDING'`` id. Each user list is cut or
    zero-padded to ``HISTORY_LENGTH`` ids and each session is cut or padded
    with all-zero users to ``MAX_SENTS`` entries.

    Returns a nested list indexed as [session][user][position].
    """
    encoded_sessions=[]
    for session in session_tokens:
        encoded_users=[]
        for user in session:
            flat_ids=[
                embedding_index[token] if token in embedding_index else embedding_index['PADDING']
                for sentence in user
                for token in sentence
            ]
            flat_ids=flat_ids[:HISTORY_LENGTH]
            flat_ids+=[0]*(HISTORY_LENGTH-len(flat_ids))
            encoded_users.append(flat_ids)
        # A non-positive count multiplies to an empty list, so long sessions get no padding.
        missing_users=MAX_SENTS-len(encoded_users)
        encoded_users=encoded_users[:MAX_SENTS]+[[0]*HISTORY_LENGTH]*missing_users
        encoded_sessions.append(encoded_users)
    return encoded_sessions

In [None]:
# Load session data
dataDict=[]
with open(Vine_data_path+'vine_labeled_cyberbullying_data.csv', 'r',encoding='unicode_escape') as f:
    # NUL characters are removed from each line before it reaches the csv parser.
    nul_free_lines=(line.replace('\0','') for line in f)
    # One dict per CSV record, keyed by the header row.
    dataDict.extend(csv.DictReader(nul_free_lines))

In [None]:
# Remove redundant session info
removeList=['_golden','_unit_state','_unit_id','_trusted_judgments','_last_judgment_at','img_url']


def _strip_high_chars(text):
    """Remove characters in the 0x80-0xFF range; if any were removed, also remove underscores."""
    cleaned=re.sub(r'[\x80-\xFF]+','',text)
    if cleaned==text:
        return text
    # Raw-string pattern: the previous '\_*' literal relied on an invalid escape sequence.
    return re.sub(r'_+','',cleaned)


def _parse_comment(raw):
    """Turn a raw comment cell into [username, tokens, timestamp string].

    Returns None when the cell carries no '(created at:' timestamp.
    """
    raw=raw.replace('<font color="#0066CC">',"")
    raw=raw.replace('</font>::',"&&&&&")
    raw=raw.replace('(created at:','(created_at:')
    parts=raw.split('(created_at:')
    if len(parts)<2:
        return None
    # "&&&&&" now separates the username from the comment text.
    author_and_text=parts[0].strip().split("&&&&&")
    stamp=parts[1].strip(')').replace('.000000','').replace('T',' ')
    author=author_and_text[0].lower()
    text=_strip_high_chars(author_and_text[1].lower())
    return [author,word_tokenize(text),stamp]


for row in dataDict:
    # Normalize the post creation time towards "YYYY-MM-DD HH:MM:SS".
    created=row['creationtime'].split('posted at:')[-1].strip()
    row['creationtime']=created.replace('.000000','').replace('T',' ')

    # Drop bookkeeping columns and empty comments; parse the remaining comment columns.
    for keys in list(row):
        if keys in removeList:
            del row[keys]
        elif keys.startswith('colu'):
            parsed=None if row[keys]=='empty' else _parse_comment(row[keys])
            if parsed is None:
                del row[keys]
            else:
                row[keys]=parsed

    try:
        datetime.datetime.strptime(row['creationtime'], "%Y-%m-%d %H:%M:%S")
    except ValueError:
        # Unparseable creation time: fall back to the timestamp of the first
        # surviving comment among column0..column9, if there is one.
        for i in range(10):
            comment=row.get('column'+str(i))
            if comment is not None:
                row['creationtime']=comment[2]
                break

    # Replace each comment timestamp by its offset in hours from the creation time.
    for keys in list(row):
        if keys.startswith('colu'):
            row[keys][2]=timeconvert(row[keys][2],row['creationtime'])

    row['likecount']=row['likecount'].split('\n\n ')[0]
    row['username']=row['username'].replace('<font color="#0066CC">',"").replace('</font>',"")
    row['mediacaption']=word_tokenize(_strip_high_chars(row['mediacaption']).lower())

In [None]:
# Load user profile data
# json is not imported in the environment cell, so bring it into scope here.
import json

# Maps lowercased username -> lowercased profile description.
userinfo = {}
# The file is read as JSON Lines: one JSON object per line.
with open(Vine_data_path+"vine_users_data.json", 'r', encoding='utf-8') as ufile:
    for line in ufile:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        rr = json.loads(line)
        userinfo[rr['username'].lower()]=rr['description'].lower()

In [None]:
# Construct input session data according to time
session_labels=[]
session_tokens=[]
session_histories=[]
session_times=[]
for row in dataDict:
    # Label 0 only for 'noneBll' sessions whose question1 is 'noneAgg' or 'aggression'; otherwise 1.
    is_negative=row['question2']=='noneBll' and row['question1'] in ('noneAgg','aggression')
    session_labels.append(0 if is_negative else 1)

    # The owner's post (username + caption tokens) is the first sentence, at time 0.
    row_tokens=[[row['username']]+row['mediacaption']]
    row_times=[0]
    for keys in row:
        if keys.startswith('colu'):
            # Each comment is [username, tokens, hours since creation].
            row_tokens.append([row[keys][0]]+row[keys][1])
            row_times.append(row[keys][2])

    # userinfo is keyed by lowercased username, so look up with the same casing.
    # Its value is one description string, while generate_history_word_index walks
    # user -> sentence -> word; tokenize the description and wrap it as a single
    # sentence. (Passing the raw string made it iterate character by character.)
    owner_key=row['username'].lower()
    if owner_key in userinfo:
        row_history=[[word_tokenize(userinfo[owner_key])]]
    else:
        row_history=[[]]

    # Order sentences chronologically; if any comment predates the post,
    # shift all times so the earliest one becomes 0.
    order=np.argsort(row_times)
    mintime=row_times[order[0]]
    if mintime<0:
        sorted_row_times=[row_times[i]-mintime for i in order]
    else:
        sorted_row_times=[row_times[i] for i in order]
    sorted_row_tokens=[row_tokens[i] for i in order]

    # Cut or zero-pad the time sequence to MAX_SENTS entries.
    sorted_row_times=sorted_row_times[:MAX_SENTS]+(MAX_SENTS-len(sorted_row_times))*[0]
    session_times.append(sorted_row_times)
    session_tokens.append(sorted_row_tokens)
    session_histories.append(row_history)

In [None]:
# Tokenize input sessions
# Build the vocabulary and the initial embedding matrix from the session text.
embedding_index,lister=generate_indices_embedding(session_tokens)

# Encode sessions and user histories as id arrays.
session_indices=np.array(generate_word_index(session_tokens,embedding_index))
session_history_indices=np.array(generate_history_word_index(session_histories,embedding_index))
session_labels=np.array(session_labels)
# Append a trailing axis of size 1 to the time array.
session_times=np.expand_dims(np.array(session_times), -1)

## Model

In [None]:
class lambdaLayer(Layer):
    """Weight-free layer that blends two tensors with two weight tensors.

    Expects a list of four inputs ``[w_a, w_b, a, b]`` and returns
    ``w_a * a + w_b * b``. The reported output shape is that of ``a``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # No trainable weights; only mark the layer as built.
        super().build(input_shape)

    def call(self, x):
        first_weight,second_weight=x[0],x[1]
        first_value,second_value=x[2],x[3]
        return first_weight*first_value+second_weight*second_value

    def compute_output_shape(self, input_shape):
        return input_shape[2]
class TimeAtt(Layer):
    """Multi-head dot-product attention with an additive pairwise time bias.

    Inputs (a list):
        * ``[Q, K, V, T]``: queries, keys, values and a time tensor ``T`` of
          shape (batch, steps, 1). The bias added to the attention logits at
          position (i, j) is ``T[i] - T[j]``.
        * ``[Q, K, V, Q_len, V_len]``: plain masked attention without a time
          bias. (This branch previously failed because ``T_seq`` was never
          bound.)

    Output shape: (batch, query steps, nb_head * size_per_head).
    """

    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.output_dim = nb_head*size_per_head
        super(TimeAtt, self).__init__(**kwargs)

    def build(self, input_shape):
        # One projection matrix each for queries, keys and values.
        self.WQ = self.add_weight(name='WQ',
                                  shape=(input_shape[0][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WK = self.add_weight(name='WK',
                                  shape=(input_shape[1][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WV = self.add_weight(name='WV',
                                  shape=(input_shape[2][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)

        super(TimeAtt, self).build(input_shape)

    def Mask(self, inputs, seq_len, mode='mul'):
        """Mask positions along axis 1 of ``inputs`` from index ``seq_len[:,0]`` onwards.

        ``mode='mul'`` zeroes the masked positions; ``mode='add'`` subtracts
        1e12 from them (for use before a softmax). ``seq_len=None`` returns
        ``inputs`` unchanged.
        """
        # Identity test: '==' on a tensor is not a reliable None check.
        if seq_len is None:
            return inputs
        else:
            mask = K.one_hot(seq_len[:,0], K.shape(inputs)[1])
            # 1 before the length index, 0 from it onwards.
            mask = 1 - K.cumsum(mask, 1)
            for _ in range(len(inputs.shape)-2):
                mask = K.expand_dims(mask, 2)
            if mode == 'mul':
                return inputs * mask
            if mode == 'add':
                return inputs - (1 - mask) * 1e12

    def call(self, x):
        if len(x) == 4:
            Q_seq,K_seq,V_seq,T_seq = x
            Q_len,V_len = None,None
        elif len(x) == 5:
            Q_seq,K_seq,V_seq,Q_len,V_len = x
            T_seq = None  # no time input: no time bias is applied
        else:
            raise ValueError('TimeAtt expects 4 inputs [Q, K, V, T] or 5 inputs '
                             '[Q, K, V, Q_len, V_len], got %d' % len(x))

        if T_seq is not None:
            # Tile T along the last axis so T1[b, i, j] = T[b, i]; its transpose gives T[b, j].
            T1_seq = K.repeat_elements(T_seq,K.int_shape(T_seq)[1],2)
            T2_seq = K.permute_dimensions(T1_seq, (0,2,1))
            # Insert a head axis of size 1 so the bias broadcasts over heads.
            T1_seq = K.reshape(T1_seq, (-1,1,K.shape(T1_seq)[1],K.shape(T1_seq)[2]))
            T2_seq = K.reshape(T2_seq, (-1,1,K.shape(T2_seq)[1],K.shape(T2_seq)[2]))
            T_seq = T1_seq - T2_seq

        # Project and split into heads: (batch, heads, steps, size_per_head).
        Q_seq = K.dot(Q_seq, self.WQ)
        Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
        Q_seq = K.permute_dimensions(Q_seq, (0,2,1,3))
        K_seq = K.dot(K_seq, self.WK)
        K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
        K_seq = K.permute_dimensions(K_seq, (0,2,1,3))
        V_seq = K.dot(V_seq, self.WV)
        V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
        V_seq = K.permute_dimensions(V_seq, (0,2,1,3))

        # Scaled dot-product logits.
        A = K.batch_dot(Q_seq, K_seq, axes=[3,3]) / self.size_per_head**0.5
        # Move the key axis to position 1 for Mask, then restore the layout.
        A = K.permute_dimensions(A, (0,3,2,1))
        A = self.Mask(A, V_len, 'add')
        A = K.permute_dimensions(A, (0,3,2,1))
        if T_seq is not None:
            A = A + T_seq
        A = K.softmax(A)

        # Weighted sum of values, then merge the heads back together.
        O_seq = K.batch_dot(A, V_seq, axes=[3,2])
        O_seq = K.permute_dimensions(O_seq, (0,2,1,3))
        O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, 'mul')
        return O_seq

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.output_dim)
def slice(x,index):
    """Return ``x[:, index, :]``, i.e. entry ``index`` along axis 1 of a 3-axis input.

    NOTE(review): the name shadows the builtin ``slice``; it is kept because
    other code may refer to this function by name.
    """
    selected=x[:,index,:]
    return selected
class scoreHistory(keras.callbacks.Callback):
    """Evaluate on the held-out split after every epoch and keep the best scores per run.

    Reads the module-level ``test_session_*`` arrays and the current run id
    ``times``. Writes ``results[times] = [precision, recall, f1, auc, epoch]``
    for the class-1 row of the classification report, and replaces the entry
    only when a later epoch reaches a strictly higher F1.
    """

    def on_epoch_end(self, epoch, logs=None):
        true = test_session_labels
        # self.model is the model this callback was attached to by fit().
        predictions = self.model.predict([test_session_indices,test_session_times,test_session_history_indices], batch_size=32, verbose=1)
        auc=roc_auc_score(true, predictions[:,1])
        predictions = np.argmax(predictions, axis=1)
        cr = classification_report(true, predictions,digits=4)
        print(cr)
        # Whitespace tokens of the report: 4 header names, 5 for the first class
        # row, then the second row's label (index 9) followed by its precision,
        # recall and f1 (indices 10-12).
        # NOTE(review): assumes the report lists exactly classes 0 and 1, in that order.
        precision,recall,f1=(float(value) for value in cr.split()[10:13])
        if times not in results:
            results[times]=[precision,recall,f1,auc,0]
        elif f1>results[times][2]:
            results[times]=[precision,recall,f1,auc,epoch]

In [None]:
# results[run] = [precision, recall, f1, auc, best_epoch], filled in by scoreHistory.
results={}
for times in range(MAX_TIMES):
    # A fresh model is built on every run; drop the previous run's graph so
    # memory does not grow across the MAX_TIMES repetitions.
    if K.backend()=='tensorflow':
        K.clear_session()

    # Shuffle data and split once into train / held-out indices.
    indices = np.arange(len(session_indices))
    np.random.shuffle(indices)
    split_at=int(train_valtest_split*len(indices))
    train_idx=indices[:split_at]
    test_idx=indices[split_at:]

    train_session_indices=session_indices[train_idx]
    train_session_times=session_times[train_idx]
    train_session_labels=session_labels[train_idx]
    train_session_history_indices=session_history_indices[train_idx]

    # The test_* names are read as globals by the scoreHistory callback.
    test_session_indices=session_indices[test_idx]
    test_session_times=session_times[test_idx]
    test_session_labels=session_labels[test_idx]
    test_session_history_indices=session_history_indices[test_idx]

    # Sentence encoder: embedding -> BiGRU -> additive attention pooling.
    title_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
    wordEmb=Embedding(len(lister),400, weights=[lister],trainable=True)
    titles = wordEmb(title_input)
    d_titles=Dropout(0.2)(titles)
    sentenceGRU=Bidirectional(GRU(32,return_sequences=True))
    title_attrep=sentenceGRU(d_titles)
    dense1=Dense(32,activation='tanh')
    dense2=Dense(1)
    attention = dense1(title_attrep)
    attention = Flatten()(dense2(attention))
    attention_weight = Activation('softmax')(attention)
    title_att=keras.layers.Dot((1, 1))([title_attrep, attention_weight])
    sentEncodert = Model(title_input,title_att)

    # History encoder: shares the embedding, GRU and attention layers with the sentence encoder.
    history_input = Input((HISTORY_LENGTH,), dtype='int32')
    histories = wordEmb(history_input)
    d_histories=Dropout(0.2)(histories)
    his_attrep=sentenceGRU(d_histories)
    hattention = dense1(his_attrep)
    hattention = Flatten()(dense2(hattention))
    hattention_weight = Activation('softmax')(hattention)
    hisT_att=keras.layers.Dot((1, 1))([his_attrep, hattention_weight])
    histEncodert = Model(history_input,hisT_att)

    # Session-level model over sentence encodings, time offsets and user histories.
    session_input = Input((MAX_SENTS,MAX_SENT_LENGTH,), dtype='int32')
    session_encoded= TimeDistributed(sentEncodert)(session_input)
    session_time_input = Input((MAX_SENTS,1), dtype='float32')
    # use_bias is the Keras 2 spelling of the former Keras 1 'bias' argument.
    time_dense1=Dense(1,use_bias=True,activation='sigmoid')
    time_encoded=time_dense1(session_time_input)
    session_history_input = Input((MAX_SENTS,HISTORY_LENGTH,), dtype='int32')
    session_history_encoded= TimeDistributed(histEncodert)(session_history_input)
    d_session_encoded=Dropout(0.2)(session_encoded)
    d_session_history_encoded=Dropout(0.2)(session_history_encoded)
    session_rep=Bidirectional(GRU(64,return_sequences=True))(d_session_encoded)
    rsession_rep=TimeAtt(1,64)([session_rep,session_rep,session_rep,time_encoded])

    # The same Dense(1) scores both representations; lambdaLayer blends them with those scores.
    his_dense=Dense(1,use_bias=True)
    w_rsession_rep=his_dense(rsession_rep)
    w_d_session_history_encoded=his_dense(d_session_history_encoded)
    lamb=lambdaLayer()
    rsession_rep=lamb([w_rsession_rep,w_d_session_history_encoded,rsession_rep,d_session_history_encoded])

    # Attention pooling over the sentences, then a 2-way softmax classifier.
    satt= Dense(32,activation='tanh')(rsession_rep)
    satt= Flatten()(Dense(1)(satt))
    sattention_weight = Activation('softmax')(satt)
    session_vec=keras.layers.Dot((1, 1))([rsession_rep, sattention_weight])
    predict_vec=Dense(2,activation='softmax')(session_vec)
    model = Model([session_input,session_time_input,session_history_input],predict_vec)
    model.compile(loss=['categorical_crossentropy'], optimizer=Adam(lr=0.001), metrics=['acc'])
    scorehistory = scoreHistory()
    model.fit([train_session_indices,train_session_times,train_session_history_indices],to_categorical(train_session_labels),batch_size=BATCH_SIZE,
              callbacks=[scorehistory],epochs=MAX_EPOCHS,shuffle=True,class_weight={0:1,1:1})