In [1]:
import pandas as pd
import numpy as np
import math
import time
import re

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

import tensorflow as tf

## 1. Bot level detection

In [2]:
df_train = pd.read_csv('../data/set-3/train/profile_info.csv').iloc[:, 1:]
df_label = pd.read_csv('../data/set-3/train/label.csv').iloc[:, 1:]
df_train = df_train.merge(df_label, on='ID')
df_train.head()

Unnamed: 0,ID,name,screen_name,location,description,url,protected,followers_count,friends_count,listed_count,...,profile_image_url_https,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,has_extended_profile,default_profile,default_profile_image,label
0,17461978,SHAQ,SHAQ,"Orlando, FL","VERY QUOTATIOUS, I PERFORM RANDOM ACTS OF SHAQ...",http://www.ShaqFuRadio.com,False,15349596,692,45568,...,https://pbs.twimg.com/profile_images/167390727...,2FC2EF,181A1E,252429,666666,True,False,False,False,0
1,1297437077403885568,Jennifer Fishpaw,JenniferFishpaw,,,,False,0,44,0,...,https://pbs.twimg.com/profile_images/129743740...,1DA1F2,C0DEED,DDEEF6,333333,True,True,True,False,1
2,17685258,Brad Parscale,parscale,Florida,Owner @ Parscale Strategy. Senior Advisor Digi...,http://www.parscale.com,False,762839,475,3201,...,https://pbs.twimg.com/profile_images/129545322...,AB2316,FFFFFF,FFFFFF,666666,False,False,False,False,0
3,15750898,FOX 13 Tampa Bay,FOX13News,"Tampa, FL",Bringing you the important stuff like breaking...,http://www.FOX13news.com,False,327587,4801,1744,...,https://pbs.twimg.com/profile_images/129319301...,0B2F8A,FFFFFF,E8EEF0,333333,True,False,False,False,0
4,1659167666,Vonte The Plug 🎤🔌,VonteThePlugNC,"Jacksonville Beach, FL",MOTIVATION 3 OUT NOW 🔥 Singles: ‘Lil Shawdy’ &...,https://music.apple.com/us/artist/vonte-the-pl...,False,13324,647,44,...,https://pbs.twimg.com/profile_images/118166240...,1DA1F2,C0DEED,DDEEF6,333333,True,False,True,False,1


In [3]:
def feature_engineering(df):
    used_columns = [
        'statuses_count',
        'followers_count',
        'friends_count',
        'favourites_count',
        'listed_count',
        'default_profile',
        'geo_enabled',
        'profile_use_background_image',
        'verified',
        'protected',
        'label'
    ]
    df_return = df[used_columns].fillna(0.0)
    def bool_to_int(text):
        if 'True' in text:
            return 1
        elif 'False' in text:
            return 0
        else:
            return text
    for i in df_return.select_dtypes('object'):
        df_return[i] = df_return[i].apply(bool_to_int)
    return df_return

In [4]:
feature_time = time.time()
df_train = feature_engineering(df_train)
end_feature_time = time.time()

In [5]:
X_train = df_train.drop('label', axis=1).values
y_train = df_train['label'].values
X_train

array([[    9798, 15349596,      692, ...,        1,        1,        0],
       [       0,        0,       44, ...,        1,        0,        0],
       [    5518,   762839,      475, ...,        0,        1,        0],
       ...,
       [    1439,      342,      849, ...,        1,        0,        0],
       [     674,       72,      367, ...,        1,        0,        0],
       [    4842,     5120,      351, ...,        1,        0,        0]],
      dtype=int64)

In [6]:
start_time = time.time()

smote = SMOTE()
smote_X, smote_y = smote.fit_resample(X_train, y_train)

e = EditedNearestNeighbours()
r_X, r_y = e.fit_resample(smote_X, smote_y)

a = AdaBoostClassifier(n_estimators=500, random_state=0)
a.fit(r_X, r_y)

end_time = time.time()

In [7]:
y_predict = a.predict(X_train)

In [8]:
print(classification_report(y_predict, y_train, digits=4))

              precision    recall  f1-score   support

           0     0.9182    0.5663    0.7006      5889
           1     0.4503    0.8757    0.5947      2389

    accuracy                         0.6556      8278
   macro avg     0.6843    0.7210    0.6476      8278
weighted avg     0.7832    0.6556    0.6700      8278



In [9]:
# ROC AUC score
roc_auc_score(y_predict, y_train)

0.7209951352711077

In [10]:
# Training time and feature engineering time
end_time - start_time, end_feature_time - feature_time

(4.498473405838013, 0.17152786254882812)

In [11]:
df_test = pd.read_csv('../data/set-3/test/profile_info.csv').iloc[:, 1:]
df_label_test = pd.read_csv('../data/set-3/test/label.csv').iloc[:, 1:]
df_test = df_test.merge(df_label_test, on='ID')
df_test.head()

Unnamed: 0,ID,name,screen_name,location,description,url,protected,followers_count,friends_count,listed_count,...,profile_image_url_https,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,has_extended_profile,default_profile,default_profile_image,label
0,1188812492010487808,Sharon Israel ⭐️⭐️⭐️,SharonIsrael10,Los Angeles & Colorado,Day 1 Trump supporter. I rode the escalator! C...,,False,16596,16944,1,...,https://pbs.twimg.com/profile_images/118883642...,1DA1F2,C0DEED,DDEEF6,333333,True,True,True,False,1
1,155659213,Cristiano Ronaldo,Cristiano,"Turim, Piemonte",This Privacy Policy addresses the collection a...,http://www.facebook.com/cristiano,False,87313765,50,83703,...,https://pbs.twimg.com/profile_images/115731332...,1643C9,FFFFFF,838387,0D0D0D,True,False,False,False,0
2,147725246,FoxNewsInsider,FoxNewsInsider,NYC,Stay connected with everything Fox - the lates...,http://insider.foxnews.com,False,161827,361,1471,...,https://pbs.twimg.com/profile_images/881932020...,0084B4,FFFFFF,DDEEF6,333333,True,False,False,False,0
3,1296248637194895360,El Realista,ElReali03271594,Puerto Rico,Aprendizaje. Pensamiento Crítico. Debate de id...,,False,9,543,0,...,https://pbs.twimg.com/profile_images/129624930...,1DA1F2,C0DEED,DDEEF6,333333,True,True,True,False,1
4,1339835893,Hillary Clinton,HillaryClinton,"New York, NY","2016 Democratic Nominee, SecState, Senator, ha...",http://onwardtogether.org,False,28513011,846,40146,...,https://pbs.twimg.com/profile_images/129119233...,0057B8,000000,000000,000000,False,True,False,False,0


In [12]:
df_test = feature_engineering(df_test)
X_test = df_test.drop('label', axis=1).values
y_test = df_test['label'].values
X_test

array([[   49757,    16596,    16944, ...,        1,        0,        0],
       [    3569, 87313765,       50, ...,        1,        1,        0],
       [   73786,   161827,      361, ...,        1,        1,        0],
       ...,
       [    2950,      309,     1961, ...,        0,        0,        0],
       [     152,      154,     1019, ...,        1,        0,        0],
       [     208,       68,      927, ...,        0,        0,        0]],
      dtype=int64)

In [13]:
y_test_predict = a.predict(X_test)

In [14]:
print(classification_report(y_test_predict, y_test, digits=4))

              precision    recall  f1-score   support

           0     0.8729    0.5563    0.6796       852
           1     0.4094    0.7915    0.5396       331

    accuracy                         0.6221      1183
   macro avg     0.6412    0.6739    0.6096      1183
weighted avg     0.7432    0.6221    0.6404      1183



In [15]:
roc_auc_score(y_test_predict, y_test)

0.6739394068337516

## 2. Tweet level detection

In [16]:
df_tweet = pd.read_csv('../data/set-3/train/tweet.csv').iloc[:, 1:]
df_tweet = df_tweet.merge(df_label, on='ID')
df_tweet.head()

Unnamed: 0,ID,tweet,label
0,17461978,RT @CarnivalCruise: 🎉 Are you ready to see wha...,0
1,17461978,Who has time for receipts? Not me. @epson rece...,0
2,17461978,Steady wants to encourage you to invest in you...,0
3,17461978,"Good one, @rishid. But let’s see if y'all can ...",0
4,17461978,#lsunationalchamps\n,0


In [17]:
df_tweet_dev = pd.read_csv('../data/set-3/dev/tweet.csv').iloc[:, 1:]
df_label_dev = pd.read_csv('../data/set-3/dev/label.csv').iloc[:, 1:]
df_tweet_dev = df_tweet_dev.merge(df_label_dev)
df_tweet_dev.head()

Unnamed: 0,ID,tweet,label
0,1224667050301255680,@SparklesOnlyme পুরোনো এইদিনের কথা\n,0
1,1224667050301255680,@BariraJahan হায়\n,0
2,1224667050301255680,সেদিন রাস্তার ধারে নুনু চুলাকাচ্ছিলাম।\n\nকে জ...,0
3,1224667050301255680,"নিজের বলতে কিছু নাইরে মাদারচোদ,\n\nসালার নুনু ...",0
4,1224667050301255680,ফোন টিপতে টিপতেই জীবন শেষ হবে অন্যকিছু আর টিপত...,0


In [18]:
URL_PATTERN = "^((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?$"

def isAllCaps(word):
    for c in word:
        if c.islower() or not c.isalpha():
            return False
    return True

def hasRepeatedLetters(word):
    prev = ''
    prev2 = ''
    for c in word:
        if c == prev:
            if c == prev2:
                return True
        prev2 = prev
        prev = c
    return False

# In the paper, the tags will be denote as <hashtag>, for example
# but for convenience for the nltk's word_tokenizer, we will change
# the <tag> to tagtag (<url> -> urltag)
def text_tags(row):
    rowlist = str(row).split()
    rowlist = [word if not word.strip().startswith(
        '#') else "hashtagtag" for word in rowlist]
    rowlist = [word if not word.strip().startswith(
        '@') else "usertag" for word in rowlist]
    rowlist = [word if not isAllCaps(
        word.strip()) else word.lower() + " allcapstag" for word in rowlist]
    rowlist = [word if not hasRepeatedLetters(
        word.strip()) else word + " repeatedtag" for word in rowlist]
    rowlist = [word.lower() for word in rowlist]
    rowlist = [re.sub(URL_PATTERN, "urltag", word) for word in rowlist]
    return " ".join(rowlist)

In [19]:
df_tweet["text_processed"] = df_tweet["tweet"].apply(text_tags)
df_tweet["text_processed"][:5]

0    rt allcapstag usertag 🎉 are you ready to see w...
1    who has time for receipts? not me. usertag rec...
2    steady wants to encourage you to invest in you...
3    good one, usertag but let’s see if y'all can d...
4                                           hashtagtag
Name: text_processed, dtype: object

In [20]:
df_tweet_dev["text_processed"] = df_tweet_dev["tweet"].apply(text_tags)
df_tweet_dev["text_processed"][:5]

0                           usertag পুরোনো এইদিনের কথা
1                                         usertag হায়
2    সেদিন রাস্তার ধারে নুনু চুলাকাচ্ছিলাম। কে জানি...
3    নিজের বলতে কিছু নাইরে মাদারচোদ, সালার নুনু টাও...
4    ফোন টিপতে টিপতেই জীবন শেষ হবে অন্যকিছু আর allc...
Name: text_processed, dtype: object

In [21]:
X_train = df_tweet["text_processed"]
X_dev = df_tweet_dev["text_processed"]

In [30]:
y_train = df_tweet["label"].values
y_dev = df_tweet_dev["label"].values

In [22]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(5000)

tokenizer.fit_on_texts(X_train)
words_to_index = tokenizer.word_index

In [23]:
def read_glove_vector(glove_vec):
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            w_line = line.split()
            curr_word = w_line[0]
            word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)
    return word_to_vec_map

In [24]:
# word_to_vec_map = read_glove_vector('gdrive/MyDrive/Thesis/Thesis Workspace/Notebooks/glove/glove.twitter.27B.50d.txt')
word_to_vec_map = read_glove_vector('../glove/glove.twitter.27B.50d.txt')

In [25]:
maxLen = 500
embed_vector_len = 50
vocab_len = len(words_to_index)

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        emb_matrix[index-1, :] = embedding_vector

embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_len,
    output_dim=embed_vector_len,
    input_length=maxLen,
    weights = [emb_matrix],
    trainable=False
)

In [26]:
def lstm_glove_model(input_shape):
    X_indices = tf.keras.Input(input_shape)
    embeddings = embedding_layer(X_indices)
    lstm = tf.keras.layers.LSTM(32, return_sequences=True)(embeddings)
    flatten = tf.keras.layers.Flatten()(lstm)
    dense_1 = tf.keras.layers.Dense(1024)(flatten)
    dense_2 = tf.keras.layers.Dense(256)(dense_1)
    dense_3 = tf.keras.layers.Dense(256)(dense_2)
    dense_4 = tf.keras.layers.Dense(256)(dense_3)
    dense_5 = tf.keras.layers.Dense(256)(dense_4)
    dense_6 = tf.keras.layers.Dense(128)(dense_5)
    dense_7 = tf.keras.layers.Dense(128)(dense_6)
    dense_8 = tf.keras.layers.Dense(64)(dense_7)
    dense_9 = tf.keras.layers.Dense(64)(dense_8)
    dense_10 = tf.keras.layers.Dense(32)(dense_9)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(dense_10)
    
    model = tf.keras.Model(inputs=X_indices, outputs=output)
    
    return model

In [27]:
X_train_indices = tokenizer.texts_to_sequences(X_train)
X_dev_indices = tokenizer.texts_to_sequences(X_dev)

X_train_indices = tf.keras.preprocessing.sequence.pad_sequences(X_train_indices, maxlen=maxLen, padding='post')
X_dev_indices = tf.keras.preprocessing.sequence.pad_sequences(X_dev_indices, maxlen=maxLen, padding='post')

In [31]:
model = lstm_glove_model(input_shape=(maxLen,))
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())
model.fit(X_train_indices, y_train, batch_size=256, epochs=50, validation_data=[X_dev_indices, y_dev])

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 500)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 500, 50)           34708050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 500, 32)           10624     
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_11 (Dense)             (None, 1024)              16385024  
_________________________________________________________________
dense_12 (Dense)             (None, 256)               262400    
_________________________________________________________________
dense_13 (Dense)             (None, 256)              

KeyboardInterrupt: 