In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf

from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
df_bot = pd.concat(
    [
        pd.read_csv('gdrive/MyDrive/Thesis/Thesis Workspace/Notebooks/data/set-2/tweets/social_spambots_1.csv', nrows=20000),
        pd.read_csv('gdrive/MyDrive/Thesis/Thesis Workspace/Notebooks/data/set-2/tweets/social_spambots_2.csv', nrows=20000),
        pd.read_csv('gdrive/MyDrive/Thesis/Thesis Workspace/Notebooks/data/set-2/tweets/social_spambots_3.csv', nrows=20000)
    ]
).reset_index(drop=True)

In [4]:
df_naive = pd.read_csv('gdrive/MyDrive/Thesis/Thesis Workspace/Notebooks/data/set-2/tweets/tweets.csv', header=None, escapechar='\\', nrows=60000)

In [5]:
df_naive.drop(12, axis=1, inplace=True)
df_naive.columns = df_bot.columns

In [6]:
df = pd.concat([df_bot, df_naive], ignore_index=True)
label = y = np.concatenate(
    [
        np.zeros((df_bot.shape[0],)),
        np.ones((df_naive.shape[0],))
    ]
)
df.head()

Unnamed: 0,id,text,source,user_id,truncated,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,retweeted_status_id,geo,place,contributors,retweet_count,reply_count,favorite_count,favorited,retweeted,possibly_sensitive,num_hashtags,num_urls,num_mentions,created_at,timestamp,crawled_at,updated
0,532627591686275072,I Pooh - In silenzio 1968 http://t.co/ahvQxUqTws,"<a href=""http://www.facebook.com/twitter"" rel=...",24858289,,0,0,,0,,,,0,0,0,,,,0,1,0,Wed Nov 12 20:14:48 +0000 2014,2014-11-12 21:14:48,2014-11-12 21:44:09,2014-11-12 21:44:09
1,532624255058706432,http://t.co/HyI5EQKz6Q,"<a href=""http://www.facebook.com/twitter"" rel=...",24858289,,0,0,,0,,,,0,0,0,,,,0,1,0,Wed Nov 12 20:01:32 +0000 2014,2014-11-12 21:01:32,2014-11-12 21:44:09,2014-11-12 21:44:09
2,532513524460052480,"Tutti a tavola, con il filetto di baccalà. htt...","<a href=""http://www.facebook.com/twitter"" rel=...",24858289,,0,0,,0,,,,0,0,0,,,,0,1,0,Wed Nov 12 12:41:32 +0000 2014,2014-11-12 13:41:32,2014-11-12 21:44:09,2014-11-12 21:44:09
3,532297646669852672,http://t.co/NAHQ4l2pUy,"<a href=""http://www.facebook.com/twitter"" rel=...",24858289,,0,0,,0,,,,0,0,0,,,,0,1,0,Tue Nov 11 22:23:43 +0000 2014,2014-11-11 23:23:43,2014-11-12 21:44:09,2014-11-12 21:44:09
4,532295960807100416,Gold - Spandau Ballet http://t.co/o8ZJHt7Neu,"<a href=""http://www.facebook.com/twitter"" rel=...",24858289,,0,0,,0,,,,0,0,0,,,,0,1,0,Tue Nov 11 22:17:01 +0000 2014,2014-11-11 23:17:01,2014-11-12 21:44:09,2014-11-12 21:44:09


# Preprocessing

In [7]:
URL_PATTERN = "^((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?$"

def isAllCaps(word):
    for c in word:
        if c.islower() or not c.isalpha():
            return False
    return True

def hasRepeatedLetters(word):
    prev = ''
    prev2 = ''
    for c in word:
        if c == prev:
            if c == prev2:
                return True
        prev2 = prev
        prev = c
    return False

# In the paper, the tags will be denote as <hashtag>, for example
# but for convenience for the nltk's word_tokenizer, we will change
# the <tag> to tagtag (<url> -> urltag)
def text_tags(row):
    rowlist = str(row).split()
    rowlist = [word.strip() for word in rowlist]
    rowlist = [word if not word.strip().startswith(
        '#') else "hashtagtag" for word in rowlist]
    rowlist = [word if not word.strip().startswith(
        '@') else "usertag" for word in rowlist]
    rowlist = [word if not isAllCaps(
        word.strip()) else word.lower() + " allcapstag" for word in rowlist]
    rowlist = [word if not hasRepeatedLetters(
        word.strip()) else word + " repeatedtag" for word in rowlist]
    rowlist = [word.lower() for word in rowlist]
    rowlist = [re.sub(URL_PATTERN, "urltag", word) for word in rowlist]
    return " ".join(rowlist)

In [8]:
df["text_processed"] = df["text"].apply(text_tags)
df["text_processed"][:5]

0          i allcapstag pooh - in silenzio 1968 urltag
1                                               urltag
2    tutti a tavola, con il filetto di baccalà. urltag
3                                               urltag
4                         gold - spandau ballet urltag
Name: text_processed, dtype: object

In [9]:
train_idx, test_idx, y_train, y_test = train_test_split(
    np.arange(df.shape[0]),
    y,
    test_size=0.2,
    random_state=0
)
X_train = df["text_processed"][train_idx]
X_test = df["text_processed"][test_idx]

In [10]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)

tokenizer.fit_on_texts(X_train)
words_to_index = tokenizer.word_index

# GloVe Embeddings + LSTM

In [11]:
def read_glove_vector(glove_vec):
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            w_line = line.split()
            curr_word = w_line[0]
            word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)
    return word_to_vec_map

In [12]:
word_to_vec_map = read_glove_vector('gdrive/MyDrive/Thesis/Thesis Workspace/Notebooks/glove/glove.twitter.27B.50d.txt')

In [13]:
maxLen = 500
embed_vector_len = 50
vocab_len = len(words_to_index)

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        emb_matrix[index-1, :] = embedding_vector

embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_len,
    output_dim=embed_vector_len,
    input_length=maxLen,
    weights = [emb_matrix],
    trainable=False
)

In [14]:
def lstm_glove_model(input_shape):
    X_indices = tf.keras.Input(input_shape)
    embeddings = embedding_layer(X_indices)
    lstm = tf.keras.layers.LSTM(32, return_sequences=True)(embeddings)
    flatten = tf.keras.layers.Flatten()(lstm)
    dense_1 = tf.keras.layers.Dense(1024)(flatten)
    dense_2 = tf.keras.layers.Dense(256)(dense_1)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(dense_2)
    
    model = tf.keras.Model(inputs=X_indices, outputs=output)
    
    return model

In [15]:
X_train_indices = tokenizer.texts_to_sequences(X_train)

X_train_indices = tf.keras.preprocessing.sequence.pad_sequences(X_train_indices, maxlen=maxLen, padding='post')

In [16]:
model = lstm_glove_model(input_shape=(maxLen,))
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())
model.fit(X_train_indices, y_train, batch_size=256, epochs=50)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 500)]             0         
                                                                 
 embedding (Embedding)       (None, 500, 50)           4138550   
                                                                 
 lstm (LSTM)                 (None, 500, 32)           10624     
                                                                 
 flatten (Flatten)           (None, 16000)             0         
                                                                 
 dense (Dense)               (None, 1024)              16385024  
                                                                 
 dense_1 (Dense)             (None, 256)               262400    
                                                                 
 dense_2 (Dense)             (None, 1)                 257   

<keras.callbacks.History at 0x7fe484a75410>

In [17]:
X_test_indices = tokenizer.texts_to_sequences(X_test)
X_test_indices = tf.keras.preprocessing.sequence.pad_sequences(X_test_indices, maxlen=maxLen, padding='post')
model.evaluate(X_test_indices, y_test)



[0.45125237107276917, 0.7954583168029785]

# GloVe Embeddings + Contextual LSTM

In [18]:
def contextual_lstm_glove_model(text_input_shape, metadata_input_shape):
    text = tf.keras.Input(text_input_shape)
    embeddings = embedding_layer(text)
    lstm = tf.keras.layers.LSTM(32, return_sequences=True)(embeddings)
    flatten = tf.keras.layers.Flatten()(lstm)
    
    metadata = tf.keras.Input(metadata_input_shape)
    
    concat = tf.keras.layers.concatenate([flatten, metadata], axis=1)
    
    dense_1 = tf.keras.layers.Dense(1024)(concat)
    dense_2 = tf.keras.layers.Dense(256)(dense_1)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(dense_2)
    
    model = tf.keras.Model(inputs=[text, metadata], outputs=output)
    
    return model

In [19]:
def get_metadata(df):
    COLUMN_NAMES = ['retweet_count', 'favorite_count',
                    'num_hashtags', 'num_urls', 'num_mentions']
    return df[COLUMN_NAMES].replace('N', np.nan).fillna(0)

In [20]:
df_metadata = get_metadata(df)

In [21]:
mean = df_metadata.mean(axis=0)
std = df_metadata.std(axis=0)

In [22]:
df_metadata = (df_metadata - mean) / std
df_metadata.head()

Unnamed: 0,retweet_count,favorite_count,num_hashtags,num_urls,num_mentions
0,-0.038828,-0.095142,-0.394058,1.267671,-0.703404
1,-0.038828,-0.095142,-0.394058,1.267671,-0.703404
2,-0.038828,-0.095142,-0.394058,1.267671,-0.703404
3,-0.038828,-0.095142,-0.394058,1.267671,-0.703404
4,-0.038828,-0.095142,-0.394058,1.267671,-0.703404


In [23]:
X_meta_train = df_metadata.iloc[train_idx].values
X_meta_test = df_metadata.iloc[test_idx].values

In [24]:
model = contextual_lstm_glove_model(text_input_shape=(maxLen,), metadata_input_shape=(5,))
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())
model.fit([X_train_indices, X_meta_train], y_train, batch_size=256, epochs=50)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 500)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 500, 50)      4138550     ['input_2[0][0]']                
                                                                                                  
 lstm_1 (LSTM)                  (None, 500, 32)      10624       ['embedding[1][0]']              
                                                                                                  
 flatten_1 (Flatten)            (None, 16000)        0           ['lstm_1[0][0]']                 
                                                                                            

<keras.callbacks.History at 0x7fe48544b1d0>

In [25]:
model.evaluate([X_test_indices, X_meta_test], y_test)



[0.44989725947380066, 0.8054583072662354]

In [26]:
model.predict([X_test_indices, X_meta_test])

array([[0.27726993],
       [0.11961023],
       [0.04404958],
       ...,
       [0.76257765],
       [0.4046005 ],
       [0.97232705]], dtype=float32)