In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf

from sklearn.model_selection import train_test_split

In [2]:
# Read at most the first 10,000 rows of each social-spambot tweet file and
# stack them into a single frame with a fresh 0..N-1 index.
df_bot = pd.concat(
    [
        pd.read_csv(csv_path, nrows=10000)
        for csv_path in (
            '../data/set-2/tweets/social_spambots_1.csv',
            '../data/set-2/tweets/social_spambots_2.csv',
            '../data/set-2/tweets/social_spambots_3.csv',
        )
    ]
).reset_index(drop=True)

In [3]:
# tweets.csv is read without a header row (columns get positional integer
# labels) and with a backslash as escape character; at most 30,000 rows are loaded.
df_naive = pd.read_csv(
    '../data/set-2/tweets/tweets.csv',
    header=None,
    escapechar='\\',
    nrows=30000,
)

In [4]:
# Positional column 12 of the header-less tweets.csv is removed so that the
# remaining columns line up with the spambot frame, whose column names are then
# borrowed. (Presumably column 12 is a field the spambot files do not have --
# TODO confirm which one.)
#
# The drop is guarded and not done in place so the cell can be re-run: after
# the rename there is no column labelled 12 any more, and an unconditional
# in-place drop raises KeyError on a second execution.
if 12 in df_naive.columns:
    df_naive = df_naive.drop(columns=12)
df_naive.columns = df_bot.columns

In [5]:
# Stack the bot tweets on top of the tweets loaded into df_naive.
df = pd.concat([df_bot, df_naive], ignore_index=True)

# Targets follow the same row order as df:
# 0.0 for every df_bot row, 1.0 for every df_naive row.
y = np.concatenate([np.zeros(len(df_bot)), np.ones(len(df_naive))])
label = y

df.head()

Unnamed: 0,id,text,source,user_id,truncated,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,retweeted_status_id,geo,...,favorited,retweeted,possibly_sensitive,num_hashtags,num_urls,num_mentions,created_at,timestamp,crawled_at,updated
0,532627591686275072,I Pooh - In silenzio 1968 http://t.co/ahvQxUqTws,"<a href=""http://www.facebook.com/twitter"" rel=...",24858289,,0,0,,0,,...,,,,0,1,0,Wed Nov 12 20:14:48 +0000 2014,2014-11-12 21:14:48,2014-11-12 21:44:09,2014-11-12 21:44:09
1,532624255058706432,http://t.co/HyI5EQKz6Q,"<a href=""http://www.facebook.com/twitter"" rel=...",24858289,,0,0,,0,,...,,,,0,1,0,Wed Nov 12 20:01:32 +0000 2014,2014-11-12 21:01:32,2014-11-12 21:44:09,2014-11-12 21:44:09
2,532513524460052480,"Tutti a tavola, con il filetto di baccalà. htt...","<a href=""http://www.facebook.com/twitter"" rel=...",24858289,,0,0,,0,,...,,,,0,1,0,Wed Nov 12 12:41:32 +0000 2014,2014-11-12 13:41:32,2014-11-12 21:44:09,2014-11-12 21:44:09
3,532297646669852672,http://t.co/NAHQ4l2pUy,"<a href=""http://www.facebook.com/twitter"" rel=...",24858289,,0,0,,0,,...,,,,0,1,0,Tue Nov 11 22:23:43 +0000 2014,2014-11-11 23:23:43,2014-11-12 21:44:09,2014-11-12 21:44:09
4,532295960807100416,Gold - Spandau Ballet http://t.co/o8ZJHt7Neu,"<a href=""http://www.facebook.com/twitter"" rel=...",24858289,,0,0,,0,,...,,,,0,1,0,Tue Nov 11 22:17:01 +0000 2014,2014-11-11 23:17:01,2014-11-12 21:44:09,2014-11-12 21:44:09


# Preprocessing

In [6]:
# Regular expression that text_tags uses to replace a whole URL-like token with
# "urltag". It is anchored with ^...$, so it only matches when the entire token
# fits the pattern.
# Kept as a raw string: the regex escapes (\/, \s, \w, \-, \.) are not valid
# Python string escapes, and a non-raw literal triggers an "invalid escape
# sequence" warning on recent interpreters. The resulting pattern is unchanged.
URL_PATTERN = r"^((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?$"

def isAllCaps(word):
    """Return True when `word` consists only of letters and is written in capitals.

    Args:
        word: The token to inspect (a string).

    Returns:
        bool: True if every character is alphabetic and the word is upper-case;
        False otherwise. Digits or punctuation anywhere in the word make the
        result False.

    The empty string and words written in scripts without letter case
    (e.g. CJK) are *not* reported as all caps: ``str.isupper`` requires at
    least one cased character. The previous character-by-character check
    returned True for both.
    """
    return word.isalpha() and word.isupper()

def hasRepeatedLetters(word):
    """Tell whether `word` contains the same character three times in a row.

    Despite the name, every character is considered (digits and punctuation
    included), and the comparison is case-sensitive.

    Args:
        word: The token to inspect (a string).

    Returns:
        bool: True if some character is immediately repeated at least twice
        (e.g. "sooo"), False otherwise -- including for words shorter than
        three characters.
    """
    # Slide a window of three consecutive characters over the word.
    return any(
        first == second == third
        for first, second, third in zip(word, word[1:], word[2:])
    )

# In the paper, the tags are denoted as <hashtag>, for example,
# but for the convenience of nltk's word_tokenizer, we change
# <tag> to tagtag (e.g. <url> -> urltag)
def text_tags(row):
    """Normalise one tweet: lower-case it and replace/annotate special tokens.

    The text is split on whitespace and each token is processed in this order:

    1. a token starting with '#' becomes "hashtagtag";
    2. a token starting with '@' becomes "usertag";
    3. an all-caps token (see isAllCaps) is lower-cased and followed by
       " allcapstag";
    4. a token with a character repeated three times in a row (see
       hasRepeatedLetters) is followed by " repeatedtag";
    5. the token is lower-cased;
    6. if the whole token matches URL_PATTERN it becomes "urltag".

    Args:
        row: The tweet text. Non-string values are converted with ``str``.
            A missing value (None or float NaN, which is how pandas represents
            an empty cell) is treated as an empty tweet.

    Returns:
        str: The processed tokens joined by single spaces ("" for a missing
        or whitespace-only tweet).
    """
    # Without this guard str(NaN) would inject the literal word "nan" into the corpus.
    if row is None or (isinstance(row, float) and row != row):
        return ""

    tagged = []
    # str.split() with no argument already strips surrounding whitespace.
    for word in str(row).split():
        if word.startswith('#'):
            word = "hashtagtag"
        elif word.startswith('@'):
            word = "usertag"

        if isAllCaps(word):
            word = word.lower() + " allcapstag"
        if hasRepeatedLetters(word):
            word = word + " repeatedtag"

        # Lower-case before URL matching, as the pipeline always did.
        tagged.append(re.sub(URL_PATTERN, "urltag", word.lower()))
    return " ".join(tagged)

In [7]:
# Tag every tweet, store the result next to the raw text, and preview the
# first five processed tweets.
df["text_processed"] = df["text"].apply(text_tags)
df["text_processed"].head()

0          i allcapstag pooh - in silenzio 1968 urltag
1                                               urltag
2    tutti a tavola, con il filetto di baccalà. urltag
3                                               urltag
4                         gold - spandau ballet urltag
Name: text_processed, dtype: object

# GloVe Embeddings + LSTM

In [8]:
# Hold out 20% of the processed tweets for testing; random_state=0 makes the
# shuffle reproducible.
processed_texts = df["text_processed"].values
X_train, X_test, y_train, y_test = train_test_split(
    processed_texts, y, test_size=0.2, random_state=0
)

In [9]:
# The vocabulary is learned from the training split only. num_words=5000 limits
# which (most frequent) words are kept when texts are later turned into id
# sequences; word_index itself still lists every word seen during fitting.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

words_to_index = tokenizer.word_index

In [10]:
def read_glove_vector(glove_vec):
    """Load a GloVe text file into a ``{token: vector}`` dictionary.

    Each non-blank line is expected to hold a token followed by its vector
    components, separated by single spaces.

    Args:
        glove_vec: Path of the UTF-8 encoded GloVe text file.

    Returns:
        dict: Maps each token (str) to a 1-D ``numpy.float64`` array with the
        components found on its line. If a token occurs more than once, the
        last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component cannot be parsed as a float.
    """
    word_to_vec_map = {}
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        for line in f:
            stripped = line.rstrip()
            if not stripped:
                # Tolerate blank lines instead of failing with IndexError.
                continue
            # Split on the single-space delimiter only: a bare split() would
            # also break on Unicode whitespace characters that can themselves
            # be tokens, shifting the first number into the token slot.
            curr_word, *components = stripped.split(' ')
            word_to_vec_map[curr_word] = np.array(
                [value for value in components if value], dtype=np.float64
            )
    return word_to_vec_map

In [11]:
# Parse the pre-trained Twitter GloVe vectors (the "50d" file) into a dict.
word_to_vec_map = read_glove_vector(
    '../glove/glove.twitter.27B.50d.txt'
)

In [12]:
maxLen = 200            # number of token ids per tweet fed to the network
embed_vector_len = 50   # must equal the length of the loaded GloVe vectors
vocab_len = len(words_to_index)

# The Keras Tokenizer numbers words starting at 1, and id 0 is what
# pad_sequences uses for padding. The lookup table therefore needs
# vocab_len + 1 rows, and the vector of the word with id i must live in row i.
# Row 0 (padding) and words missing from GloVe stay all-zero.
# Storing word i in row i-1 with only vocab_len rows made the layer return a
# different word's vector for every token.
emb_matrix = np.zeros((vocab_len + 1, embed_vector_len))

for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        emb_matrix[index, :] = embedding_vector

# Frozen embedding layer initialised with the GloVe vectors.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_len + 1,
    output_dim=embed_vector_len,
    input_length=maxLen,
    weights = [emb_matrix],
    trainable=False
)

In [16]:
def lstm_glove_model(input_shape):
    """Build the tweet-level classifier: frozen GloVe embeddings -> LSTM -> dense head.

    Uses the module-level ``embedding_layer``.

    Args:
        input_shape: Shape of one input sample (without the batch axis),
            e.g. ``(maxLen,)`` for a padded sequence of token ids.

    Returns:
        tf.keras.Model: Uncompiled model mapping a batch of token-id sequences
        to one sigmoid probability per sequence, i.e. output shape (batch, 1).
    """
    X_indices = tf.keras.Input(input_shape)
    embeddings = embedding_layer(X_indices)
    # Only the final LSTM state is kept. With return_sequences=True the model
    # emitted one prediction per time step, (batch, steps, 1), which does not
    # match the one-label-per-tweet targets.
    lstm = tf.keras.layers.LSTM(32)(embeddings)
    # NOTE(review): these two Dense layers have no activation, so together they
    # are a single linear map -- confirm whether a non-linearity was intended.
    dense_1 = tf.keras.layers.Dense(16)(lstm)
    dense_2 = tf.keras.layers.Dense(8)(dense_1)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(dense_2)

    model = tf.keras.Model(inputs=X_indices, outputs=output)

    return model

In [17]:
# Turn each training tweet into its list of token ids, then bring every list
# to exactly maxLen ids: shorter ones are zero-padded at the end
# (padding='post'), longer ones are truncated by pad_sequences' default rule.
X_train_indices = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_train),
    maxlen=maxLen,
    padding='post',
)

In [18]:
# Build the GloVe + LSTM classifier, compile it for binary classification
# with Adam, and train it on the padded training sequences.
model = lstm_glove_model(input_shape=(maxLen,))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

model.fit(x=X_train_indices, y=y_train, epochs=50, batch_size=256)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x230028142e0>

# GloVe Embeddings + Contextual LSTM

In [19]:
def contextual_lstm_glove_model(text_input_shape, metadata_input_shape):
    """Build the contextual classifier: LSTM tweet encoding combined with metadata.

    The tweet is embedded with the module-level ``embedding_layer`` and encoded
    by an LSTM; the encoding is concatenated with the metadata input and passed
    through the dense head.

    Args:
        text_input_shape: Shape of one padded token-id sequence (without the
            batch axis), e.g. ``(maxLen,)``.
        metadata_input_shape: Shape of one metadata sample (without the batch
            axis). Assumed to be a flat feature vector ``(n_features,)`` so it
            can be concatenated with the LSTM encoding -- confirm with the caller.

    Returns:
        tf.keras.Model: Uncompiled model with inputs ``[text, metadata]`` and
        one sigmoid probability per sample, i.e. output shape (batch, 1).
    """
    text = tf.keras.Input(text_input_shape)
    embeddings = embedding_layer(text)
    # Keep only the final LSTM state, (batch, 32): a per-time-step sequence
    # (return_sequences=True) is rank 3 and cannot be concatenated with the
    # rank-2 metadata tensor.
    lstm = tf.keras.layers.LSTM(32)(embeddings)

    metadata = tf.keras.Input(metadata_input_shape)

    concat = tf.keras.layers.concatenate([lstm, metadata])

    dense_1 = tf.keras.layers.Dense(16)(concat)
    dense_2 = tf.keras.layers.Dense(8)(dense_1)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(dense_2)

    # Both Input tensors must be declared; the previous `inputs=X_indices`
    # referred to a name that does not exist in this function.
    model = tf.keras.Model(inputs=[text, metadata], outputs=output)

    return model