In [1]:
#imports:
import pandas as pd
import os
import numpy as np
import regex as re
# Packages for data preparation
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
import tensorflow as tf

#packages for matrices:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

Using TensorFlow backend.


In [11]:
from sklearn.model_selection import GroupShuffleSplit

In [3]:
# -----------Loading the data: ---------------#

# 1.open the files and creat the dataframe:

# open the files by panda library :   https://towardsdatascience.com/natural-language-processing-classification-using-deep-learning-and-word2vec-50cbadd3bd6a
hamFiles = pd.read_csv("Datasets_files/g_tweets.csv")
spamFiles1 = pd.read_csv("Datasets_files/sp_soci_2_tweets.csv")
spamFiles2 = pd.read_csv("Datasets_files/sp_tra_1_tweets.csv")

#2.drop all the columns except the tweets text
hamFiles=hamFiles[['text','user_id']]
spamFiles1=spamFiles1[['text','user_id']]
spamFiles2=spamFiles2[['text','user_id']]

# 3.create the labels:
ones = []   # for ham
zeros_1 = []   # for spam1
zeros_2 = []   # for spam2

for i in range(hamFiles.shape[0]):
    ones.append(1)
for i in range(spamFiles1.shape[0]):    
    zeros_1.append(0)
for i in range(spamFiles2.shape[0]):    
    zeros_2.append(0)

hamFiles['label'] = ones
spamFiles1['label'] = zeros_1
spamFiles2['label'] = zeros_2




In [6]:
# 4. split ham files to take only number of tweets = spam tweets avialble:
print(spamFiles1.shape[0])
print(spamFiles2.shape[0])
print(hamFiles.shape[0])
numOfSamples=spamFiles1.shape[0]+spamFiles2.shape[0]
hamFiles, hamFiles_unused = hamFiles[:numOfSamples], hamFiles[numOfSamples:]
print(hamFiles.shape[0])

428542
145094
2839362
573636


In [7]:
# 4. merge all the files in one dataset:
tweets = hamFiles
tweets = tweets.append(spamFiles1)
tweets = tweets.append(spamFiles2)
tweets.head()

Unnamed: 0,text,user_id,label
0,RT @morningJewshow: Speaking about Jews and co...,678033.0,1
1,This age/face recognition thing..no reason pla...,678033.0,1
2,Only upside of the moment I can think of is th...,678033.0,1
3,If you're going to think about+create experien...,678033.0,1
4,Watching a thread on FB about possible future ...,678033.0,1


In [8]:
# -------------- preprocessing the data -------------------------#

# 1. cleaning the tweets the same way the preprocessing done for the pre trained Glove tweets : https://nlp.stanford.edu/projects/glove/

# Functions for the cleaning:    https://gist.github.com/tokestermw/cb87a97113da12acb388
FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = " {} ".format(hashtag_body.lower())
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text.lower()



In [9]:
# start cleaning the tweets: 
tweets['clean_tweets'] = tweets['text'].astype(str).apply(tokenize)        
print(tweets['clean_tweets'].head(100))

0     rt <allcaps> <user>: speaking about jews and c...
1     this age / face recognition thing. <repeat>no ...
2     only upside of the moment i can think of is th...
3     if you're going to think about+create experien...
4     watching a thread on fb <allcaps> about possib...
5                                      don't. ok? <url>
6     rt <allcaps> <user>: <hashtag> enoughsenough "...
7     rt <allcaps> <user>: kriss kross once rapped "...
8     rt <allcaps> <user>: watch baltimore native <u...
9     <user> <user> i didn't realize anyone has ever...
10    rt <allcaps> <user>: here's my one paragraph r...
11    <user> i stand by you. i don't want men tellin...
12                              <user> thank you becca.
13    <user> i dont' ask for rt <allcaps>s gen, but ...
14    not only$ but acts+messages from people that s...
15    to live with untreated ptsd <allcaps> is to fe...
16    <user> is also a columbine survivor. miles awa...
17    <user> is old skool web. a good web citize

In [13]:
# 2.spliting data in level of user ID:    https://stackoverflow.com/questions/54797508/how-to-generate-a-train-test-split-based-on-a-group-id
train_inds, test_inds = next(GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 7).split(tweets, groups=tweets['user_id']))

train = tweets.iloc[train_inds]
test = tweets.iloc[test_inds]

In [19]:
X_train=train['clean_tweets']
X_test=test['clean_tweets']
y_train=train['label']
y_test=test['label']


(901954,)
(901954,)
0         rt <allcaps> <user>: speaking about jews and c...
1         this age / face recognition thing. <repeat>no ...
2         only upside of the moment i can think of is th...
3         if you're going to think about+create experien...
4         watching a thread on fb <allcaps> about possib...
5                                          don't. ok? <url>
6         rt <allcaps> <user>: <hashtag> enoughsenough "...
7         rt <allcaps> <user>: kriss kross once rapped "...
8         rt <allcaps> <user>: watch baltimore native <u...
9         <user> <user> i didn't realize anyone has ever...
10        rt <allcaps> <user>: here's my one paragraph r...
11        <user> i stand by you. i don't want men tellin...
12                                  <user> thank you becca.
13        <user> i dont' ask for rt <allcaps>s gen, but ...
14        not only$ but acts+messages from people that s...
15        to live with untreated ptsd <allcaps> is to fe...
16        <user> is 

In [24]:
# 3.representing tweets as sequences prepairing for Glove embading:

# did not specify max num of words in vocab dic so it will include all
tk= Tokenizer(filters='!"\t\n')

tk.fit_on_texts(X_train) # to build our data vocab
X_train_seq = tk.texts_to_sequences(X_train) # to convert the text into seq of numbers related to the vocab index
X_test_seq = tk.texts_to_sequences(X_test)

#max length of tweet 140 , the updated tweitter is more tho
X_train_seq_trunc = pad_sequences(X_train_seq,maxlen=140) #pad the short tweets so all the seq of same length
X_test_seq_trunc = pad_sequences(X_test_seq,maxlen=140)


# split the training set into training and validation 
X_train_emb, X_valid_emb, y_train_emb, y_valid_emb = train_test_split(X_train_seq_trunc, y_train, test_size=0.2, random_state=37)


In [26]:
# ----------- Embading : 

# 1. load the pretraind Glove to a dictionary : https://towardsdatascience.com/word-embeddings-for-sentiment-analysis-65f42ea5d26e
emb_dict = {}
glove = open("Glove_pretrined/glove.twitter.27B.100d.txt")
for line in glove:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    emb_dict[word] = vector
glove.close()

In [28]:
# 2.generate the embading matrix for our data from the Glove dictonary:
NB_WORDS = len(tk.word_index)  # Parameter indicating the number of words we'll put in the dictionary, usully I see less but I try now all
GLOVE_DIM= 100;
emb_matrix = np.zeros((NB_WORDS+1, GLOVE_DIM))   # set the defult embading for the words to be zeros
for w, i in tk.word_index.items():
    if i < NB_WORDS:
        vect = emb_dict.get(w)
        if vect is not None:
            emb_matrix[i] = vect    # set the embading for the word to the one from the Glove
    else:
        break

In [30]:
# ------ Creating the model          : https://medium.com/@sabber/classifying-yelp-review-comments-using-cnn-lstm-and-pre-trained-glove-word-embeddings-part-3-53fcea9a17fa
MAX_LEN=140   #the prevuilsly decided length of the max seq of text
model_glove = Sequential()
model_glove.add(Embedding(NB_WORDS+1,100, input_length=MAX_LEN, weights=[emb_matrix], trainable=False))
model_glove.add(Dropout(0.2))
model_glove.add(Conv1D(64, 5, activation='relu'))
model_glove.add(MaxPooling1D(pool_size=4))
model_glove.add(LSTM(100))
model_glove.add(Dense(1, activation='sigmoid'))
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_glove.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 140, 100)          42613700  
_________________________________________________________________
dropout_1 (Dropout)          (None, 140, 100)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 136, 64)           32064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 34, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 42,711,865
Trainable params: 98,165
Non-trainable params: 42,613,700
_____________________________________

In [31]:
# Create callback for early stopping on validation loss. If the loss does not decrease in two consecutive tries, stop training.
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]

# Fit train data into the model:
###^^^^^^^^^^^^ I see examples with larger epoches$$$$come back to check how to decide this 
history=model_glove.fit(X_train_emb, y_train_emb, validation_data=(X_valid_emb, y_valid_emb),callbacks=callbacks, epochs = 3)
# Print results.
history = history.history
print('Validation accuracy: {acc}, loss: {loss}'.format(acc=history['val_accuracy'][-1], loss=history['val_loss'][-1]))

# Save model.
model_glove.save('Spambot_model.h5')

Train on 721563 samples, validate on 180391 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation accuracy: 0.8906320333480835, loss: 0.2577085436462187


In [32]:
# test by the test data without rebuilding the word embading dic:
model_glove.evaluate(x=X_test_seq_trunc, y=y_test)



[0.4224773896021752, 0.8285979628562927]

In [33]:
#-----Calculating the Recall,perciosion, F score:
# predict probabilities for test set    https://machinelearningmastery.com/how-to-calculate-precision-recall-f1-and-more-for-deep-learning-models/
yhat_probs = model_glove.predict(X_test_seq_trunc, verbose=0)
# predict crisp classes for test set
yhat_classes = model_glove.predict_classes(X_test_seq_trunc, verbose=0)

# reduce to 1d array
yhat_probs = yhat_probs[:, 0]
yhat_classes = yhat_classes[:, 0]

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, yhat_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, yhat_classes)
print('F1 score: %f' % f1)

Accuracy: 0.828598
Precision: 0.805593
Recall: 0.864927
F1 score: 0.834206
