<a href="https://colab.research.google.com/github/danm91/le_kingmakers/blob/seb_dl_models/notebooks/seb_dl_word_2_vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Notebook for DL Model; using i) word2vec with RNN, ii) CNN

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from preproc_text import process_tweets
from preproc_abbv import abbreviations
from preproc_class import TextPreprocess

# from danm91.le_kingmakers.le_kingmakers.preproc_text import process_tweets
# from le_kingmakers.preproc_abbv import abbreviations
# from le_kingmakers.preproc_class import TextPreprocess
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

## i) Word2Vec with RNN

### Importing Data

In [None]:
# Load the raw CSV from disk.
# The directory defaults to the original local checkout but can be overridden with the
# RAW_DATA_DIR environment variable, so the notebook is not tied to one machine.
raw_data_dir = os.environ.get('RAW_DATA_DIR', '/home/sbyhung/code/danm91/le_kingmakers/raw_data')
csv_path = os.path.join(raw_data_dir, 'training.1600000.processed.noemoticon.csv')
# header=None: the first row is treated as data and columns get integer labels (used positionally below).
df = pd.read_csv(csv_path, header=None)

In [None]:
# Take a fixed-seed 0.5% subsample of the rows for testing
sample_size = int(len(df) * 0.005)
data_sample = df.sample(n=sample_size, random_state=0)
data_sample.shape

(8000, 6)

In [None]:
# Column 5 becomes the feature Series X, column 0 the target Series y
X, y = data_sample.iloc[:, 5], data_sample.iloc[:, 0]
display(X.shape, y.shape)

(8000,)

(8000,)

In [None]:
# Recode the labels to binary: 0 stays 0, 4 (positive) becomes 1.
# Map from the raw column rather than from `y` itself so that re-running this cell is
# idempotent (mapping an already-recoded `y` would turn every 1 into NaN).
y = data_sample.iloc[:, 0].map({0: 0, 4: 1})
# Series.map turns any value missing from the dict into NaN; fail loudly instead of training on NaN labels.
assert y.notna().all(), "unexpected label value in column 0 (expected only 0 or 4)"

In [None]:
# Clean every text entry with the project's `process_tweets` helper (imported from preproc_text).
# NOTE(review): this overwrites `X` with its cleaned version, so re-running the cell feeds
# already-cleaned text back into `process_tweets` — confirm the helper is idempotent.
X = X.apply(process_tweets)

In [None]:
# Two-stage stratified split -> train:test:val = 60:20:20.
# Stage 1 holds out 20% as the test set; stage 2 takes 25% of the remaining 80%
# (i.e. 20% of the total) as the validation set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=0
)
display(
    X_train.shape, X_test.shape, X_val.shape,
    y_train.shape, y_test.shape, y_val.shape,
)

(4800,)

(1600,)

(1600,)

(4800,)

(1600,)

(1600,)

### Processing Data

#### Preprocess Data

In [None]:
# Tokenise each split: every text entry becomes a list of word tokens
X_train, X_test, X_val = (
    split.apply(word_tokenize) for split in (X_train, X_test, X_val)
)

#### Training with Word2Vec

In [None]:
### setting parameters:
# min_count=2     so words must appear at least 2 times to enter the vocabulary (to exclude typos)
# vector_size=100 to control the size of the embedding space
# window=5        maximum distance between the current and the predicted word within a sentence
# The model is trained on the training split only.

word2vec = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=2)


#### Converting data to feed into RNN

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def embed_sentence(word2vec, sentence):
    """Convert a sentence (list of words) into a matrix of word vectors.

    Words that are not in the Word2Vec vocabulary are skipped.

    Args:
        word2vec: trained model exposing its keyed vectors as ``word2vec.wv``.
        sentence: iterable of word tokens.

    Returns:
        np.ndarray of shape ``(n_known_words, vector_size)``. When none of the
        words is in the vocabulary the result is an empty ``(0, vector_size)``
        float32 matrix, so every sentence yields a 2-D array of the same width.
    """
    vectors = [word2vec.wv[word] for word in sentence if word in word2vec.wv]
    if not vectors:
        # np.array([]) would be 1-D with shape (0,) and dtype float64; keep the 2-D layout instead.
        return np.empty((0, word2vec.wv.vector_size), dtype='float32')
    return np.array(vectors)

def embedding(word2vec, sentences):
    """Embed every sentence of ``sentences`` with ``embed_sentence``.

    Args:
        word2vec: trained model passed through to ``embed_sentence``.
        sentences: iterable of sentences, each a list of words.

    Returns:
        A list with one embedded matrix per sentence, in input order.
    """
    return [embed_sentence(word2vec, sentence) for sentence in sentences]

# Embed the train, test and validation sentences with the trained Word2Vec model
X_train_embed, X_test_embed, X_val_embed = (
    embedding(word2vec, split) for split in (X_train, X_test, X_val)
)

# Pad (at the end) every embedded sentence to a fixed length of 200 timesteps
X_train_pad, X_test_pad, X_val_pad = (
    pad_sequences(embedded, dtype='float32', padding='post', maxlen=200)
    for embedded in (X_train_embed, X_test_embed, X_val_embed)
)

In [None]:
# TEST ME: sanity checks on the padded arrays (train, test and validation).
# The loop variable is deliberately not called `X`: that name holds the cleaned text Series
# consumed by the train/test split cell, and overwriting it would break re-running that cell.
for padded, tokens in [(X_train_pad, X_train), (X_test_pad, X_test), (X_val_pad, X_val)]:
    assert isinstance(padded, np.ndarray)
    # last axis must match the embedding dimension
    assert padded.shape[-1] == word2vec.wv.vector_size
    # one padded matrix per input sentence
    assert padded.shape[0] == len(tokens)

### Defining Model

In [None]:
# defining model

def init_model():
    """Build and compile the recurrent binary classifier.

    Architecture: Masking -> LSTM(35, tanh) -> Dense(15, relu) -> Dense(1, sigmoid).
    Compiled with binary cross-entropy loss, the 'rmsprop' optimizer and accuracy
    as the reported metric.

    Returns:
        The compiled Sequential model.
    """
    rnn = Sequential([
        layers.Masking(),  # default mask value; intended to skip the zero-padded timesteps
        layers.LSTM(35, activation='tanh'),
        layers.Dense(15, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])
    rnn.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return rnn

### Fitting Model

In [None]:
# initialize model
model = init_model()

In [None]:
# fitting model

# Stop once the monitored metric has not improved for 3 epochs and roll back to the best weights
es = EarlyStopping(restore_best_weights=True, patience=3)

model.fit(
    x=X_train_pad,
    y=y_train,
    validation_data=(X_val_pad, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[es],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


<tensorflow.python.keras.callbacks.History at 0x7fb06b8b9ca0>

In [None]:
res = model.evaluate(X_test_pad, y_test, verbose=0)
res

[0.6711111664772034, 0.5874999761581421]

## ii) Using CNN

### Data

In [None]:
# Reuse the Word2Vec-embedded, padded arrays built in section i)
display(X_train_pad.shape, X_test_pad.shape, X_val_pad.shape)

(4800, 200, 100)

(1600, 200, 100)

(1600, 200, 100)

### Model

#### Defining model

In [None]:
def init_cnn_model_2():
    """Build and compile the 1-D convolutional binary classifier.

    Architecture: Conv1D(64) -> Conv1D(32) -> Conv1D(16) (kernel size 3, ReLU each)
    -> Flatten -> Dense(16, relu) -> Dense(8, relu) -> Dense(1, sigmoid).
    Compiled with binary cross-entropy loss, the 'adam' optimizer and accuracy
    as the reported metric.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    # Conv1D applies no activation by default; without one the three stacked convolutions
    # are purely linear and collapse to a single linear convolution, so give each a ReLU.
    model.add(layers.Conv1D(64, kernel_size=3, activation='relu'))
    model.add(layers.Conv1D(32, kernel_size=3, activation='relu'))
    model.add(layers.Conv1D(16, kernel_size=3, activation='relu'))
    model.add(layers.Flatten())
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(8, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model_cnn_2 = init_cnn_model_2()



#### Fitting and printing

In [None]:

# Stop once the monitored metric has not improved for 10 epochs and roll back to the best weights
es_2 = EarlyStopping(patience=10, restore_best_weights=True)

# Validate on the dedicated validation split (X_val_pad, y_val), exactly as the RNN section does.
# validation_split=0.3 would instead carve 30% out of the training data, leaving the prepared
# validation set unused and the two models evaluated under different conditions.
history = model_cnn_2.fit(X_train_pad, y_train,
          epochs=50,
          batch_size=32,
          validation_data=(X_val_pad, y_val),
          callbacks=[es_2]
         )


# evaluate() returns [loss, accuracy] given the metrics the model was compiled with
res = model_cnn_2.evaluate(X_test_pad, y_test, verbose=0)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
The accuracy evaluated on the test set is of 57.625%
