## Neural Network Experiments with Keras

In [1]:
# set paths
import os
import sys

# Restrict CUDA to device 0. This is set here, before the cell that imports
# TensorFlow, so the setting is in place when the GPU runtime initialises.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Move from the notebook's folder up to the project root so that the relative
# './data/...' paths and the 'src' package used below resolve.
# The guard keeps the cell idempotent: once the working directory already
# contains 'src', re-running the cell no longer climbs two more levels.
if not os.path.isdir('src'):
    os.chdir('../../')
print(os.getcwd())

# Make the project root importable, without stacking duplicates on re-runs.
if '.' not in sys.path:
    sys.path.append('.')

e:\OneDriveLocal\OneDrive\学习\Graduate Study\2021Winter\twitter-nlp


In [16]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import spacy

import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Dense, Embedding, Dropout, Activation, Softmax, Bidirectional
from keras import Sequential

from src.utils.submission import prediction_output
from src.utils.preprocessing import TextNormalizer, generate_vocabulary, encode_document

In [3]:
# Show the devices TensorFlow can see (used here to confirm the GPU is visible).
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### 1. Data Loading and Preprocessing

In [4]:
# Fixed seed so the train/validation split (and therefore every metric
# reported further down) is the same on each Restart & Run All.
RANDOM_SEED = 42

df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

# Regex pattern -> replacement text, handed to TextNormalizer.
# Presumably the rules are applied in insertion order (punctuation/URLs first,
# whitespace collapsing last) - confirm in TextNormalizer.
# NOTE(review): inside the second [...] class the '|' characters are literals,
# not alternation, so that class removes each listed character individually.
rules = {r"[.!?#@÷>\+\-\*/':;(),\|\[\]_]+|[\x89Û|\x89Ó|\x89Ò|\x89|åê]+|http://t.co/[A-Za-z0-9]+|https://t.co/[A-Za-z0-9]+|\&gt|\&amp": ' ',
         r'\n': ' ',
         r'[ ]+': ' '}

text_normalizer = TextNormalizer(rules=rules)

# Keep the raw 'text' column untouched and store the cleaned text alongside it.
df_train['text_clean'] = text_normalizer.clean(df_train['text'])
df_test['text_clean'] = text_normalizer.clean(df_test['text'])

# One-hot encode the target so it matches the two-unit softmax output trained
# with categorical cross-entropy in the model section.
y_train = df_train['target']
label_encoder = OneHotEncoder()
y_train = label_encoder.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()

## train-validation split
# 70/30 split; random_state pins the shuffle so results are reproducible.
text_train, text_val, y_train, y_val = train_test_split(
    df_train['text_clean'], y_train, test_size=0.3, random_state=RANDOM_SEED)
text_test = np.array(df_test['text_clean'])


In [5]:
## creating vocabulary index list and dictionary
# Load the large English spaCy pipeline and build the vocabulary from the
# training texts only (validation and test texts are not passed in).
# Presumably word2ind maps token -> index, ind2word index -> token and
# ind2vec index -> embedding vector - confirm in generate_vocabulary.
nlp = spacy.load('en_core_web_lg')
word2ind, ind2word, ind2vec = generate_vocabulary(text_train, nlp)

In [6]:
# Encode every document of each split with the training vocabulary.
# Documents differ in length, so the encoded splits are ragged. dtype=object
# is required for that: without it NumPy emits a deprecation warning from 1.19
# and raises ValueError from 1.24 when asked to build an array from sequences
# of unequal length.
# Assumes encode_document returns one index sequence per document - confirm.
encoded_train = np.array([encode_document(document, word2ind) for document in text_train],
                         dtype=object)
encoded_val = np.array([encode_document(document, word2ind) for document in text_val],
                       dtype=object)
encoded_test = np.array([encode_document(document, word2ind) for document in text_test],
                        dtype=object)

In [7]:
# Length of the longest encoded document over train, validation and test;
# every split is padded to this common length so the model sees one fixed
# input size.
max_length = max(
    max(map(len, split))
    for split in (encoded_train, encoded_val, encoded_test)
)

# Pad each split to max_length (pad_sequences defaults are left unchanged).
padded_train, padded_val, padded_test = (
    pad_sequences(split, maxlen=max_length)
    for split in (encoded_train, encoded_val, encoded_test)
)

In [8]:
# Collect the vocabulary vectors into a single array, later used as the
# initial weights of the Embedding layer.
# NOTE(review): row order follows the dict's iteration order; presumably that
# matches the vocabulary indices - verify in generate_vocabulary.
ind2vec_array = np.array([*ind2vec.values()])

### 2. Model Building

In [17]:
# Sequence classifier: pretrained-vector embedding -> CuDNN LSTM -> dropout ->
# dense -> two-way softmax.
model = Sequential()

# Embedding table size: one row per vocabulary entry, one column per
# dimension of the vocabulary vectors (measured on the vector stored at index 0).
embed_input_dim = len(word2ind)
embed_output_dim = len(ind2vec[0])

# The embedding layer is initialised from the vocabulary vectors.
model.add(Embedding(input_dim=embed_input_dim,
                    output_dim=embed_output_dim,
                    input_length=max_length,
                    weights=[ind2vec_array]))
# The previous call passed an undefined name (`regul`) as a second positional
# argument, which raised NameError before the model could be built; the stray
# argument is removed.
# NOTE(review): the summary recorded under this cell lists a Bidirectional
# layer with output width 100, i.e. it came from a different version of this
# cell - re-run to refresh the output.
model.add(tf.compat.v1.keras.layers.CuDNNLSTM(128))
model.add(Dropout(0.5))
# NOTE(review): this Dense layer has no activation, so it composes linearly
# with the following Dense(2); consider activation='relu' if a hidden
# non-linearity was intended.
model.add(Dense(128))
model.add(Dense(2))
model.add(Activation('softmax'))

# Two-unit softmax + categorical cross-entropy pairs with the one-hot targets.
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 34, 300)           4042200   
                                                                 
 bidirectional (Bidirectiona  (None, 100)              140800    
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_9 (Dense)             (None, 128)               12928     
                                                                 
 dense_10 (Dense)            (None, 2)                 258       
                                                                 
 activation_2 (Activation)   (None, 2)                 0         
                                                      

In [18]:
# Train for 50 epochs in mini-batches of 32, evaluating on the held-out
# validation split after every epoch. The call is left as the cell's last
# expression so the returned History object is displayed.
model.fit(
    x=padded_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_data=(padded_val, y_val),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1efe3aea6c8>