In [0]:
"""
    References:  https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
                 https://github.com/keras-team/keras/blob/master/examples
                 http://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/view/9745
                 https://github.com/airalcorn2/Recurrent-Convolutional-Neural-Network-Text-Classifier/blob/master/recurrent_convolutional_keras.py
"""
import os
import re
import nltk
import shutil
import math
nltk.download('stopwords')
nltk.download('punkt')
from google.colab import files
import urllib.request
import numpy as np
import pandas as pd
import os
import string
import keras
from google.colab import drive
from sklearn.metrics import accuracy_score
from keras.models import load_model
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.initializers import Constant
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras import backend
from keras.optimizers import Adamax, Adam, Nadam
from keras.layers import BatchNormalization, AveragePooling1D, GlobalAveragePooling1D
from keras.layers import Conv1D, Dense, Input, Lambda, CuDNNLSTM, CuDNNGRU, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPooling1D, MaxPooling1D, Dropout, SpatialDropout1D
from keras.layers.merge import concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

### IMPORT FILES FROM GOOGLE DRIVE

In [0]:
file_paths = ['test.csv', 'valid.csv', 'train.csv', 'glove.6B.100d.txt']
# Connect to Google Drive
drive.mount('/content/gdrive')
# Relative path: it only resolves to the mount point above while the
# working directory is /content.
base_url = 'gdrive/My Drive/'

# Create the Drive folders this notebook writes to. `exist_ok=True` keeps the
# cell re-runnable: the previous os.mkdir calls lived in the error handler and
# raised FileExistsError as soon as a folder already existed.
for folder in ("data", "data/valid_prediction", "data/test_prediction"):
    os.makedirs(base_url + folder, exist_ok=True)

# Copy every file that is not available locally yet from Google Drive.
# A file missing from Drive is only reported here; other errors propagate
# instead of being silently swallowed.
for path in file_paths:
    if not os.path.isfile(path):
        try:
            shutil.copy(base_url + "data/" + path, path)
        except FileNotFoundError:
            print(path + " Not Found in Google Drive")

### DOWNLOAD FILES THAT WERE NOT FOUND IN GOOGLE DRIVE

In [0]:
# Download locations for the datasets and the GloVe embedding file.
# NOTE(review): these URLs embed access tokens (`token=` query parameter);
# consider keeping them out of the notebook.
test_path = "https://firebasestorage.googleapis.com/v0/b/hyungyu415.appspot.com/o/test.csv?alt=media&token=979d05cf-d252-4d15-8ec4-a8ed556c6fe9"
valid_path = "https://firebasestorage.googleapis.com/v0/b/hyungyu415.appspot.com/o/valid.csv?alt=media&token=deb7ef2d-1442-48b1-8e17-153aff0bed94"
train_path = "https://firebasestorage.googleapis.com/v0/b/hyungyu415.appspot.com/o/train.csv?alt=media&token=a2f73b7a-f471-4f4a-a15c-57714a6870b2"
embedding_path = "https://firebasestorage.googleapis.com/v0/b/ryan-blog415.appspot.com/o/glove.6B.100d.txt?alt=media&token=e234f3ae-f47a-45ec-a303-159ea4111fbb"
download_paths = [test_path, valid_path, train_path, embedding_path]
# Pairs each local file name with its URL; relies on `file_paths` and
# `download_paths` listing the files in the same order.
file_link_dict = dict(zip(file_paths, download_paths))

for path, url in file_link_dict.items():
    if os.path.isfile(path):
        print("Already Has %s" % path)
        continue

    # Stream the response to disk instead of reading it into memory at once,
    # and close the connection when done. The data goes to a temporary name
    # first so an interrupted download never leaves a truncated file that the
    # isfile() check above would accept on the next run.
    partial_path = path + ".part"
    with urllib.request.urlopen(url) as response, open(partial_path, 'wb') as f:
        shutil.copyfileobj(response, f)
    os.replace(partial_path, path)

def filter_text(string):
    """Return the text form of `string` without bracketed spans or newlines.

    The argument is converted with `str()` first, so non-string values
    (e.g. numbers or NaN cells) are accepted. Every `[...]` span is removed
    and every newline character is replaced by a single space.

    The parameter name shadows the imported `string` module inside this
    function only; it is kept so existing callers are unaffected.
    """
    string = str(string)
    # Raw string: `\[` and `\]` are regex escapes, not Python string escapes.
    string = re.sub(r'\[[^]]*\]', '', string)
    string = string.replace('\n', ' ')
    return string

file_names = ['valid.csv', 'test.csv', 'train.csv']
kept_columns = ['extra', 'stars', 'text']
vote_columns = ['cool', 'funny', 'useful']

# Rewrite each dataset on disk with cleaned text, a combined `extra` column
# (cool + funny + useful) and only the columns listed in `kept_columns`.
for file_name in file_names:
    df = pd.read_csv(file_name)
    df['text'] = df['text'].apply(filter_text)

    if all(column in df.columns for column in vote_columns):
        df['extra'] = df['cool'] + df['funny'] + df['useful']
    elif 'extra' not in df.columns:
        raise KeyError("%s has neither the vote columns nor 'extra'" % file_name)
    # Otherwise the file was already reduced by an earlier run of this cell
    # and carries `extra` already, so the cell stays re-runnable.

    # DataFrame.drop returns a new frame; the result has to be kept, otherwise
    # the unwanted columns are written back unchanged.
    df = df.drop([column for column in df.columns if column not in kept_columns], axis=1)
    df.to_csv(file_name, index=False)
    
# Back up the local files to Google Drive. The destination folder is created
# first so the copy does not fail when the Drive `data` folder is missing.
os.makedirs(base_url + "data", exist_ok=True)
for path in file_paths:
    shutil.copy(path, base_url + "data/" + path)

### READ FILES FOR EMBEDDING AND DATA

In [0]:
# Preprocessing settings.
MAX_SEQUENCE_LENGTH = 250        # maxlen passed to pad_sequences
MAX_NUM_WORDS = 30000            # num_words passed to the Tokenizer
EMBEDDING_DIM = 100              # number of columns of the embedding matrix
VALIDATION_SPLIT = 20000/120000  # initial value; reassigned after the texts are loaded

# Step 1: build a lookup from each word in the embedding file to its vector.
print('Indexing word vectors.')

embeddings_index = {}
with open('glove.6B.100d.txt', encoding='UTF-8') as glove_file:
    for entry in glove_file:
        fields = entry.split()
        # The first field is the word; the remaining fields are its coefficients.
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels = []  # list of integer star ratings
extras = []  # list of integer `extra` values, aligned with `texts`

# Named `dataset_files` rather than `files` so the `files` module imported
# from google.colab is not overwritten.
# The order matters: the later splits use shuffle=False and take the hold-out
# samples from the end of the concatenated lists.
dataset_files = ['train.csv', 'valid.csv', 'test.csv']

for file_name in dataset_files:
    frame = pd.read_csv(file_name)
    # Skip rows without a star rating or an `extra` value.
    frame = frame.dropna(subset=['stars', 'extra'])
    # Column-wise conversion replaces the per-row Series lookups.
    texts.extend(frame['text'].map(str).tolist())
    labels.extend(frame['stars'].astype(int).tolist())
    extras.extend(frame['extra'].astype(int).tolist())

# Fraction of all samples that corresponds to a 20000-sample hold-out.
VALIDATION_SPLIT = 20000/len(texts)

print('Found %s texts.' % len(texts))

# Step 3: turn the texts into a 2D integer tensor of padded token sequences.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Keep the integer labels of the validation part before `labels` is one-hot
# encoded: take the hold-out tail first, then the first half of that tail.
_, y_val_raw = train_test_split(labels,
                                test_size=VALIDATION_SPLIT,
                                shuffle=False)
y_val_raw, _ = train_test_split(y_val_raw,
                                test_size=0.5,
                                shuffle=False)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Partition the samples into training, validation and test sets.
# `indices` is the identity ordering, so re-indexing keeps the sample order.
indices = np.arange(data.shape[0])
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT / 2 * data.shape[0])
num_test_samples = num_validation_samples

# Stage 1: cut the hold-out tail off sequences, labels and extra values in a
# single call so the three stay aligned (shuffle=False keeps the order).
x_train, x_test, y_train, y_test, x_extra, x_val_extra = train_test_split(
    data, labels, extras, test_size=VALIDATION_SPLIT, shuffle=False)

# Stage 2: halve the hold-out into a validation part and a test part.
x_val, x_test, y_val, y_test, x_val_extra, x_test_extra = train_test_split(
    x_test, y_test, x_val_extra, test_size=0.5, shuffle=False)

print('Preparing embedding matrix.')
# One row per word index, plus row 0; rows start out as zeros.
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word_index.items():
    if row <= MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(token)
        # Words missing from the embedding index keep their all-zero row.
        if embedding_vector is not None:
            embedding_matrix[row] = embedding_vector

### ADD EXTRA FEATURE (COOL + USEFUL + FUNNY)

In [0]:
# Number of times the extra value is appended to every sequence.
feature_weight = 10


def append_extra_feature(sequences, extra_values, repeats):
    """Return `sequences` with each row's extra value appended `repeats` times.

    Args:
        sequences: 2D array-like with one padded sequence per row.
        extra_values: one value per row of `sequences`.
        repeats: how many copies of the value to append to each row.

    Returns:
        A new 2D array with `repeats` additional columns; the inputs are not
        modified.
    """
    extra_column = np.asarray(extra_values).reshape(-1, 1)
    return np.hstack([np.asarray(sequences),
                      np.repeat(extra_column, repeats, axis=1)])


# NOTE(review): the appended values pass through the same Embedding lookup as
# the word indices in the training cell, so each one has to be a valid index
# (< num_words) -- confirm the value range of `extra`.
x_val_w_extra = append_extra_feature(x_val, x_val_extra, feature_weight)
x_w_extra = append_extra_feature(x_train, x_extra, feature_weight)
x_test_w_extra = append_extra_feature(x_test, x_test_extra, feature_weight)

# Derive the model input length from the data itself. The previous guard read
# an undefined `check` variable (NameError on a fresh kernel); this assignment
# gives the same value and is safe to re-run.
MAX_SEQUENCE_LENGTH = x_w_extra.shape[1]
x_test_w_extra.shape

### CREATE MODEL BASED ON VALIDATION ACCURACY FOR SPECIFIED EPOCH COUNT AND SAVE TO GOOGLE DRIVE

#### EPOCH = 15 (Default)
#### 5 Iterations (Default) --> 5 models created in total

In [0]:
epoch = 15

# Create the Drive folder for the saved models before training starts, so a
# missing folder cannot fail the copy after a model has been trained.
model_dir = base_url + "model_2/"
os.makedirs(model_dir, exist_ok=True)

for model_number in range(20, 25):
    # Two-digit, zero-padded identifier used in the model file name.
    model_id = "%02d" % model_number
    print('Training %s' % model_id)

    # Release the backend state left by the previous iteration's model so
    # memory does not grow with every model built in this loop.
    backend.clear_session()

    # Frozen embedding layer initialised from the prepared embedding matrix.
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

    embedded_sequences = embedding_layer(sequence_input)

    # Conv1D + max pooling followed by two stacked bidirectional LSTMs.
    x = Dropout(0.3)(embedded_sequences)
    x = BatchNormalization()(x)
    x = Conv1D(128, 5, padding='valid', activation='relu', strides=1)(x)
    x = MaxPooling1D(5)(x)
    x = Bidirectional(CuDNNLSTM(units=150, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(units=150))(x)
    x = Dropout(0.3)(x)

    # 6 outputs match the one-hot label width. The former `input_dim=150`
    # argument was dropped: the layer is applied to a tensor directly.
    preds = Dense(units=6, activation='softmax')(x)

    model_name = "model%s.h5" % model_id
    # Checkpointing on validation accuracy is disabled; the model is saved
    # once, after the last epoch.
#     checkpoint = ModelCheckpoint(model_name, monitor='val_acc', verbose=0,
#                                  save_best_only=True, mode='max')
    model = Model(inputs=sequence_input, outputs=preds)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.fit(x_w_extra, y_train, initial_epoch=0, # callbacks=[checkpoint],
              batch_size=250, epochs=epoch, verbose=2, validation_split=0.1)
    model.save(model_name)
    shutil.copy(model_name, model_dir + model_name)
#     print(model.evaluate(x_val_w_extra, y_val))

### EVALUATE THE GENERATED MODELS

In [0]:
# def find_max_index(some_list):
#     maximum = -100
#     index = -100
#     for i, v in enumerate(some_list):
#         if maximum < v:
#             index = i
#             maximum = v
#     return index

# models = [i for i in os.listdir() if i.endswith(".h5")]
# models.sort()
# print(models)
# top_list = list()
# test_preds = None
# preds = None
# for model_name in models:
#     print("Loading %s" % model_name)
#     model = load_model(model_name)
#     pred = model.predict(x_val_w_extra)
#     test_pred = model.predict(x_test_w_extra)
#     if preds is None:
#         preds = pred
#     else:
#         preds = preds + pred
#     if test_preds is None:
#         test_preds = test_pred
#     else:
#         test_preds = test_preds + test_pred
    
# for sub_list in preds:
#     index = find_max_index(sub_list)
#     top_list.append(index)
# acc_score = accuracy_score(y_val_raw, top_list)
# print("Accuracy: %.4f" % acc_score)

# top_list = list()
# for sub_list in test_preds:
#     index = find_max_index(sub_list)
#     top_list.append(index)
# print(top_list)