In [None]:
# this notebook is based off of this blog post: 
# https://realpython.com/python-keras-text-classification/#reader-comments

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import HashingVectorizer
from scipy.sparse import hstack, vstack
from keras.preprocessing.text import Tokenizer
from tensorflow.keras import Sequential, layers
import pandas as pd
import numpy as np
import copy
import time
import pickle

pd.set_option('max_colwidth', 100)

Using TensorFlow backend.


In [2]:
splits = "splits_1571429760//"

In [3]:
data_folder = "data//"
train_path = data_folder + splits + "train.csv"
dev_path = data_folder + splits + "dev.csv"

In [4]:
pkl_filename = "models//first_model.pkl"
vectorizer_filename = "models//first_vectorizer.pkl"

In [5]:
train_df = pd.read_csv(train_path, index_col=0)

  mask |= (ar1 == a)


In [6]:
dev_df = pd.read_csv(train_path, index_col=0)

In [7]:
train_df = train_df.sample(frac=1)
dev_df = dev_df.sample(frac=1)

In [9]:
y_train = train_df.op_gender.values
y_dev = dev_df.op_gender.values

In [10]:
def turn_to_ints(li):
    final_list = []
    for gender in li:
        if gender=='M':
            final_list.append(1)
        else:
            final_list.append(0)
    return final_list
            
y_train = turn_to_ints(y_train)
y_dev = turn_to_ints(y_dev)

In [26]:
y_train = np.asarray(y_train)
y_dev = np.asarray(y_dev)

In [11]:
sentences_train = train_df.response_text.values

In [12]:
def get_text_list(init_list):
    sentences = []
    for sentence in init_list:
        if type(sentence) != str:
            sentences.append("")
        else:
            sentences.append(sentence)
    return sentences

new_sentences_train = get_text_list(sentences_train)
new_sentences_test = get_text_list(dev_df.response_text.values)

In [13]:
time_start = time.time()

tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(new_sentences_train)

X_train = tokenizer.texts_to_sequences(new_sentences_train)
X_test = tokenizer.texts_to_sequences(new_sentences_test)

vocab_size = len(tokenizer.word_index) + 1

currentTime = time.gmtime(time.time() - time_start)

#Convert the gmtime struct to a string
timeStr = time.strftime("%M minutes, %S seconds", currentTime)

print("Tokenized in {}".format(timeStr))

Tokenized in 15 minutes, 32 seconds


In [14]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [29]:
with open('first_cnn_train', 'wb') as file:
    pickle.dump(X_train, file)

In [30]:
with open('first_cnn_test', 'wb') as file:
    pickle.dump(X_test, file)

In [31]:
with open('first_cnn_tokenizer', 'wb') as file:
    pickle.dump(tokenizer, file)

In [15]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

embedding_dim = 100
embedding_matrix = create_embedding_matrix(
                     'data/glove.6B.{}d.txt'.format(embedding_dim),
                      tokenizer.word_index, embedding_dim)

In [17]:
# hmmmm....
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size

0.15381079935942088

In [18]:
embedding_matrix.shape

(1113992, 100)

In [19]:
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=False))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          111399200 
_________________________________________________________________
conv1d (Conv1D)              (None, 96, 128)           64128     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                1290      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 111,464,629
Trainable params: 65,429
Non-trainable params: 111,399,200
_________________________________________________________________


In [28]:
time_start = time.time()

history = model.fit(X_train, y_train,
                    epochs=50,
                    verbose=True,
                    validation_data=(X_test, y_dev),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_dev, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

currentTime = time.gmtime(time.time() - time_start)

#Convert the gmtime struct to a string
timeStr = time.strftime("%M minutes, %S seconds", currentTime)

print("Tokenized in {}".format(timeStr))

Train on 9879016 samples, validate on 9879016 samples
Epoch 1/50
   3160/9879016 [..............................] - ETA: 55:08:31 - loss: 0.6091 - accuracy: 0.7022

KeyboardInterrupt: 