In [2]:
import numpy as np
import pandas as pd

#Deep Learning
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Activation, Dropout, SimpleRNN, LSTM, GRU

#Data Setup
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

#Visualization
import matplotlib.pyplot as plt


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
#Initialization
#Notebook-wide settings (values taken from the question description)
MAX_FEATURES = 10000   # vocabulary cap, passed to Tokenizer(num_words=...)
MAX_LEN = 100          # sequence length, passed to pad_sequences(maxlen=...)
NUM_TOPICS = 24
all_models = list()    # each training cell appends its fit() result here

In [4]:
#Importing the Data
# NOTE(review): dropna() without `subset` drops a row when ANY column is
# missing, not only 'Title'/'Major' -- confirm that this is intended.
df_train = pd.read_csv('data/congress_train.csv', encoding='ISO-8859-1').dropna()
df_test = pd.read_csv('data/congress_test.csv', encoding='ISO-8859-1').dropna()
df_valid = pd.read_csv('data/congress_val.csv', encoding='ISO-8859-1').dropna()

#Conversion to lists
# str() guards against non-string titles; iterating the Series directly
# makes the intermediate list() copy unnecessary.
txt_train = [str(title) for title in df_train['Title']]
txt_test = [str(title) for title in df_test['Title']]
txt_valid = [str(title) for title in df_valid['Title']]

#Conversion to categorical
# Pin the one-hot width to NUM_TOPICS. Without num_classes, to_categorical
# infers the width from the largest label present in each split, so a split
# that happens to lack the highest topic code would get a narrower matrix
# than the models' NUM_TOPICS-unit output layer expects.
y_train = to_categorical(df_train['Major'].tolist(), num_classes=NUM_TOPICS)
y_test = to_categorical(df_test['Major'].tolist(), num_classes=NUM_TOPICS)
y_valid = to_categorical(df_valid['Major'].tolist(), num_classes=NUM_TOPICS)

In [5]:
#Tokenization
# The vocabulary is learned from the training titles only; the same fitted
# tokenizer then encodes all three splits (train, test, validation order).
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(txt_train)
train_seq, test_seq, valid_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (txt_train, txt_test, txt_valid)
)

In [6]:
#Padding
# Bring every encoded title to exactly MAX_LEN entries.
X_train, X_test, X_valid = [
    pad_sequences(sequences, maxlen=MAX_LEN)
    for sequences in (train_seq, test_seq, valid_seq)
]

In [7]:
#Estimate a basic feed-forward network
# Architecture: embedding -> flatten -> softmax over the topic classes.
# Sizes come from the notebook constants so the model stays in sync with the
# tokenizer (MAX_FEATURES) and the padding length (MAX_LEN).
feedfwd = Sequential()
feedfwd.add(Embedding(MAX_FEATURES, 25, input_length=MAX_LEN))
feedfwd.add(Flatten())
feedfwd.add(Dense(NUM_TOPICS, activation='softmax'))
feedfwd.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
result_feedfwd = feedfwd.fit(X_train, y_train,
                             validation_data=(X_valid, y_valid),
                             epochs=50,
                             batch_size=512)
# Keep the fit() result for the combined validation plots.
# NOTE: re-running this cell appends a second entry to all_models.
all_models.append(result_feedfwd)

Train on 278612 samples, validate on 69649 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [8]:
#Estimate a recurrent neural network (RNN) with a layer_simple_rnn
# Architecture: embedding -> SimpleRNN(20) -> softmax over the topic classes.
# Vocabulary size, sequence length and class count use the notebook constants
# rather than repeated literals.
rnn = Sequential()
rnn.add(Embedding(MAX_FEATURES, 20, input_length=MAX_LEN))
rnn.add(SimpleRNN(20))
rnn.add(Dense(NUM_TOPICS, activation='softmax'))
rnn.compile(optimizer='rmsprop',
            loss='categorical_crossentropy',
            metrics=['accuracy'])
result_rnn = rnn.fit(X_train, y_train,
                     validation_data=(X_valid, y_valid),
                     epochs=50,
                     batch_size=512)
# Keep the fit() result for the combined validation plots.
all_models.append(result_rnn)

Train on 278612 samples, validate on 69649 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [9]:
#Estimate an RNN with an LSTM layer
# Architecture: embedding -> LSTM(20) -> softmax over the topic classes.
# Vocabulary size, sequence length and class count use the notebook constants
# rather than repeated literals.
lstm = Sequential()
lstm.add(Embedding(MAX_FEATURES, 20, input_length=MAX_LEN))
lstm.add(LSTM(20))
lstm.add(Dense(NUM_TOPICS, activation='softmax'))
lstm.compile(optimizer='rmsprop',
             loss='categorical_crossentropy',
             metrics=['accuracy'])
result_lstm = lstm.fit(X_train, y_train,
                       validation_data=(X_valid, y_valid),
                       epochs=50,
                       batch_size=512)
# Keep the fit() result for the combined validation plots.
all_models.append(result_lstm)


Train on 278612 samples, validate on 69649 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [11]:
#Estimate an RNN with a GRU layer
# Architecture: embedding -> GRU(20) -> softmax over the topic classes.
# Vocabulary size, sequence length and class count use the notebook constants
# rather than repeated literals.
gru = Sequential()
gru.add(Embedding(MAX_FEATURES, 20, input_length=MAX_LEN))
gru.add(GRU(20))
gru.add(Dense(NUM_TOPICS, activation='softmax'))
gru.compile(optimizer='rmsprop',
            loss='categorical_crossentropy',
            metrics=['accuracy'])
result_gru = gru.fit(X_train, y_train,
                     validation_data=(X_valid, y_valid),
                     epochs=50,
                     batch_size=512)
# Keep the fit() result for the combined validation plots.
all_models.append(result_gru)

Train on 278612 samples, validate on 69649 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
#Neural Network vs Hand-Coding

In [None]:
#Estimate a SimpleRNN with dropout
# Architecture: embedding -> SimpleRNN(25, dropout=0.2) -> softmax over the
# topic classes. Sizes use the notebook constants rather than repeated literals.
rnn_dropout = Sequential()
rnn_dropout.add(Embedding(MAX_FEATURES, 25, input_length=MAX_LEN))
rnn_dropout.add(SimpleRNN(25, dropout=0.2))
rnn_dropout.add(Dense(NUM_TOPICS, activation='softmax'))
rnn_dropout.compile(optimizer='rmsprop',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
result_rnn_dropout = rnn_dropout.fit(X_train, y_train,
                                     validation_data=(X_valid, y_valid),
                                     epochs=25,
                                     batch_size=512)
# Keep the fit() result for the combined validation plots.
all_models.append(result_rnn_dropout)

In [None]:
#Estimate an LSTM with dropout
# Architecture: embedding -> LSTM(25, dropout=0.2) -> softmax over the topic
# classes. Sizes use the notebook constants rather than repeated literals.
lstm_dropout = Sequential()
lstm_dropout.add(Embedding(MAX_FEATURES, 25, input_length=MAX_LEN))
lstm_dropout.add(LSTM(25, dropout=0.2))
lstm_dropout.add(Dense(NUM_TOPICS, activation='softmax'))
lstm_dropout.compile(optimizer='rmsprop',
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])
result_lstm_dropout = lstm_dropout.fit(X_train, y_train,
                                       validation_data=(X_valid, y_valid),
                                       epochs=25,
                                       batch_size=512)
# Keep the fit() result for the combined validation plots.
all_models.append(result_lstm_dropout)

In [None]:
#Estimate a GRU with dropout
# Architecture: embedding -> GRU(25, dropout=0.2) -> softmax over the topic
# classes. Sizes use the notebook constants rather than repeated literals.
gru_dropout = Sequential()
gru_dropout.add(Embedding(MAX_FEATURES, 25, input_length=MAX_LEN))
gru_dropout.add(GRU(25, dropout=0.2))
gru_dropout.add(Dense(NUM_TOPICS, activation='softmax'))
gru_dropout.compile(optimizer='rmsprop',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
result_gru_dropout = gru_dropout.fit(X_train, y_train,
                                     validation_data=(X_valid, y_valid),
                                     epochs=25,
                                     batch_size=512)
# Keep the fit() result for the combined validation plots.
all_models.append(result_gru_dropout)

In [None]:
#Estimate a stacked RNN with two SimpleRNN layers
# The first recurrent layer uses return_sequences=True so the second one
# receives an output per timestep instead of only the final state.
# Sizes use the notebook constants rather than repeated literals.
rnn_2layer = Sequential()
rnn_2layer.add(Embedding(MAX_FEATURES, 25, input_length=MAX_LEN))
rnn_2layer.add(SimpleRNN(25, return_sequences=True))
rnn_2layer.add(SimpleRNN(25))
rnn_2layer.add(Dense(NUM_TOPICS, activation='softmax'))
rnn_2layer.compile(optimizer='rmsprop',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])
result_rnn_2layer = rnn_2layer.fit(X_train, y_train,
                                   validation_data=(X_valid, y_valid),
                                   epochs=25,
                                   batch_size=512)
# Keep the fit() result for the combined validation plots.
all_models.append(result_rnn_2layer)

In [None]:
#Estimate a stacked RNN with three SimpleRNN layers
# Every recurrent layer except the last uses return_sequences=True so the
# following layer receives an output per timestep.
# Sizes use the notebook constants rather than repeated literals.
rnn_3layer = Sequential()
rnn_3layer.add(Embedding(MAX_FEATURES, 25, input_length=MAX_LEN))
rnn_3layer.add(SimpleRNN(25, return_sequences=True))
rnn_3layer.add(SimpleRNN(25, return_sequences=True))
rnn_3layer.add(SimpleRNN(25))
rnn_3layer.add(Dense(NUM_TOPICS, activation='softmax'))
rnn_3layer.compile(optimizer='rmsprop',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])
result_rnn_3layer = rnn_3layer.fit(X_train, y_train,
                                   validation_data=(X_valid, y_valid),
                                   epochs=25,
                                   batch_size=512)
# Keep the fit() result for the combined validation plots.
all_models.append(result_rnn_3layer)

In [None]:
def plot_acc(model):
    """Draw one validation-accuracy curve on the current matplotlib axes.

    Parameters
    ----------
    model : object with a dict-like ``history`` attribute
        In this notebook, the value returned by ``fit()``.

    Raises
    ------
    KeyError
        If the history holds neither ``'val_acc'`` nor ``'val_accuracy'``.
    """
    history = model.history
    # Older Keras records the metric as 'val_acc'; newer releases name it
    # 'val_accuracy'. Accept either so the plot works across versions.
    key = 'val_acc' if 'val_acc' in history else 'val_accuracy'
    plt.plot(history[key])


def plot_loss(model):
    """Draw one validation-loss curve on the current matplotlib axes.

    Parameters
    ----------
    model : object with a dict-like ``history`` attribute
        In this notebook, the value returned by ``fit()``.

    Raises
    ------
    KeyError
        If the history has no ``'val_loss'`` entry.
    """
    plt.plot(model.history['val_loss'])
    

In [None]:
#Combined Plot- Accuracies
# One validation-accuracy curve per entry in all_models, in append order.
for model in all_models:
    plot_acc(model)

plt.title('Accuracy on the Validation Set for All models Across Epochs')
# Label the axes so the figure can be read on its own.
plt.xlabel('Epoch')
plt.ylabel('Validation accuracy')
# Legend labels are positional: they must follow the order in which the
# training cells appended their results to all_models.
plt.legend(['Basic Feed Forward', 'Basic RNN',
            'Basic LSTM', 'Basic GRU',
            'RNN with Dropout',
            'LSTM with Dropout', 'GRU with Dropout',
            'RNN with 2 layers', 'RNN with 3 layers'])
plt.show()

In [None]:
#Combined Plot- Loss
# One validation-loss curve per entry in all_models, in append order.
for model in all_models:
    plot_loss(model)

plt.title('Loss on the Validation Set for All models Across Epochs')
# Label the axes so the figure can be read on its own.
plt.xlabel('Epoch')
plt.ylabel('Validation loss')
# Legend labels are positional: they must follow the order in which the
# training cells appended their results to all_models.
plt.legend(['Basic Feed Forward', 'Basic RNN',
            'Basic LSTM', 'Basic GRU',
            'RNN with Dropout',
            'LSTM with Dropout', 'GRU with Dropout',
            'RNN with 2 layers', 'RNN with 3 layers'])
plt.show()


As these plots show, the highest-performing model is the LSTM.

In [13]:
#Now we use this model on the held-out test data (X_test, y_test)
# evaluate() returns [loss, accuracy], matching the loss and metric the model was compiled with
lstm.evaluate(X_test, y_test)



[0.5719151524522886, 0.8529708215971779]

The accuracy on the test set is approximately 85.3% (loss ≈ 0.572).