In [1]:
# import libraries
import os
from pathlib import Path

import pandas as pd
import numpy as np

# to make this notebook's output stable across runs
# NOTE(review): this seeds NumPy's global RNG only. No TensorFlow/Keras seed is
# set in the visible cells, so weight initialisation and dropout may still
# differ between runs — confirm whether a TensorFlow seed is needed as well.
np.random.seed(42)
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

import re

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# set options for rendering plots
%matplotlib inline

# display multiple outputs within a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all";

# TensorFlow and tf.keras
from tensorflow import keras
import tensorflow as tf

import warnings
warnings.filterwarnings('ignore');

# Directory holding train.csv / test.csv. Override it by setting the DATA_DIR
# environment variable before starting the kernel; the default keeps the
# original location working so existing runs are unaffected.
DATA_DIR = Path(os.environ.get("DATA_DIR", "C:/Users/dreww/Documents"))

# Raw training and test frames; later cells read the "text", "author" and "id" columns.
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

In [3]:
import nltk
#nltk.download()

In [4]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
# English stopword list from NLTK, held as a set for fast membership tests
stop_words = {*stopwords.words('english')}

In [5]:
# Stopword filtering
def remove_stopwords(texts):
    """Tokenise each document and drop every token found in ``stop_words``.

    Parameters
    ----------
    texts : iterable
        Documents to clean. Each item is passed through ``str()`` and then
        gensim's ``simple_preprocess`` to obtain its tokens.

    Returns
    -------
    list of list of str
        One token list per input document, in the same order.
    """
    cleaned_docs = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned_docs.append([tok for tok in tokens if tok not in stop_words])
    return cleaned_docs


# Token lists for the training sentences ...
text = remove_stopwords(train["text"])
# ... and for the test sentences
test_txt = remove_stopwords(test["text"])

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Token lists for the training sentences and the matching author labels.
# The token lists have different lengths, so dtype=object must be explicit:
# NumPy >= 1.24 raises ValueError when asked to build an array from a ragged
# nested list without it (older versions only warned and produced the same
# 1-D object array).
data = np.array(text, dtype=object)
data.shape
# np.array copies the column, so later edits to `labels` do not touch `train`
labels = np.array(train["author"])
labels.shape

# keep only maxlen words per sentence
maxlen = 200

# take cutoff of most common words
max_words = 20000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# From here on `data` is the integer matrix fed to the models:
# every sequence is padded/truncated to exactly `maxlen` entries.
data = pad_sequences(sequences, maxlen=maxlen)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

(19579,)

(19579,)

Found 24882 unique tokens.
Shape of data tensor: (19579, 200)
Shape of label tensor: (19579,)


In [7]:
# convert author labels to integer class ids (the form expected by
# sparse_categorical_crossentropy).
# The ids are built from the raw column with an explicit integer dtype rather
# than assigned in place: in-place assignment leaves an object-dtype array,
# which newer TensorFlow versions refuse to convert to a tensor. Because nothing
# is mutated, this cell can also be re-run safely. An author missing from the
# mapping becomes NaN and makes the int64 cast raise instead of passing silently.
AUTHOR_TO_ID = {'EAP': 0, 'HPL': 1, 'MWS': 2}
labels = train["author"].map(AUTHOR_TO_ID).astype("int64").to_numpy()

In [85]:
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN
from tensorflow.keras.models import Sequential

# Recurrent classifier: 32-d embedding -> SimpleRNN(32) with input dropout
# -> softmax over the three authors.
model = Sequential([
    Embedding(max_words, 32),
    SimpleRNN(32, dropout=0.5),
    Dense(3, activation='softmax'),
])

model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

# validation_split=0.2 holds out part of the training data for validation
history = model.fit(
    data,
    labels,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
)

Train on 15663 samples, validate on 3916 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import Dropout, GlobalAveragePooling1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential

# Bag-of-embeddings classifier: embed each token, average over the sequence,
# then a small dense head with a 3-way softmax. This rebinds `model`, so it is
# the model used by the prediction cell.
# NOTE(review): the Embedding layer is not created with mask_zero, so the
# average appears to include the padded positions as well — confirm intended.
model = Sequential()
model.add(Embedding(max_words, 36))
model.add(Dropout(0.5))
model.add(GlobalAveragePooling1D())
model.add(Dense(12, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])

# Stop once val_loss has not improved for 2 epochs. restore_best_weights=True
# rolls the model back to the best epoch; without it the model would keep the
# weights from the last (already worse) epoch and predict with those.
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=2,
                               restore_best_weights=True)

history = model.fit(data,
                    labels,
                    epochs=25,
                    batch_size=128,
                    validation_split=0.2,
                    callbacks=[early_stopping])

Train on 15663 samples, validate on 3916 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25


In [9]:
# make test predictions
# The encoded test set gets its own name so that `test_txt` keeps holding the
# token lists. Overwriting it made this cell non-repeatable: on a second run
# the tokenizer would be handed integer ids instead of words.
test_sequences = tokenizer.texts_to_sequences(test_txt)
X_test = pad_sequences(sequences=test_sequences, maxlen=maxlen)

predictions = model.predict(X_test, batch_size=16)

# Column i of the softmax output is class id i, and the label encoding used
# for training is EAP=0, HPL=1, MWS=2 — keep this list in that order.
class_columns = ['EAP', 'HPL', 'MWS']
for class_id, author in enumerate(class_columns):
    test[author] = predictions[:, class_id]

# final submission
test[['id'] + class_columns].to_csv('submission.csv', index=False)