In [42]:
import os
import re
import sys
import time
import random
import gensim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

from nltk.corpus import stopwords, words
from nltk.stem import PorterStemmer
STOPS = stopwords.words("english")

In [54]:
train_df = pd.read_csv('data/train.csv')

## Clean texts
def clean_text(text):
    """Takes in text and returns clean word list"""
    output = re.sub('[^a-zA-Z0-9 ]', '', text).lower()
    output = output.split(' ')
    output = [o for o in output if o not in STOPS and len(o) > 0]
    return ' '.join(output)

train_df['text'] = train_df['text'].apply(clean_text)

## Replace labels and split into train, test and validation sets
def replace_labels(author):
    label_dict = {'EAP': 0, 'HPL': 1, 'MWS': 2}
    return label_dict[author]

def replace_labels_vec(author):
    label_dict = {'EAP': np.array([1, 0, 0]), 'HPL': np.array([0, 1, 0]), 'MWS': np.array([0, 0, 1])}
    return label_dict[author]

train_df['author_vector'] = train_df['author'].apply(replace_labels_vec)
train_df['author'] = train_df['author'].apply(replace_labels)
train_df, test_df = train_test_split(train_df, test_size=0.2)

In [55]:
## Get full training set vocabulary
full_train_df = pd.concat([train_df, test_df])
word_string = ' '.join(full_train_df['text'].tolist())
train_vocab = list(set(word_string.split(' ')))
print("Size of training vocab = {}".format(len(train_vocab)))

Size of training vocab = 25270


In [56]:
## Split out sentences and form TF-IDF representations of text

## Fit TF-DF transformer to training data
count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(train_df['text'].tolist())
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train = tf_transformer.transform(X_train_counts)

## Transform test and validation sets
X_test_counts = count_vec.transform(test_df['text'].tolist())
X_test = tf_transformer.transform(X_test_counts)

## Labels
y_train = np.array(train_df['author'])
y_test = np.array(test_df['author'])

In [57]:
## Build a pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [58]:
## Grid search for alpha and bigrams/words
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)], 
    'clf__alpha': (0.04, 0.05, 0.06, 0.07,0.08, 0.1),
    'vect__max_df': (0.7, 0.8, 0.9, 1.0),
    'vect__min_df': (1, 2)
}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1)
gs_clf = gs_clf.fit(train_df['text'].tolist(), np.array(train_df['author']))

## Print best score and parameters
print("Best score = {}".format(gs_clf.best_score_))
for param_name in sorted(parameters.keys()):
    print("{}: {}".format(param_name, gs_clf.best_params_[param_name]))

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:  3.0min finished


Best score = 0.8251931303070932
clf__alpha: 0.08
vect__max_df: 0.7
vect__min_df: 1
vect__ngram_range: (1, 2)


In [59]:
## Evaluating on train and test sets
def cross_entropy(preds_mat, labels_mat):
    """both should by (3, ) vectors"""
    return -np.mean(preds_mat * labels_mat)
    
## Training set
train_preds = gs_clf.predict_log_proba(train_df['text'].tolist())
train_labels = np.stack(train_df['author_vector'].tolist())
train_Xent = cross_entropy(train_preds, train_labels)
train_acc = np.mean(gs_clf.predict(train_df['text'].tolist()) == np.array(train_df['author']))
print("Train cross entropy = {}, train acc = {}".format(train_Xent, train_acc))

## Test set
test_preds = gs_clf.predict_log_proba(test_df['text'].tolist())
test_labels = np.stack(test_df['author_vector'].tolist())
test_Xent = cross_entropy(test_preds, test_labels)
test_acc = np.mean(gs_clf.predict(test_df['text'].tolist()) == np.array(test_df['author']))
print("Test cross entropy = {}, test acc = {}".format(test_Xent, test_acc))

Train cross entropy = 0.009267132728268566, train acc = 0.9985315712187959
Test cross entropy = 0.14100176501433925, test acc = 0.8465270684371808


In [60]:
## Re-train on full training set (train+test)
gs_clf_final = gs_clf.fit(full_train_df['text'].tolist(), np.array(full_train_df['author']))

## Print best score and parameters
print("Best score = {}".format(gs_clf.best_score_))
for param_name in sorted(parameters.keys()):
    print("{}: {}".format(param_name, gs_clf.best_params_[param_name]))

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:  3.3min finished


Best score = 0.8396240870320241
clf__alpha: 0.04
vect__max_df: 0.7
vect__min_df: 1
vect__ngram_range: (1, 2)


In [61]:
## Read in final test set
final_test_df = pd.read_csv('data/test.csv')
final_test_df['text'] = final_test_df['text'].apply(clean_text)

In [62]:
## Get test set vocabulary
word_string = ' '.join(final_test_df['text'].tolist())
test_vocab = list(set(word_string.split(' ')))
print("Size of test vocab = {}".format(len(test_vocab)))

diff = len(test_vocab) - len(set(train_vocab).intersection(set(test_vocab)))
print("Number of words not in training set = {}".format(diff))

Size of test vocab = 17623
Number of words not in training set = 3310


In [28]:
# ## Load Word2Vec model
# model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', 
#                                                         binary=True, limit=100000)

In [35]:
# ## Replace words in test set with similar words in training set
# train_vocab_set = set(train_vocab)

# def get_most_similar(word, vocab=train_vocab_set):
#     """Gets most similar word2vec word in training set vocab"""
#     if word not in vocab:
#         most_similar_word = model.most_similar(['word'], [], 1)[0][0]
#         if most_similar_word in vocab:
#             return most_similar_word
#     return word

# def replace_words(text):
#     """Replaces with similar training set words"""
#     output = text.split(' ')
#     output = [get_most_similar(o) for o in output]
#     return ' '.join(output)

# final_test_df['text'] = final_test_df['text'].apply(replace_words)

# ## Check test set vocabulary again
# word_string = ' '.join(final_test_df['text'].tolist())
# test_vocab = list(set(word_string.split(' ')))
# print("Size of test vocab = {}".format(len(test_vocab)))

# diff = len(test_vocab) - len(set(train_vocab).intersection(set(test_vocab)))
# print("Number of words not in training set = {}".format(diff))

In [64]:
## Make predictions correct format for submission
predictions = gs_clf_final.predict_proba(final_test_df['text'].tolist())
final_test_df['EAP'] = predictions[:, 0]
final_test_df['HPL'] = predictions[:, 1]
final_test_df['MWS'] = predictions[:, 2]
final_test_df = final_test_df.drop(['text'], axis=1)

In [65]:
## Save output to CSV file
final_test_df.to_csv('data/third_submission.csv', index=False, header=True)