In [156]:
import os
import re
import sys
import time
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

from nltk.corpus import stopwords, words
from nltk.stem import PorterStemmer
STOPS = stopwords.words("english")

In [157]:
train_df = pd.read_csv('data/train.csv')

## Clean texts
def clean_text(text):
    """Takes in text and returns clean word list"""
    output = re.sub('[^a-zA-Z ]', '', text).lower()
    output = output.split(' ')
    output = [o for o in output if o not in STOPS and len(o) > 0 and o[:4] != 'http']
    return ' '.join(output)

train_df['text'] = train_df['text'].apply(clean_text)

## Replace labels and split into train, test and validation sets
def replace_labels(author):
    label_dict = {'EAP': 0, 'HPL': 1, 'MWS': 2}
    return label_dict[author]

def replace_labels_vec(author):
    label_dict = {'EAP': np.array([1, 0, 0]), 'HPL': np.array([0, 1, 0]), 'MWS': np.array([0, 0, 1])}
    return label_dict[author]

train_df['author_vector'] = train_df['author'].apply(replace_labels_vec)
train_df['author'] = train_df['author'].apply(replace_labels)
train_df, test_df = train_test_split(train_df, test_size=0.2)

In [158]:
## Split out sentences and form TF-IDF representations of text

## Fit TF-DF transformer to training data
count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(train_df['text'].tolist())
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train = tf_transformer.transform(X_train_counts)

## Transform test and validation sets
X_test_counts = count_vec.transform(test_df['text'].tolist())
X_test = tf_transformer.transform(X_test_counts)

## Labels
y_train = np.array(train_df['author'])
y_test = np.array(test_df['author'])

In [159]:
## Build a pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [160]:
## Grid search for alpha and bigrams/words
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)], 
    'clf__alpha': (0.001, 0.01, 0.05, 0.08, 0.1, 0.2, 0.5, 0.8, 1)
}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1)
gs_clf = gs_clf.fit(train_df['text'].tolist(), np.array(train_df['author']))

## Print best score and parameters
print("Best score = {}".format(gs_clf.best_score_))
for param_name in sorted(parameters.keys()):
    print("{}: {}".format(param_name, gs_clf.best_params_[param_name]))

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   15.6s finished


Best score = 0.8221924280150673
clf__alpha: 0.08
vect__ngram_range: (1, 2)


In [161]:
## Evaluating on train and test sets
def cross_entropy(preds_mat, labels_mat):
    """both should by (3, ) vectors"""
    return -np.mean(preds_mat * labels_mat)
    
## Training set
train_preds = gs_clf.predict_log_proba(train_df['text'].tolist())
train_labels = np.stack(train_df['author_vector'].tolist())
train_Xent = cross_entropy(train_preds, train_labels)
train_acc = np.mean(gs_clf.predict(train_df['text'].tolist()) == np.array(train_df['author']))
print("Train cross entropy = {}, train acc = {}".format(train_Xent, train_acc))

## Test set
test_preds = gs_clf.predict_log_proba(test_df['text'].tolist())
test_labels = np.stack(test_df['author_vector'].tolist())
test_Xent = cross_entropy(test_preds, test_labels)
test_acc = np.mean(gs_clf.predict(test_df['text'].tolist()) == np.array(test_df['author']))
print("Test cross entropy = {}, test acc = {}".format(test_Xent, test_acc))

Train cross entropy = 0.010503672089702935, train acc = 0.9982761923003256
Test cross entropy = 0.1435777121526654, test acc = 0.8378447395301328


In [162]:
## Re-train on full training set (train+test)
full_train_df = pd.concat([train_df, test_df])
gs_clf_final = gs_clf.fit(full_train_df['text'].tolist(), np.array(full_train_df['author']))

## Print best score and parameters
print("Best score = {}".format(gs_clf.best_score_))
for param_name in sorted(parameters.keys()):
    print("{}: {}".format(param_name, gs_clf.best_params_[param_name]))

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   17.0s finished


Best score = 0.8310945400684406
clf__alpha: 0.1
vect__ngram_range: (1, 1)


In [153]:
## Read in final test set and make predictions
final_test_df = pd.read_csv('data/test.csv')
final_test_df['text'] = final_test_df['text'].apply(clean_text)

In [154]:
## Correct format for submission
predictions = gs_clf_final.predict_proba(final_test_df['text'].tolist())
final_test_df['EAP'] = predictions[:, 0]
final_test_df['HPL'] = predictions[:, 1]
final_test_df['MWS'] = predictions[:, 2]
final_test_df = final_test_df.drop(['text'], axis=1)

In [155]:
## Save output to CSV file
final_test_df.to_csv('data/second_submission.csv', index=False, header=True)