In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import re
import sklearn as sk
from sklearn.naive_bayes import MultinomialNB
import nltk
import matplotlib.pyplot as plt
# from __future__ import print_function
from __future__ import division
from os import listdir
import re
import collections

## Utilizing data loading functions from Classification-LSTM.ipynb
With some modifications, since words don't need to be tokenized.

In [2]:
def load_train_data(train_data_dir):
    y = []
    X = []
    all_tokens = []
    author_to_id = {}
    file_names = []
    for author_id, author in enumerate(listdir(train_data_dir)):
        author_to_id[author] = author_id
        author_path = "%s/%s" % (train_data_dir, author)
        print author, author_id

        for file_name in listdir(author_path):
            file_names.append(file_name)
            full_path = "%s/%s" % (author_path, file_name)
            y.append(author_id)            
            with open(full_path, "r") as f:
                current = canonicalize_words(f.read())
                all_tokens += current
                X.append(np.array(current))
                
#     vocab = Vocabulary(all_tokens)
    vocab = None
    # replace words with ids
#     for i, x in enumerate(X):
#         # X[i] = np.array(x) # This line can be used to make sure your words are useful 
#         X[i] = np.array(vocab.words_to_ids(x))

    return vocab, X, y, author_to_id, file_names


def id_to_author(author_to_id, id):
    '''
    Takes a dictionary mapping author names to IDs and an ID, and returns
    the author mapped to that ID
    '''
    for author, author_id in author_to_id.iteritems():
        if id == author_id:
            return author
        
def load_eval_data(vocab, eval_data_dir):
#     eval_X = {}
#     eval_y = {}
    eval_X = []
    eval_y = []
    
    file_names = []
    for author_id, author in enumerate(listdir(eval_data_dir)):
        author_path = "%s/%s" % (eval_data_dir, author)

        for file_name in listdir(author_path):
            file_names.append(file_name)
            full_path = "%s/%s" % (author_path, file_name)
            
            with open(full_path, "r") as f:
#                 current = vocab.words_to_ids(canonicalize_words(f.read()))
                current = canonicalize_words(f.read())
                
            expanded_X = np.array(current)
            id = file_name.split("_")[2].split(".")[0]
#             eval_X[id] = np.array([expanded_X])
#             eval_y[id] = np.array([author_to_id[author]])
            eval_X.append(np.array([expanded_X]))
            eval_y.append(np.array([author_to_id[author]]))
                
    return eval_X, eval_y, file_names

def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word):
    word = word.lower()
    return canonicalize_digits(word) # try to canonicalize numbers

def replace_all(text, dic):
    for i, j in dic.iteritems():
        text = text.replace(i, j)
    return text

def canonicalize_words(words):
    current = []
    rep_dict = {'\n':' '
                ,'\xc2':' '
                ,'\xa0':' '
                ,'\xc2':' '
                ,'\xc3':' '
                ,'\xa9':' '
                ,'\xef':' '
                ,'\xbb':' '
                ,'\xbf':' '
                ,'\xa6':' '
                ,'\xb9':' '
                ,'\xa3':' '
                ,'\xbd':' '
                ,'\xb4':' '
                ,'\xcb':' '
                ,'\x9a':' '
                ,'\x86':' '
                ,'\xcf':' '
                ,'\x84':' '
                ,'\xce':' '
                ,'\x87':' '
                ,'\xe2':' '
                ,'\x80':' '
                ,'\x94':' '
               }
    for word in replace_all(words, rep_dict).split(' '):   
        if word:
            if word[-1] in ('.', ',', '?', ';', '!'):
                punk = word[-1]
                current.append(punk)
                word = word[0:-1]

            word = canonicalize_word(word)
            current.append(word)
    return current

In [3]:
train_data_dir = './train_final'
vocab, X_train, y_train, author_to_id, train_file_names = load_train_data(train_data_dir)
num_classes = len(np.unique(y_train))

eval_X, eval_y, eval_file_names = load_eval_data(vocab, "unknown_data")
test_X, test_y, test_file_names = load_eval_data(vocab, "test_final")

alexander_hamilton 0
james_madison 1


## Converting data and labels to dataframes for easier handling in code

#### Converting data and labels to lists

In [4]:
X_train = [' '.join(text) for text in X_train]
y_train = list(y_train)
test_y = [val[0] for val in test_y]
test_X = [val[0] for val in test_X]
test_X = [' '.join(text) for text in test_X]
eval_y = [val[0] for val in eval_y]
eval_X = [val[0] for val in eval_X]
eval_X = [' '.join(text) for text in eval_X]

#### Creating Dataframes from data and labels

In [5]:
def create_bow_df(data, labels):
    df = pd.DataFrame(data, columns=['body'])
    df['label'] = labels
    df['author'] = df['label'].apply(lambda aid: id_to_author(author_to_id, aid))
    df['author_last'] = df['author'].apply(lambda name: name.split('_')[1])
    return df

def add_df_data_counts(df):
    df['n_sentences'] = df.body.apply(lambda x: len(nltk.sent_tokenize(x)))
    df['n_words'] = df.body.apply(lambda x: len(nltk.word_tokenize(x)))
    df['n_characters'] = df.body.apply(lambda x: len(x))
    return df

def get_paper_nums(paper_file_names):
    '''
    Takes a list of federalist paper file names and returns the paper number from each file name in the same order.
    '''
    return [int(re.findall("\d+", f_name)[0]) for f_name in paper_file_names]

In [6]:
## Creating dataframes
train_df = create_bow_df(X_train, y_train)
test_df = create_bow_df(test_X, test_y)
eval_df = create_bow_df(eval_X, eval_y)

## Adding paper numbers
train_df['paper_nums'] = get_paper_nums(train_file_names)
test_df['paper_nums'] = get_paper_nums(test_file_names)
eval_df['paper_nums'] = get_paper_nums(eval_file_names)

# restricting eval dataframe to unkown papers
unknown_papers = [18,19,20,49,50,51,52,53,54,55,56,57,58,62,63]
eval_df = eval_df[eval_df['paper_nums'].isin(unknown_papers)]

# combining train and test into a single dataframe
train_test_df = train_df.append(test_df)

## Adding counts for sentences, words, and characters
train_df = add_df_data_counts(train_df)
test_df = add_df_data_counts(test_df)
eval_df = add_df_data_counts(eval_df)
train_test_df = add_df_data_counts(train_test_df)

#### Viewing data counts

In [7]:
train_df.groupby('author')[['n_sentences', 'n_words', 'n_characters']].sum()

Unnamed: 0_level_0,n_sentences,n_words,n_characters
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alexander_hamilton,654,23481,127135
james_madison,568,19327,105476


In [8]:
test_df.groupby('author')[['n_sentences', 'n_words', 'n_characters']].sum()

Unnamed: 0_level_0,n_sentences,n_words,n_characters
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alexander_hamilton,1586,58086,314910
james_madison,530,21043,116343


In [9]:
eval_df.groupby('author')[['n_sentences', 'n_words', 'n_characters']].sum()

Unnamed: 0_level_0,n_sentences,n_words,n_characters
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
james_madison,1013,33201,182512


In [10]:
train_test_df.groupby('author')[['n_sentences', 'n_words', 'n_characters']].sum()

Unnamed: 0_level_0,n_sentences,n_words,n_characters
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alexander_hamilton,2240,81567,442045
james_madison,1098,40370,221819


# BOW Classifiers

In [11]:
def train_BOW(training_df, stop_words=None, use_tfidf=False):
    # Selecting appropriate vectorizor type
    if use_tfidf:
        vectorizor = sk.feature_extraction.text.TfidfVectorizer
    else:
        vectorizor = sk.feature_extraction.text.CountVectorizer
        
    # Creating count vectorizor
    cv = vectorizor(analyzer = "word", stop_words=stop_words)
    
    # fitting bag of words model and learning the vocabulary
    train_features = cv.fit_transform(training_df.body)
    vocab = cv.get_feature_names()
    
    # training model
    clf = MultinomialNB()
    clf.fit(train_features, training_df.label)
    MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
    
    print('Learned vocab size:', len(vocab))
    print('Shape of term-document matrix [n_samples, vocabulary_size]:', train_features.shape)
    
    return vocab, cv, clf

def predict_BOW(prediction_df, cv, clf):
    pred_features = cv.transform(prediction_df.body)
    predictions = clf.predict(pred_features)
    if 'label' in prediction_df.columns:
        actual = np.array(prediction_df.label)
        print '\nPredicted and Actual'
    else:
        actual = None
        print '\Predicted, no Actual for reference'
    
    correct = sum(predictions == actual)
    accuracy = float(correct) / len(predictions)
    
    # creating dataframe of results with author names
    results_df = prediction_df[['paper_nums','author_last']].copy()
    results_df.rename(columns={'author_last':'actual'}, inplace=True)
    results_df['predictions'] = [id_to_author(author_to_id, pred_id) for pred_id in predictions]
    results_df['predictions'] = results_df['predictions'].apply(lambda name: name.split('_')[1])
    results_df['is_correcct'] = results_df['actual'] == results_df['predictions']
    
    print predictions
    print actual
    print '\nAccuracy: ', "%.2f" % round(accuracy*100,2), '%'
    return predictions, actual, correct, accuracy, results_df

## Predicting on Test Papers
Training is on all four combinations of None/english stop words and Tfdif/no Tfdif

In [12]:
vocab, cv, clf = train_BOW(train_df, stop_words=None, use_tfidf=False)
predictions, actual, correct, accuracy, results_df = predict_BOW(test_df, cv, clf)

('Learned vocab size:', 4457)
('Shape of term-document matrix [n_samples, vocabulary_size]:', (16, 4457))

Predicted and Actual
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1]

Accuracy:  84.38 %


In [13]:
vocab, cv, clf = train_BOW(train_df, stop_words='english', use_tfidf=False)
predictions, actual, correct, accuracy, results_df = predict_BOW(test_df, cv, clf)

('Learned vocab size:', 4214)
('Shape of term-document matrix [n_samples, vocabulary_size]:', (16, 4214))

Predicted and Actual
[0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1]

Accuracy:  75.00 %


In [14]:
vocab, cv, clf = train_BOW(train_df, stop_words=None, use_tfidf=True)
predictions, actual, correct, accuracy, results_df = predict_BOW(test_df, cv, clf)

('Learned vocab size:', 4457)
('Shape of term-document matrix [n_samples, vocabulary_size]:', (16, 4457))

Predicted and Actual
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1]

Accuracy:  78.13 %


In [15]:
vocab, cv, clf = train_BOW(train_df, stop_words='english', use_tfidf=True)
predictions, actual, correct, accuracy, results_df = predict_BOW(test_df, cv, clf)

('Learned vocab size:', 4214)
('Shape of term-document matrix [n_samples, vocabulary_size]:', (16, 4214))

Predicted and Actual
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1]

Accuracy:  78.13 %


## Predicting on Unkown Papers
Training is on all four combinations of None/english stop words and Tfdif/no Tfdif

In [16]:
vocab, cv, clf = train_BOW(train_df, stop_words=None, use_tfidf=False)
predictions, actual, correct, accuracy, results_df = predict_BOW(eval_df, cv, clf)

('Learned vocab size:', 4457)
('Shape of term-document matrix [n_samples, vocabulary_size]:', (16, 4457))

Predicted and Actual
[1 1 1 0 1 1 1 0 0 1 0 0 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

Accuracy:  66.67 %


In [17]:
vocab, cv, clf = train_BOW(train_df, stop_words='english', use_tfidf=False)
predictions, actual, correct, accuracy, results_df = predict_BOW(eval_df, cv, clf)

('Learned vocab size:', 4214)
('Shape of term-document matrix [n_samples, vocabulary_size]:', (16, 4214))

Predicted and Actual
[1 1 1 1 1 1 0 0 0 1 0 0 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

Accuracy:  66.67 %


In [18]:
vocab, cv, clf = train_BOW(train_df, stop_words=None, use_tfidf=True)
predictions, actual, correct, accuracy, results_df = predict_BOW(eval_df, cv, clf)

('Learned vocab size:', 4457)
('Shape of term-document matrix [n_samples, vocabulary_size]:', (16, 4457))

Predicted and Actual
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

Accuracy:  0.00 %


In [19]:
vocab, cv, clf = train_BOW(train_df, stop_words='english', use_tfidf=True)
predictions, actual, correct, accuracy, results_df = predict_BOW(eval_df, cv, clf)

('Learned vocab size:', 4214)
('Shape of term-document matrix [n_samples, vocabulary_size]:', (16, 4214))

Predicted and Actual
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

Accuracy:  0.00 %
