In [1]:
import os
from multiprocessing import Pool

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from IPython.display import display

from feature_extraction import extract_features, get_feature_names
from text_processing import get_instances

In [2]:
# Directory holding the training documents; they are read below as '<index>.txt' files.
TRAIN_DOCS_DIR = './documents/set_I'
# Directory holding the held-out test documents, read the same way.
TEST_DOCS_DIR = './documents/set_J'

### Get instances from the training set

In [3]:
%%time
instances = []
for i in range(200):
    text_file = os.path.join(TRAIN_DOCS_DIR, f'{i}.txt')

    if not os.path.isfile(text_file):
        continue

    with open(text_file) as f:
        text = f.read()

    instances += get_instances(text, i)

CPU times: user 1.13 s, sys: 53.4 ms, total: 1.19 s
Wall time: 1.24 s


**Example instance:**

In [4]:
# Show the first extracted instance to illustrate the fields produced by get_instances.
display(instances[0])

Instance(term=['On'], label=False, pre=[], post=['their', 'way', 'to', 'perform', 'in', 'Guam', 'for', 'the', 'troops', ',', 'nightclub', 'performers', 'Duke', 'Mitchell', 'and', 'Sammy', 'Petrillo', 'find', 'themselves', 'stranded', 'on', 'a', 'seemingly', 'treacherous', 'island', ',', 'known', 'by', 'the', 'natives', 'as', '``', 'Kola', 'Kola', "''", '.'], term_pos=['OTHER'], pre_pos=[], post_pos=['OTHER', 'NOUN', 'OTHER', 'VERB', 'OTHER', 'NOUN', 'OTHER', 'OTHER', 'NOUN', 'OTHER', 'OTHER', 'NOUN', 'NOUN', 'NOUN', 'OTHER', 'NOUN', 'NOUN', 'VERB', 'OTHER', 'VERB', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'NOUN', 'OTHER', 'VERB', 'OTHER', 'OTHER', 'NOUN', 'OTHER', 'OTHER', 'NOUN', 'NOUN', 'OTHER', 'OTHER'], file_idx=0)

### Extract feature vectors from instances

In [5]:
%%time
feature_names = get_feature_names()

with Pool() as p:
    features = p.map(extract_features, instances)

data = pd.DataFrame(features, columns=feature_names)

CPU times: user 66.5 ms, sys: 61.1 ms, total: 128 ms
Wall time: 2.95 s


**Example feature vectors:**  
See the documentation in feature_extraction.py for a detailed explanation of each feature.

In [6]:
# Raise the column display limit so the wide feature table is not truncated.
pd.options.display.max_columns = 500
# Preview the first five feature rows.
display(data.head(5))

Unnamed: 0,n_words,avg_word_len,has_title_prefix,contains_title,has_preceding_article,contains_article,has_possessive_suffix,has_following_parenthesis,is_in_parenthesis,is_beginning_of_sentence,has_preceding_named,has_punctuation,has_stopwords,has_following_article,has_verb_nearby,has_pronoun_nearby,has_dictionary_word,is_all_dictionary_words,preceding_preposition,following_wh_word,prev_word_suffix,term_pos,next_word_pos,prev_word_pos,label
0,1,2,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,True,,,,OTHER,OTHER,,False
1,1,4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,in,,in,NOUN,OTHER,OTHER,False
2,1,4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,,,rs,NOUN,NOUN,NOUN,True
3,1,8,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,,,rs,NOUN,OTHER,NOUN,True
4,1,5,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True,,,nd,NOUN,NOUN,OTHER,True


### Preprocess data
Apply one-hot encoding to the categorical features and standardize the numeric features; all remaining columns (the boolean features and the label) are passed through unchanged.

In [7]:
%%time
numeric_features = data.dtypes[data.dtypes == int].keys().values.tolist()
categorical_features = data.dtypes[data.dtypes == object].keys().values.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numeric_features)],
    remainder='passthrough')

D_train = preprocessor.fit_transform(data)

X_train = D_train[:, :-1].toarray()
y_train = D_train[:, -1].astype(int).toarray().flatten()

CPU times: user 42.1 ms, sys: 16.3 ms, total: 58.4 ms
Wall time: 58.8 ms


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [8]:
# Sanity check: report the dimensions of the training matrix and label vector.
print(
    f'X_train shape: {X_train.shape}',
    f'y_train shape: {y_train.shape}',
    sep='\n',
)

X_train shape: (4043, 185)
y_train shape: (4043,)


### Perform cross-validation

In [9]:
%%time
classifiers = [DecisionTreeClassifier(random_state=0),
               RandomForestClassifier(random_state=0),
               SVC(kernel="linear", random_state=0),
               RidgeClassifier(random_state=0),
               LogisticRegression(solver='lbfgs', tol=1e-3, max_iter=400, random_state=0),
               SGDClassifier(loss="log", random_state=0),
               MLPClassifier(early_stopping=True, random_state=0),
               AdaBoostClassifier(random_state=0),
               KNeighborsClassifier(3)]

cv_scores = pd.DataFrame(columns=['Classifier', 'Precision', 'Recall', 'F1'])
for i, clf in enumerate(classifiers):
    s = cross_validate(clf, X_train, y_train, scoring=['recall', 'precision', 'f1'], cv=3, return_train_score=False)
    cv_scores.loc[i] = [clf.__class__.__name__,
                        s['test_precision'].mean(),
                        s['test_recall'].mean(),
                        s['test_f1'].mean()]



CPU times: user 16.5 s, sys: 65.3 ms, total: 16.5 s
Wall time: 7.65 s


In [10]:
# Render the table of mean cross-validated precision, recall and F1 per classifier.
display(cv_scores)

Unnamed: 0,Classifier,Precision,Recall,F1
0,DecisionTreeClassifier,0.87061,0.879043,0.874477
1,RandomForestClassifier,0.891753,0.90583,0.89862
2,SVC,0.885724,0.945575,0.914566
3,RidgeClassifier,0.878819,0.95335,0.914461
4,LogisticRegression,0.894846,0.941684,0.917546
5,SGDClassifier,0.921422,0.850974,0.884346
6,MLPClassifier,0.901715,0.938223,0.919375
7,AdaBoostClassifier,0.895683,0.938661,0.916342
8,KNeighborsClassifier,0.877466,0.897188,0.887216


### Select the best classifier and apply it to the test set

In [11]:
%%time
clf = MLPClassifier(early_stopping=True, random_state=0)
clf.fit(X_train, y_train)

instances_ = []

for i in range(200, 300):
    text_file = os.path.join(TEST_DOCS_DIR, f'{i}.txt')

    if not os.path.isfile(text_file):
        continue

    with open(text_file) as f:
        text = f.read()

    instances_ += get_instances(text, i)

with Pool() as p:
    features_ = p.map(extract_features, instances_)

data_ = pd.DataFrame(features_, columns=feature_names)

D_test = preprocessor.transform(data_)

X_test = D_test[:, :-1].toarray()
y_test = D_test[:, -1].astype(int).toarray().flatten()

y_test_predicted = clf.predict(X_test)

CPU times: user 3.34 s, sys: 79.8 ms, total: 3.42 s
Wall time: 2.16 s


  res = transformer.transform(X)


In [12]:
# Score the test-set predictions against the labels read from the test data.
print(
    f'precision: {precision_score(y_test, y_test_predicted)}',
    f'recall: {recall_score(y_test, y_test_predicted)}',
    f'f1: {f1_score(y_test, y_test_predicted)}',
    sep='\n',
)

precision: 0.9200359389038635
recall: 0.9126559714795008
f1: 0.916331096196868
