NLP - Author Attribution

- Data Prep
- Corpora
- Crossvalidation
- Baseline
- Classifiers

In [1]:
import pandas as pd
import nltk
import glob
import string
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import CategoricalNB, MultinomialNB
from sklearn.datasets import load_files
from sklearn import metrics


In [64]:
# Please make sure you have the following parts downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/d4ve/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Use a quick bash loop to create csv files with bag of words for all training books.
Also clean all txt files by removing portuguse accents, removing all spacing and lower the casing.

In [65]:
%%bash
rm -rf trainData/*/*tokenised.csv
rm -rf trainData/*/*clean.txt
for dir in trainData/*/; do
    for book in $dir*.txt; do
        printf "Tokenising: $book\n"
        tr -sc 'A-Za-z' '\n' < "$book" | tr A-Z a-z | sort | uniq -c | sort -nr | awk '{print $1, $2}' | tr " " "," > "${book%.*}"_tokenised.csv
        printf "Cleaning up: $book \n\n"
        iconv -f utf8 -t ascii//TRANSLIT "$book" | tr -sc 'A-Za-z0-9' ' ' | tr A-Z a-z  > "${book%.*}"_clean.txt
    done
done

Tokenising: trainData/almadaNegreiros/pg22615.txt
Cleaning up: trainData/almadaNegreiros/pg22615.txt 

Tokenising: trainData/almadaNegreiros/pg22730.txt
Cleaning up: trainData/almadaNegreiros/pg22730.txt 

Tokenising: trainData/almadaNegreiros/pg22801.txt
Cleaning up: trainData/almadaNegreiros/pg22801.txt 

Tokenising: trainData/almadaNegreiros/pg22802.txt
Cleaning up: trainData/almadaNegreiros/pg22802.txt 

Tokenising: trainData/almadaNegreiros/pg22969.txt
Cleaning up: trainData/almadaNegreiros/pg22969.txt 

Tokenising: trainData/almadaNegreiros/pg23133.txt
Cleaning up: trainData/almadaNegreiros/pg23133.txt 

Tokenising: trainData/almadaNegreiros/pg23620.txt
Cleaning up: trainData/almadaNegreiros/pg23620.txt 

Tokenising: trainData/almadaNegreiros/pg23879.txt
Cleaning up: trainData/almadaNegreiros/pg23879.txt 

Tokenising: trainData/almadaNegreiros/pg23961.txt
Cleaning up: trainData/almadaNegreiros/pg23961.txt 

Tokenising: trainData/almadaNegreiros/tesst.txt
Cleaning up: trainData/al

In [44]:
authors = {
            1: "almadaNegreiros",
            2: "ecaDeQueiros",
            3: "joseSaramago",
            4: "camiloCasteloBranco",
            5: "joseRodriguesSantos",
            6: "luisaMarquesSilva"}

# NLTK Tools
stopwords = list(nltk.corpus.stopwords.words('portuguese'))
stemmer = nltk.stem.RSLPStemmer()

# Training data paths
paths = glob.glob('trainData/*/')

In [46]:
def map_author(path):
    """Determine the author of a book through its ffile path."""
    for key, author in authors.items():
        if author in path:
            return author
        
def clean_doc(doc, stopwords=True):
    doc = stem_doc(doc)
    if stopwords == True:
        doc = stop_doc(doc)
    return doc

def stem_doc(doc):
    """Takes a document, splits it up and stemms each word - then
    remerges the document together and returns it."""
    doc_split = doc.split()
    stem = [stemmer.stem(str(i)) for i in doc_split]
    doc = ' '.join(stem)
    return doc

def stop_doc(doc):
    """Takes a document and removes all stopwords from it"""
    doc_split = doc.split()
    temp = [i for i in doc_split if i not in stopwords]
    doc = ' '.join(temp)
    return doc

In [48]:
from itertools import count

# Write to path - keep data in line with sklearn load data
w_path = 'cleanData/'

# n defines words to have in each doc
n = 500

# Get the corpus for each author and split them in 500 word files
# save them in the cleanData folder.
for path in paths:
    corpus = open(f"{path + 'corpus.txt'}", "r").read()
    corpus = clean_doc(corpus)
    corpus.split()
    splits = len(corpus) // n
    if len(corpus) % n > 0: splits += 1
    cut = n
    filename = ("/corpus_part_%03i.txt" % i for i in count(1))
    for i in range(splits):
        seg = ''.join(corpus[(n*i):cut])
        with open(w_path + map_author(path) + next(filename), "w") as file:
            file.write(seg)
        cut += n

In [49]:
# Use sklearn load_data it will deduce the target variables
# from the folder names - in our case the authors
book_data = load_files('cleanData/', encoding="UTF-8")

In [50]:
# Split into train and test set
X_train, X_test, y_train, y_test = train_test_split(book_data.data, book_data.target, random_state=0)

In [57]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=1.0e-10)),
])

In [58]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1e-10, class_prior=None, fit_prior=True))],
         verbose=False)

In [59]:
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.9416806722689076

In [60]:
print(metrics.classification_report(y_test, predicted, target_names=book_data.target_names))

                     precision    recall  f1-score   support

    almadaNegreiros       1.00      0.21      0.34        73
camiloCasteloBranco       0.94      0.99      0.97      1310
       ecaDeQueiros       0.96      0.92      0.94       794
joseRodriguesSantos       0.93      0.97      0.95      2013
       joseSaramago       0.94      0.94      0.94      1686
  luisaMarquesSilva       1.00      0.18      0.30        74

           accuracy                           0.94      5950
          macro avg       0.96      0.70      0.74      5950
       weighted avg       0.94      0.94      0.94      5950



In [61]:
metrics.confusion_matrix(y_test, predicted)

array([[  15,   20,   19,   10,    9,    0],
       [   0, 1303,    6,    1,    0,    0],
       [   0,   50,  731,    3,   10,    0],
       [   0,    1,    0, 1961,   51,    0],
       [   0,    5,    4,   97, 1580,    0],
       [   0,    1,    0,   34,   26,   13]])

In [62]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=0.01, random_state=42,
                          max_iter=5, tol=None)),
])

text_clf.fit(X_train, y_train)

predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.9329411764705883

In [63]:
print(metrics.classification_report(y_test, predicted, target_names=book_data.target_names))

                     precision    recall  f1-score   support

    almadaNegreiros       1.00      0.01      0.03        73
camiloCasteloBranco       0.91      0.98      0.95      1310
       ecaDeQueiros       0.99      0.80      0.89       794
joseRodriguesSantos       0.95      0.98      0.96      2013
       joseSaramago       0.91      0.97      0.94      1686
  luisaMarquesSilva       1.00      0.04      0.08        74

           accuracy                           0.93      5950
          macro avg       0.96      0.63      0.64      5950
       weighted avg       0.94      0.93      0.92      5950



In [64]:
metrics.confusion_matrix(y_test, predicted)

array([[   1,   27,    4,   10,   31,    0],
       [   0, 1289,    1,    7,   13,    0],
       [   0,   94,  636,   21,   43,    0],
       [   0,    3,    0, 1980,   30,    0],
       [   0,    1,    0,   43, 1642,    0],
       [   0,    2,    0,   33,   36,    3]])

In [19]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [22]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

In [26]:
gs_clf.best_params_

{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}