# TxMM Assignment 4 - Authorship Attribution

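This notebook extracts stylometric features from the PAN fanfiction dataset and collects them in an analytics base table for authorship attribution.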

## 1. Load data

In [None]:
import math
import os
import statistics
import string
from collections import Counter
from typing import List, Optional

import contractions
import nltk
import nltk.tokenize.destructive
import pandas as pd

# Fetch the NLTK resources this notebook requests, in the original order.
for nltk_resource in ("punkt", "tagsets", "averaged_perceptron_tagger"):
    nltk.download(nltk_resource)

# All dataset CSV files are read from (and the ABT is written to) the "Data"
# folder inside the current working directory.
data_dir = os.path.join(os.getcwd(), "Data")

def load_fanfiction_data(data_purpose: str) -> pd.DataFrame:
    """Reads one split of the fanfiction dataset from the data directory.

    Args:
        - `data_purpose`: The split name that is inserted into the file name
          `pan2324_{data_purpose}_data.csv` (this notebook uses "train", "test" and "dev").

    Returns:
        The CSV contents as a data frame that uses the first CSV column as its index
        and is sorted by that index.
    """
    csv_file_name = f"pan2324_{data_purpose}_data.csv"
    frame = pd.read_csv(os.path.join(data_dir, csv_file_name), index_col=0)

    return frame.sort_index()

# Load the three dataset splits in the order train, test, dev.
train_data, test_data, dev_data = (
    load_fanfiction_data(split_name) for split_name in ("train", "test", "dev")
)

# Show the first rows of the training split as the cell output.
train_data.head()


## 2. Text preprocessing

Before features can be extracted from the text data, some preprocessing steps may be required. To this end, a collection of text preprocessing functions is defined here.

In [None]:
def _remove_punctuation(word_tokenized_text: list[str]) -> list[str]:
    word_tokenized_text_no_punctuation = []
    
    for word in word_tokenized_text:
        if word in list(string.punctuation):
            continue
        
        word_tokenized_text_no_punctuation.append(word)
        
    return word_tokenized_text_no_punctuation

def _remove_digits(word_tokenized_text: list[str]) -> list[str]:
    word_tokenized_text_no_digits = []
    
    for word in word_tokenized_text:
        if word in list(string.digits):
            continue
        
        word_tokenized_text_no_digits.append(word)
        
    return word_tokenized_text_no_digits

## 3. Feature extraction

For this assignment, a set of different features is used. These features can be divided into a number of feature categories.

In [None]:
# Hand-written sample text together with its word and sentence tokenizations
# (presumably used to try out the feature functions below).
val_text = (
    "I'm looking for a research project where I can study the language "
    "Esperanto from a linguistic perspective. "
    "That subject's called Esperantology. "
    "I think that's quite nifty :). "
    "10/10 idea. "
    "Gabbagool!!!"
)
val_text_word_tokenized = nltk.word_tokenize(val_text)
val_text_sent_tokenized = nltk.sent_tokenize(val_text)

### 3.1. Count features

Count features are lexical features that describe the count of something within the text, such as the number of words or sentences in a text.

In [None]:
def character_count(text: str) -> int:
    """Counts every character of the given string.

    Args:
        - `text`: The string whose characters are counted.

    Returns:
        The length of `text`.
    """
    total_characters = len(text)

    return total_characters

def word_count(word_tokenized_text: List[str]) -> int:
    """Returns the number of tokens in an already word-tokenized text.

    This function does not tokenize anything itself; it only counts the elements it is
    given. Passing a raw string therefore yields the number of characters, not words.

    Args:
        - `word_tokenized_text`: The list of word tokens to count (e.g. the output of NLTK's `word_tokenize`).

    Returns:
        The number of elements in `word_tokenized_text`.
    """

    return len(word_tokenized_text)

def sentence_count(sentence_tokenized_text: List[str]) -> int:
    """Returns the number of sentences in an already sentence-tokenized text.

    This function does not tokenize anything itself; it only counts the elements it is
    given. Passing a raw string therefore yields the number of characters, not sentences.

    Args:
        - `sentence_tokenized_text`: The list of sentences to count (e.g. the output of NLTK's `sent_tokenize`).

    Returns:
        The number of elements in `sentence_tokenized_text`.
    """
    return len(sentence_tokenized_text)

def punctuation_count(text: str) -> int:
    """Returns the count of punctuation occurrences in the input string.

    A character counts as punctuation when it occurs in `string.punctuation`.

    Args:
        - `text`: The string to compute the amount of punctuation occurrences of.

    Returns:
        The number of times a form of punctuation occurs in the `text` parameter.
    """
    # Build the lookup set once instead of once per character.
    punctuation_characters = set(string.punctuation)

    return sum(1 for character in text if character in punctuation_characters)

def digit_count(text: str) -> int:
    """Returns the count of individual digits occurring in the input string.
    Note that digits are counted, not numbers: "23" yields 2, since the number 23 consists of two digits.

    A character counts as a digit when it occurs in `string.digits`.

    Args:
        - `text`: The string to compute the amount of digit occurrences of.

    Returns:
        The number of times a digit occurs in the `text` parameter.
    """
    # Build the lookup set once instead of once per character.
    digit_characters = set(string.digits)

    return sum(1 for character in text if character in digit_characters)

def uppercase_count(text: str) -> int:
    """Returns the count of uppercase characters occurring in the input string.
    Note that only the uppercase characters in `string.ascii_uppercase` are counted.

    Args:
        - `text`: The string to compute the amount of uppercase character occurrences of.

    Returns:
        The number of times an uppercase character occurs in the `text` parameter.
    """
    # Build the lookup set once instead of once per character.
    uppercase_characters = set(string.ascii_uppercase)

    return sum(1 for character in text if character in uppercase_characters)

def short_word_count(word_tokenized_text: List[str], short_word_max_length: int = 4) -> int:
    """Counts the tokens that are at most `short_word_max_length` characters long.

    Args:
        - `word_tokenized_text`: The list of word tokens to inspect.
        - `short_word_max_length`: The largest length that still counts as "short" (inclusive).

    Returns:
        The number of tokens in `word_tokenized_text` whose length does not exceed `short_word_max_length`.
    """
    return sum(1 for token in word_tokenized_text if len(token) <= short_word_max_length)

def alphabet_count(text: str, include_uppercase: bool = True, include_punctuation: bool = True, include_digits: bool = True) -> int:
    """Returns the length of the alphabet of the given text. A text's alphabet is defined as all unique characters that occur in that text.

    Args:
        - `text`: The string for which to compute an alphabet.
        - `include_uppercase`: A boolean used to determine whether to count uppercase characters as separate from their lowercase counterparts. When `False`, the text is lowercased first.
        - `include_punctuation`: A boolean used to determine whether to include punctuation (`string.punctuation`) in the alphabet.
        - `include_digits`: A boolean used to determine whether to include digits (`string.digits`) in the alphabet.

    Returns:
        The length of the alphabet of the `text` variable.
    """
    if not include_uppercase:
        text = text.lower()

    text_char_alphabet = set(text)

    # Set difference removes the unwanted character classes in one step each.
    if not include_punctuation:
        text_char_alphabet -= set(string.punctuation)

    if not include_digits:
        text_char_alphabet -= set(string.digits)

    return len(text_char_alphabet)

def contraction_count(text: str, include_genetive_count: bool = True) -> int:
    """Returns the count of all contractions that occur in the given string.

    Args:
        - `text`: The raw (untokenized) string for which to count the number of occurring contractions.
        - `include_genetive_count`: A boolean used to determine if occurrences of the genitive should count towards the number of contractions found.

    Returns:
        The number of entries returned by `contractions.preview(text, 1)`, plus, when
        `include_genetive_count` is set, the number of tokens that NLTK tags as "POS".
    """
    contraction_total = len(contractions.preview(text, 1))

    if include_genetive_count:
        # The text is tokenized and tagged here because this function receives the raw string.
        tagged_tokens = nltk.tag.pos_tag(nltk.tokenize.word_tokenize(text))
        contraction_total += sum(1 for _, tag in tagged_tokens if tag == "POS")

    return contraction_total

def word_without_vowels_count(word_tokenized_text: List[str], include_y_as_vowel: bool = False) -> int:
    """Returns the count of words in the input tokens that do not contain vowels.

    The tokens are first filtered with `_remove_punctuation` and `_remove_digits`.
    Vowels are matched case-insensitively, so a word such as "I" is not counted
    as vowelless.

    Args:
        - `word_tokenized_text`: The list of word tokens in which to count the words without vowels.
        - `include_y_as_vowel`: A boolean used to determine if the character "y" should be counted as a vowel.

    Returns:
        The number of remaining tokens that contain none of the vowels.
    """
    vowels = set("aeiou")

    if include_y_as_vowel:
        vowels.add("y")

    candidate_words = _remove_punctuation(word_tokenized_text)
    candidate_words = _remove_digits(candidate_words)

    # Lowercase each word so that uppercase vowels are recognised too.
    return sum(1 for word in candidate_words if vowels.isdisjoint(word.lower()))

def hapax_legomenon_count(word_tokenized_text: List[str]) -> int:
    """Returns the count of hapax legomena in the input text.
    A hapax legomenon is a word that occurs only once in a corpus.
    Note that for the purposes of this function, the corpus is considered to be the input text.

    Tokens are lowercased before counting, so "The" and "the" are the same word.

    Args:
        - `word_tokenized_text`: The list of word tokens for which to count the hapax legomena.

    Returns:
        The number of hapax legomena that were found in the input text.
    """
    # One counting pass replaces calling `list.count` for every token.
    word_frequencies = Counter(word.lower() for word in word_tokenized_text)

    return sum(1 for frequency in word_frequencies.values() if frequency == 1)

### 3.2 Text complexity features

Text complexity features aim to describe the complexity of the given text. This may be at the lexical or the syntactical level.

In [None]:
def mean_word_length(word_tokenized_text: List[str]) -> float:
    """Returns the mean token length, rounded to three decimals.

    Args:
        - `word_tokenized_text`: The list of word tokens to measure.

    Returns:
        The mean of the token lengths, or `0.0` when there are no tokens
        (instead of letting `statistics.mean` raise on empty input).
    """
    word_lengths = [len(word) for word in word_tokenized_text]

    if not word_lengths:
        return 0.0

    return round(statistics.mean(word_lengths), 3)

def mean_sentence_length(word_tokenized_text: List[str]) -> float:
    """Returns the mean sentence length, rounded to three decimals.

    Despite the parameter name, each element is treated as one sentence and measured
    with `len`; for a list of sentence strings this is the length in characters.

    Args:
        - `word_tokenized_text`: The list of sentences to measure (e.g. the output of NLTK's `sent_tokenize`).

    Returns:
        The mean of the element lengths, or `0.0` when the list is empty
        (instead of letting `statistics.mean` raise on empty input).
    """
    sentence_lengths = [len(sentence) for sentence in word_tokenized_text]

    if not sentence_lengths:
        return 0.0

    return round(statistics.mean(sentence_lengths), 3)

def word_length_standard_deviation(word_tokenized_text: List[str]) -> float:
    """Returns the sample standard deviation of the token lengths, rounded to three decimals.

    Args:
        - `word_tokenized_text`: The list of word tokens to measure.

    Returns:
        The result of `statistics.stdev` over the token lengths, or `0.0` when there are
        fewer than two tokens (`statistics.stdev` raises for fewer than two data points).
    """
    word_lengths = [len(word) for word in word_tokenized_text]

    if len(word_lengths) < 2:
        return 0.0

    return round(statistics.stdev(word_lengths), 3)

def sentence_length_standard_deviation(word_tokenized_text: List[str]) -> float:
    """Returns the sample standard deviation of the sentence lengths, rounded to three decimals.

    Despite the parameter name, each element is treated as one sentence and measured
    with `len`; for a list of sentence strings this is the length in characters.

    Args:
        - `word_tokenized_text`: The list of sentences to measure (e.g. the output of NLTK's `sent_tokenize`).

    Returns:
        The result of `statistics.stdev` over the element lengths, or `0.0` when there are
        fewer than two elements (`statistics.stdev` raises for fewer than two data points).
    """
    sentence_lengths = [len(sentence) for sentence in word_tokenized_text]

    if len(sentence_lengths) < 2:
        return 0.0

    return round(statistics.stdev(sentence_lengths), 3)

def mean_word_frequency(word_tokenized_text: List[str]) -> float:
    """Returns the mean number of occurrences per distinct token, rounded to three decimals.

    Tokens are compared case-sensitively, so "The" and "the" are counted separately.

    Args:
        - `word_tokenized_text`: The list of word tokens to analyse.

    Returns:
        The mean of the per-token occurrence counts, or `0.0` when there are no tokens
        (instead of letting `statistics.mean` raise on empty input).
    """
    word_frequencies = Counter(word_tokenized_text)

    if not word_frequencies:
        return 0.0

    return round(statistics.mean(list(word_frequencies.values())), 3)

def lexical_diversity_coefficient(word_tokenized_text: List[str]) -> float:
    """Returns the ratio of distinct words to all words, rounded to three decimals.

    From http://repository.utm.md/handle/5014/20225

    The numerator is the number of distinct (lowercased) tokens, not the number of
    hapax legomena: a word that occurs several times still contributes once.
    NOTE(review): confirm against the cited source that "unique words" means distinct words.

    Args:
        - `word_tokenized_text`: The list of word tokens to analyse.

    Returns:
        Distinct token count divided by total token count, or `0.0` for an empty list
        (avoiding a division by zero).
    """
    total_word_count = len(word_tokenized_text)

    if total_word_count == 0:
        return 0.0

    distinct_word_count = len({word.lower() for word in word_tokenized_text})

    return round(distinct_word_count / total_word_count, 3)

def syntactic_complexity_coefficient(
    word_tokenized_text: List[str],
    sentence_tokenized_text: Optional[List[str]] = None,
) -> float:
    """Returns `1 - sentence count / word count`, rounded to three decimals.

    From http://repository.utm.md/handle/5014/20225

    The sentence count must not be taken from the word tokens themselves, because that
    makes the ratio 1 and the coefficient always 0. When no sentence list is supplied,
    the sentence count is estimated from the word tokens instead: a sentence is a run
    of tokens that ends at one or more ".", "!" or "?" tokens, or at the end of the list.

    Args:
        - `word_tokenized_text`: The list of word tokens of the text.
        - `sentence_tokenized_text`: Optional list of the sentences of the same text. When given, its length is used as the sentence count.

    Returns:
        The coefficient, or `0.0` for an empty token list (avoiding a division by zero).
    """
    total_word_count = len(word_tokenized_text)

    if total_word_count == 0:
        return 0.0

    if sentence_tokenized_text is not None:
        total_sentence_count = len(sentence_tokenized_text)
    else:
        sentence_terminators = {".", "!", "?"}
        total_sentence_count = 0
        inside_sentence = False

        for token in word_tokenized_text:
            if token in sentence_terminators:
                # Repeated terminators ("!", "!", "!") close the same sentence only once.
                if inside_sentence:
                    total_sentence_count += 1
                    inside_sentence = False
            else:
                inside_sentence = True

        # A trailing run without a terminator still counts as a sentence.
        if inside_sentence:
            total_sentence_count += 1

    return round(1 - total_sentence_count / total_word_count, 3)

def herdans_log_type_token_richness(word_tokenized_text: List[str]) -> float:
    """Returns `log(distinct word count) / log(total word count)`, rounded to three decimals.

    From https://pubs.asha.org/doi/abs/10.1044/jshr.3203.536

    The numerator uses the number of distinct (lowercased) tokens, i.e. the word types,
    rather than the number of hapax legomena.

    Args:
        - `word_tokenized_text`: The list of word tokens to analyse.

    Returns:
        The log type-token ratio, or `0.0` when there are fewer than two tokens
        (the logarithm of the token count would be undefined or zero).
    """
    total_word_count = len(word_tokenized_text)

    if total_word_count < 2:
        return 0.0

    distinct_word_count = len({word.lower() for word in word_tokenized_text})

    return round(math.log(distinct_word_count) / math.log(total_word_count), 3)

### 3.3 Part-of-speech features

These features describe the ratios of words that belong to a specific part-of-speech.

In [None]:
def _pos_tag_ratio(text_word_tokenized: List[str], pos_tags: List[str]) -> float:
    """Returns the fraction of tokens whose part-of-speech tag is one of `pos_tags`.

    Args:
        - `text_word_tokenized`: The list of word tokens to tag with `nltk.pos_tag`.
        - `pos_tags`: The tags to count. A single tag passed as a bare string is accepted
          and treated as a one-element collection.

    Returns:
        The number of matching tokens divided by the number of tokens, or `0.0` for an
        empty token list (avoiding a division by zero).
    """
    if len(text_word_tokenized) == 0:
        return 0.0

    # `("VBP")` is just the string "VBP"; testing `tag in "VBP"` would be a substring
    # test that also matches "VB". Wrap bare strings so that matching is always exact.
    if isinstance(pos_tags, str):
        pos_tags = (pos_tags,)

    wanted_tags = frozenset(pos_tags)
    pos_tagged_text = nltk.pos_tag(text_word_tokenized)
    matching_tag_count = sum(1 for _, tag in pos_tagged_text if tag in wanted_tags)

    return matching_tag_count / len(pos_tagged_text)

def noun_common_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "NN" or "NNS" (common nouns).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for these tags.
    """
    return _pos_tag_ratio(text_word_tokenized, ("NN", "NNS"))

def noun_proper_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "NNP" or "NNPS" (proper nouns).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for these tags.
    """
    return _pos_tag_ratio(text_word_tokenized, ("NNP", "NNPS"))

def adjective_base_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "JJ" (base-form adjectives).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("JJ",))

def adjective_comparative_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "JJR" (comparative adjectives).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("JJR",))

def adjective_superlative_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "JJS" (superlative adjectives).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("JJS",))

def adverb_base_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "RB" (base-form adverbs).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("RB",))

def adverb_comparative_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "RBR" (comparative adverbs).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("RBR",))

def adverb_superlative_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "RBS" (superlative adverbs).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("RBS",))

def verb_infinitive_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "VB" (base-form verbs).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("VB",))

def verb_present_tense_1st_2nd_person_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "VBP" (present-tense verbs, not third person singular).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("VBP",))

def verb_present_tense_3rd_person_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "VBZ" (present-tense verbs, third person singular).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("VBZ",))

def verb_past_tense_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "VBD" (past-tense verbs).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("VBD",))

def verb_present_participle(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "VBG" (present participles / gerunds).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("VBG",))

def verb_past_participle(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "VBN" (past participles).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("VBN",))

def verb_modal_auxiliary(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "MD" (modal auxiliaries).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("MD",))

def pronoun_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "PRP" or "PRP$" (personal and possessive pronouns).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for these tags.
    """
    return _pos_tag_ratio(text_word_tokenized, ("PRP", "PRP$"))

def genetive_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "POS" (genitive markers).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("POS",))

def interjection_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "UH" (interjections).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("UH",))

def foreign_word_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "FW" (foreign words).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("FW",))

def numeral_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "CD" (cardinal numerals).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for this tag.
    """
    # The trailing comma makes this a one-element tuple rather than a bare string.
    return _pos_tag_ratio(text_word_tokenized, ("CD",))

def parenthesis_ratio(text_word_tokenized: List[str]) -> float:
    """Returns the fraction of tokens tagged "(" or ")" (opening and closing parentheses).

    Args:
        - `text_word_tokenized`: The list of word tokens to analyse.

    Returns:
        The ratio computed by `_pos_tag_ratio` for these tags.
    """
    return _pos_tag_ratio(text_word_tokenized, ("(", ")"))

## 4. Analytics base table

The analytics base table, ABT for short, is the table that contains all the extracted features for all instances of the dataset. The analytics base table is fed to a machine learning algorithm in order to train a model. Here, the ABT for the fanfiction dataset is constructed.

In [10]:
# Tags behind every part-of-speech ratio column of the ABT. The keys are the names of
# the corresponding ratio functions in this notebook; every value is a real tuple so
# that tags are matched exactly.
_ABT_POS_TAG_GROUPS = {
    "noun_common_ratio": ("NN", "NNS"),
    "noun_proper_ratio": ("NNP", "NNPS"),
    "adjective_base_ratio": ("JJ",),
    "adjective_comparative_ratio": ("JJR",),
    "adjective_superlative_ratio": ("JJS",),
    "adverb_base_ratio": ("RB",),
    "adverb_comparative_ratio": ("RBR",),
    "adverb_superlative_ratio": ("RBS",),
    "verb_infinitive_ratio": ("VB",),
    "verb_present_tense_1st_2nd_person_ratio": ("VBP",),
    "verb_present_tense_3rd_person_ratio": ("VBZ",),
    "verb_past_tense_ratio": ("VBD",),
    "verb_present_participle": ("VBG",),
    "verb_past_participle": ("VBN",),
    "verb_modal_auxiliary": ("MD",),
    "pronoun_ratio": ("PRP", "PRP$"),
    "genetive_ratio": ("POS",),
    "interjection_ratio": ("UH",),
    "foreign_word_ratio": ("FW",),
    "numeral_ratio": ("CD",),
    "parenthesis_ratio": ("(", ")"),
}


def _pos_tag_ratio_features(word_tokenized_text: List[str]) -> dict:
    """Computes all part-of-speech ratios of one text from a single `nltk.pos_tag` call."""
    if len(word_tokenized_text) == 0:
        return {feature_name: 0.0 for feature_name in _ABT_POS_TAG_GROUPS}

    tag_counts = Counter(tag for _, tag in nltk.pos_tag(word_tokenized_text))
    total_tag_count = sum(tag_counts.values())

    return {
        feature_name: sum(tag_counts[tag] for tag in tag_group) / total_tag_count
        for feature_name, tag_group in _ABT_POS_TAG_GROUPS.items()
    }


def extract_abt(dataset: pd.DataFrame) -> pd.DataFrame:
    """Builds the analytics base table (ABT) for a dataset.

    Every feature becomes one column that is named after the function computing it.
    The input frame is not modified: the tokenized texts are kept in local variables.
    Each text is POS-tagged once and all part-of-speech ratios are derived from that
    single pass, instead of tagging the text again for every ratio.

    Args:
        - `dataset`: A data frame with a "text" column (raw strings) and an "author" column.

    Returns:
        A data frame that shares the index of `dataset` and holds all feature columns,
        followed by the "author" column.
    """
    texts = dataset["text"]
    word_tokens = texts.apply(nltk.word_tokenize)
    sentence_tokens = texts.apply(nltk.sent_tokenize)

    # Each feature function paired with the representation of the text it expects.
    feature_plan = [
        (character_count, texts),
        (word_count, word_tokens),
        (punctuation_count, texts),
        (digit_count, texts),
        (uppercase_count, texts),
        (short_word_count, word_tokens),
        (alphabet_count, texts),
        (word_without_vowels_count, word_tokens),
        (contraction_count, texts),
        (hapax_legomenon_count, word_tokens),
        (mean_word_length, word_tokens),
        (mean_sentence_length, sentence_tokens),
        (word_length_standard_deviation, word_tokens),
        (sentence_length_standard_deviation, sentence_tokens),
        (mean_word_frequency, word_tokens),
        (lexical_diversity_coefficient, word_tokens),
        (syntactic_complexity_coefficient, word_tokens),
        (herdans_log_type_token_richness, word_tokens),
    ]

    feature_columns = {
        feature_function.__name__: source.apply(feature_function)
        for feature_function, source in feature_plan
    }
    feature_df = pd.DataFrame(feature_columns, index=dataset.index)

    pos_feature_df = pd.DataFrame(
        word_tokens.apply(_pos_tag_ratio_features).tolist(),
        index=dataset.index,
        columns=list(_ABT_POS_TAG_GROUPS),
    )

    # `pd.concat` expects a list of objects; `axis=1` places them side by side.
    abt = pd.concat([feature_df, pos_feature_df, dataset["author"]], axis=1)

    return abt

# Build the analytics base table from the training split.
abt = extract_abt(train_data)

KeyboardInterrupt: 

In [None]:
# Store the analytics base table as "abt.csv" inside the data directory.
abt_csv_path = os.path.join(data_dir, "abt.csv")
abt.to_csv(abt_csv_path)

# TODO: Consolidate the POS tag ratio functions into a single function.
#       Each of them currently runs nltk.pos_tag on the same text again,
#       so every text is tagged many times over,
#       which makes building the ABT extremely slow.