# TxMM Assignment 4 - Authorship Attribution

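This notebook loads the fanfiction data set and extracts stylometric features from its texts for authorship attribution.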

## 1. Load data

In [40]:
import os
import string

import pandas as pd
import nltk
import nltk.tokenize.destructive
import contractions

# Fetch the NLTK resources this notebook relies on; packages that are already
# installed are reported as up-to-date and left alone.
nltk.download("punkt")  # tokenizer data behind nltk.word_tokenize / nltk.sent_tokenize
nltk.download("tagsets")  # NOTE(review): not used by the code visible here - presumably for tag documentation, confirm
nltk.download("averaged_perceptron_tagger")  # tagger model behind nltk.tag.pos_tag

# All data files are looked up in a "Data" folder inside the current working directory.
data_dir = os.path.join(os.getcwd(), "Data")

def load_fanfiction_data(data_purpose: str) -> pd.DataFrame:
    """Loads one split of the fanfiction data set from the data directory.

    Args:
        - `data_purpose`: The split name that is substituted into the file name `pan2324_{data_purpose}_data.csv`.

    Returns:
        The contents of the CSV file as a data frame that uses the first CSV column as index and is sorted by that index.
    """
    csv_file_name = f"pan2324_{data_purpose}_data.csv"
    frame = pd.read_csv(os.path.join(data_dir, csv_file_name), index_col=0)

    return frame.sort_index()

# Load the three data splits; each call reads pan2324_<split>_data.csv from the data directory.
train_data = load_fanfiction_data("train")
test_data = load_fanfiction_data("test")
dev_data = load_fanfiction_data("dev")

# Show the first rows of the training split as a quick sanity check.
train_data.head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daanb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\daanb\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\daanb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


Unnamed: 0,text,author
1,"It""s got a lot of action, a lot of heart, come...",1276465
2,Hiro: (blushes in response) AniUniverse: (chuc...,1276465
3,Girl with the Baymax onesie: (stands up) Thank...,1276465
4,"I am too! Anyways, this is where we let the fa...",1276465
5,Do you like sushi? Do you find your name ironi...,1276465


## 2. Feature extraction

For this assignment, a set of different features is used. These features can be split up into a number of feature categories.

In [50]:
# Sample text containing contractions, a possessive, digits, an emoticon and repeated punctuation.
# NOTE(review): presumably meant for trying out the feature functions by hand - it is not referenced by the code visible here.
my_sentence = "I'm looking for a research project where I can study the language Esperanto from a linguistic perspective. That subject's called Esperantology. I think that's quite nifty :). 10/10 idea. Gabbagool!!!"

### 2.1. Count features

Count features are lexical features that describe the count of something within the text, like the count of words or sentences in a text.

In [77]:
def character_count(text: str) -> int:
    """Returns how many characters the input string consists of.
    Every character counts, including whitespace, digits and punctuation.

    Args:
        - `text`: The string to measure.

    Returns:
        The length of the `text` parameter in characters.
    """
    total_characters = len(text)

    return total_characters

def word_count(text: str) -> int:
    """Returns the number of tokens that NLTK's `word_tokenize` function produces for the input string.
    Note that the tokenizer's output is counted as-is, so every token it returns counts as a word.

    Args:
        - `text`: The string to count the words of.

    Returns:
        The number of tokens found in the `text` parameter.
    """
    tokens = nltk.word_tokenize(text)

    return len(tokens)

def sentence_count(text: str) -> int:
    """Returns the number of sentences that NLTK's `sent_tokenize` function finds in the input string.

    Args:
        - `text`: The string to count the sentences of.

    Returns:
        The number of sentences found in the `text` parameter.
    """
    sentences = nltk.sent_tokenize(text)

    return len(sentences)

def punctuation_count(text: str) -> int:
    """Returns the count of punctuation occurrences in the input string.
    Note that only the ASCII punctuation characters listed in `string.punctuation` are counted.

    Args:
        - `text`: The string to compute the amount of punctuation occurrences of.

    Returns:
        The number of times a form of punctuation occurs in the `text` parameter.
    """
    # Build the lookup set once, instead of rebuilding a list for every character.
    punctuation_characters = frozenset(string.punctuation)

    return sum(1 for character in text if character in punctuation_characters)

def digit_count(text: str) -> int:
    """Returns the count of individual digits occurring in the input string.
    Note that digits are counted and not numbers, e.g., "23" yields 2, since the number 23 consists of two digits.
    Only the ASCII digits listed in `string.digits` are counted.

    Args:
        - `text`: The string to compute the amount of digit occurrences of.

    Returns:
        The number of times a digit occurs in the `text` parameter.
    """
    # Build the lookup set once, instead of rebuilding a list for every character.
    digit_characters = frozenset(string.digits)

    return sum(1 for character in text if character in digit_characters)

def uppercase_count(text: str) -> int:
    """Returns the count of uppercase characters occurring in the input string.
    Note that only uppercase characters that are part of ASCII are supported.

    Args:
        - `text`: The string to compute the amount of uppercase character occurrences of.

    Returns:
        The number of times an uppercase character occurs in the `text` parameter.
    """
    # Build the lookup set once, instead of rebuilding a list for every character.
    uppercase_characters = frozenset(string.ascii_uppercase)

    return sum(1 for character in text if character in uppercase_characters)

def short_word_count(text: str, short_word_max_length: int = 4) -> int:
    """Returns the count of "short" words that occur in the `text` parameter.
    The cutoff point for "short" words is given by the `short_word_max_length` parameter.
    Words are extracted using NLTK's `word_tokenize` function.
    Tokens that consist solely of punctuation characters are not considered to be words.

    Args:
        - `text`: The string to compute the amount of "short" words of.
        - `short_word_max_length`: The maximum length of what is considered to be a "short" word. This length is inclusive.

    Returns:
        The number of times a "short" word occurs in the `text` parameter.
    """
    punctuation_characters = frozenset(string.punctuation)
    short_words = 0

    for token in nltk.word_tokenize(text):
        # Comparing the whole token against single punctuation characters would let
        # multi-character punctuation tokens (e.g. "...", "``" or "''") slip through
        # and be counted as short words, so every character of the token is checked.
        if all(character in punctuation_characters for character in token):
            continue

        if len(token) <= short_word_max_length:
            short_words += 1

    return short_words

def alphabet_count(text: str, count_uppercase: bool = True, count_punctuation: bool = True, count_digits: bool = True) -> int:
    """Returns the size of the alphabet of the given text, i.e., the number of unique characters that occur in it.

    Args:
        - `text`: The string to determine the alphabet of.
        - `count_uppercase`: When `False`, the text is lowercased first, so uppercase characters are not counted separately from their lowercase counterparts.
        - `count_punctuation`: When `False`, the characters in `string.punctuation` are left out of the alphabet.
        - `count_digits`: When `False`, the characters in `string.digits` are left out of the alphabet.

    Returns:
        The number of unique characters in the `text` parameter that remain after applying the options above.
    """
    source_text = text if count_uppercase else text.lower()
    alphabet = set(source_text)

    if not count_punctuation:
        alphabet = alphabet.difference(string.punctuation)

    if not count_digits:
        alphabet = alphabet.difference(string.digits)

    return len(alphabet)

def contraction_count(text: str, include_genetive_count: bool = False) -> int:
    """Returns the number of contractions that the `contractions` package finds in the given string.
    Optionally, occurrences of the genitive are added to that number.

    Args:
        - `text`: The string to count the contractions of.
        - `include_genetive_count`: When `True`, every token that NLTK's part-of-speech tagger labels as "POS" is counted as well.

    Returns:
        The number of matches reported by `contractions.preview`, plus the number of "POS" tokens if `include_genetive_count` is set.
    """
    total = len(contractions.preview(text, 1))

    if not include_genetive_count:
        return total

    tagged_tokens = nltk.tag.pos_tag(nltk.tokenize.word_tokenize(text))
    genitive_markers = sum(1 for _, tag in tagged_tokens if tag == "POS")

    return total + genitive_markers

def word_without_vowels_count(text: str, include_y_as_vowel: bool = False) -> int:
    """Returns the count of words in the input string that do not contain vowels.
    The text is lowercased and split into words using NLTK's `word_tokenize` function.
    Tokens that consist solely of punctuation and/or digits (e.g., "!", "7" or "10/10") are not considered to be words.

    Args:
        - `text`: The string for which to count the number of words without vowels.
        - `include_y_as_vowel`: A boolean used to determine if the character "y" should be counted as a vowel.

    Returns:
        The number of times a word without vowels occurs in the input string.
    """
    vowels = set("aeiouy") if include_y_as_vowel else set("aeiou")

    # The previous filter `word not in (punctuation or digits)` always evaluated to the
    # punctuation list alone, so digits were never excluded. Both character classes are
    # merged into a single set here, and every character of a token is checked.
    non_word_characters = frozenset(string.punctuation + string.digits)
    words_without_vowels = 0

    for token in nltk.word_tokenize(text.lower()):
        if all(character in non_word_characters for character in token):
            continue

        if vowels.isdisjoint(token):
            words_without_vowels += 1

    return words_without_vowels

def hapax_legomenon_count(text: str) -> int:
    """Returns the count of hapax legomena in the input text.
    A hapax legomenon is a word that occurs only once in a corpus.
    Note that for the purposes of this function, the corpus is considered to be the input text.
    The text is lowercased and tokenized with NLTK's `word_tokenize` function; every token it returns is treated as a word, so punctuation tokens that occur once are counted too.

    Args:
        - `text`: The string for which to count the number of hapax legomena.

    Returns:
        The number of hapax legomena that were found in the input text.
    """
    from collections import Counter

    # Tally all tokens in a single pass instead of calling `list.count` for every
    # token, which made the function quadratic in the number of tokens.
    token_frequencies = Counter(nltk.word_tokenize(text.lower()))

    return sum(1 for frequency in token_frequencies.values() if frequency == 1)

### 2.2 Text complexity features

Text complexity features aim to describe the complexity of the given text. This may be at the lexical or the syntactical level.

In [None]:
def mean_word_length(text: str) -> float:
    """Placeholder for the mean word length feature; not implemented yet.

    Args:
        - `text`: The string to compute the feature for.

    Returns:
        Nothing at the moment. NOTE(review): presumably the average length of the words in `text` - confirm once implemented.

    Raises:
        NotImplementedError: Always, since the function is still a stub.
    """
    raise NotImplementedError()

def mean_sentence_length(text: str) -> float:
    """Placeholder for the mean sentence length feature; not implemented yet.

    Args:
        - `text`: The string to compute the feature for.

    Returns:
        Nothing at the moment. NOTE(review): presumably the average length of the sentences in `text`; the unit (words or characters) is still to be decided.

    Raises:
        NotImplementedError: Always, since the function is still a stub.
    """
    raise NotImplementedError()

def word_length_standard_deviation(text: str) -> float:
    """Placeholder for the word length standard deviation feature; not implemented yet.

    Args:
        - `text`: The string to compute the feature for.

    Returns:
        Nothing at the moment. NOTE(review): presumably the standard deviation of the word lengths in `text` - confirm once implemented.

    Raises:
        NotImplementedError: Always, since the function is still a stub.
    """
    raise NotImplementedError()

def sentence_length_standard_deviation(text: str) -> float:
    """Placeholder for the sentence length standard deviation feature; not implemented yet.

    Args:
        - `text`: The string to compute the feature for.

    Returns:
        Nothing at the moment. NOTE(review): presumably the standard deviation of the sentence lengths in `text` - confirm once implemented.

    Raises:
        NotImplementedError: Always, since the function is still a stub.
    """
    raise NotImplementedError()

def mean_word_frequency(text: str) -> float:
    """Placeholder for the mean word frequency feature; not implemented yet.

    Args:
        - `text`: The string to compute the feature for.

    Returns:
        Nothing at the moment. NOTE(review): presumably the average number of occurrences per distinct word in `text` - confirm once implemented.

    Raises:
        NotImplementedError: Always, since the function is still a stub.
    """
    raise NotImplementedError()

def lexical_diversity_coefficient(text: str) -> float:
    """Placeholder for the lexical diversity coefficient feature; not implemented yet.

    Args:
        - `text`: The string to compute the feature for.

    Returns:
        Nothing at the moment. NOTE(review): the exact definition of the coefficient is not visible here - document it once implemented.

    Raises:
        NotImplementedError: Always, since the function is still a stub.
    """
    raise NotImplementedError()

def syntactic_complexity_coefficient(text: str) -> float:
    """Placeholder for the syntactic complexity coefficient feature; not implemented yet.

    Args:
        - `text`: The string to compute the feature for.

    Returns:
        Nothing at the moment. NOTE(review): the exact definition of the coefficient is not visible here - document it once implemented.

    Raises:
        NotImplementedError: Always, since the function is still a stub.
    """
    raise NotImplementedError()

def herdans_log_type_token_richness(text: str) -> float:
    """Placeholder for the Herdan's log type-token richness feature; not implemented yet.

    Args:
        - `text`: The string to compute the feature for.

    Returns:
        Nothing at the moment. NOTE(review): presumably Herdan's C, i.e., log(number of types) / log(number of tokens) - confirm once implemented.

    Raises:
        NotImplementedError: Always, since the function is still a stub.
    """
    raise NotImplementedError()

### 2.3 Part-of-speech features

These features describe the ratios of words that belong to a specific part-of-speech.

In [None]:
def _extract_part_of_speech_tags(text: str) -> int:
    """Placeholder for the private helper behind the part-of-speech features; not implemented yet.

    NOTE(review): the `int` return annotation looks like a copy-paste leftover - a helper with this name presumably returns a collection of tags. Confirm and adjust once implemented.

    Args:
        - `text`: The string to extract the part-of-speech tags of.

    Raises:
        NotImplementedError: Always, since the function is still a stub.
    """
    raise NotImplementedError()