# Visualization notebook

This notebook is for exploratory visualization and for testing code.


## Imports and functions

In [31]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import warnings
import json
import pandas as pd
from dataclasses import dataclass
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
from gram2vec.featurizers import GrammarVectorizer
warnings.filterwarnings("ignore", category=FutureWarning)

# PAN 2022 Summary Stats

In [49]:
@dataclass
class Author:
    """
    Stores author information in an easy to work with format

    :param author_id: unique author id
    :param fixed_texts: list of author documents with regex fixes
    :param raw_texts: list of author documents without regex fixes
    :param discourse_types: list of discourse types

    Note: fixed_texts, raw_texts, and discourse_types are all 1 - 1 corresponding
    """
    author_id:str
    fixed_texts:list[str]
    raw_texts:list[str]
    discourse_types:list[str]

    def get_token_counts(self) -> list[int]:
        """Returns the number of word_tokenize tokens of each fixed text, in document order"""
        return [len(word_tokenize(author_doc)) for author_doc in self.fixed_texts]

    def get_total_docs(self) -> int:
        """Returns the number of documents, measured as the length of fixed_texts"""
        return len(self.fixed_texts)

    def counted_discourse_types(self) -> Counter:
        """Returns a Counter mapping each discourse type to how often it occurs"""
        return Counter(self.discourse_types)

    # Alias under the historical (misspelled) name so existing callers keep working
    counted_dicourse_types = counted_discourse_types
        
def load_preproccessed_json(path:str) -> dict[str, list[dict]]:
    """
    Loads the preprocessed JSON file at `path` and returns its parsed contents

    :param path: path to the JSON file
    :returns: whatever json.load produces for the file (annotated as a mapping to lists of dicts)
    """
    # JSON is UTF-8 by specification, so decode explicitly instead of relying on
    # the platform's default encoding
    with open(path, "r", encoding="utf-8") as fin:
        return json.load(fin)

def extract_from_dict(author_entry:list[dict], to_extract:str) -> list[str]:
    """
    Collects the value stored under `to_extract` from every entry of an author

    :param author_entry: the author's entries; iterated in order, each one indexed by key
    :param to_extract: key to read from each entry
    :returns: the extracted values, in the same order as the entries
    :raises KeyError: if an entry does not contain `to_extract`
    """
    return [entry[to_extract] for entry in author_entry]
    
def create_author_list(preprocessed_data:dict[str, list[dict]]) -> list[Author]:
    """
    Builds one Author object per key of the loaded preprocessed_data.json mapping,
    in the mapping's iteration order
    """
    return [
        Author(
            author_id,
            extract_from_dict(doc_entries, "fixed_text"),
            extract_from_dict(doc_entries, "raw_text"),
            extract_from_dict(doc_entries, "discourse_type"),
        )
        for author_id, doc_entries in preprocessed_data.items()
    ]

def get_total_doc_count(authors:list[Author]) -> int:
    """Adds up the document counts of all authors"""
    running_total = 0
    for author in authors:
        running_total += author.get_total_docs()
    return running_total

def get_total_author_count(authors:list[Author]) -> int:
    """Counts how many authors are in the list"""
    author_count = len(authors)
    return author_count

def get_doc_token_stats(authors:list[Author]) -> tuple[float, float]:
    """Returns (mean, std) of the token counts taken over every document of every author"""
    # Flatten the per-author token counts into one list with an entry per document
    token_counts = [
        doc_token_count
        for author in authors
        for doc_token_count in author.get_token_counts()
    ]
    mean_tokens = np.mean(token_counts)
    std_tokens = np.std(token_counts)
    return mean_tokens, std_tokens
    
def get_author_token_stats(authors:list[Author]) -> tuple[float, float]:
    """
    Gets the mean and std of tokens per author

    An author's token total is the sum of the token counts of all of its documents.
    """
    # Keyed by author_id, so authors sharing an id collapse into one entry (the last one wins)
    author_to_token_total = {
        author.author_id: sum(author.get_token_counts()) for author in authors
    }
    token_totals = list(author_to_token_total.values())
    return np.mean(token_totals), np.std(token_totals)
    
def get_author_doc_stats(authors:list[Author]) -> tuple[float, float, int, int]:
    """Returns (mean, std, min, max) of the per-author document counts"""
    docs_per_author = []
    for author in authors:
        docs_per_author.append(author.get_total_docs())

    mean_docs = np.mean(docs_per_author)
    std_docs = np.std(docs_per_author)
    fewest_docs = min(docs_per_author)
    most_docs = max(docs_per_author)
    return mean_docs, std_docs, fewest_docs, most_docs
    
#TODO: rewrite discourse type counting code

In [62]:

# Load the preprocessed data and wrap each author's entries in an Author object
data = load_preproccessed_json("data/pan22/preprocessed/preprocessed_data.json")
all_authors = create_author_list(data)

# Corpus-wide totals
total_doc_count = get_total_doc_count(all_authors)
total_author_count = get_total_author_count(all_authors)

# Token and document distribution statistics
avg_tokens_per_doc, std_tokens_per_doc = get_doc_token_stats(all_authors)
avg_tokens_per_author, std_tokens_per_author = get_author_token_stats(all_authors)
author_avg, author_std, author_min, author_max = get_author_doc_stats(all_authors)

# Render the whole report first, then write it to stats.txt in a single call
stats_report = f"""
Total docs: {total_doc_count}
Total authors: {total_author_count}
Mean tokens per doc: {avg_tokens_per_doc:.2f}
Std tokens per doc: {std_tokens_per_doc:.2f}
Mean tokens per author: {avg_tokens_per_author:.2f}
Std tokens per author: {std_tokens_per_author:.2f}
Mean # of docs: {author_avg:.2f}
Std # of docs: {author_std:.2f}
Min # of docs: {author_min}
Max # of docs: {author_max}"""

with open("stats.txt", "w") as fout:
    fout.write(stats_report)


