# Visualization notebook

This notebook is a scratchpad for visualizing experiment results, computing dataset summary statistics, and testing code.


## Imports and functions

In [1]:
import utils
import plotly.express as px
import numpy as np
import pandas as pd
import warnings
import spacy
import json
warnings.filterwarnings("ignore", category=FutureWarning)


# MUD: 1,071,477 authors

### Plotting by bin

In [None]:
# x-axis categories: the bin labels "1" through "8", kept as strings
bin_nums = [str(bin_index) for bin_index in range(1, 9)]

# Not populated or read anywhere in the visible cells.
dev_bin_avg_docs = []

# Accuracies recorded by hand, one value per bin, in bin order.
HALF_FEATS_ACCS = [
    0.0,
    0.02857142857142857,
    0.08571428571428572,
    0.11428571428571428,
    0.22857142857142856,
    0.22857142857142856,
    0.22857142857142856,
    0.4,
]
ALL_FEATS_ACCS = [
    0.0,
    0.05714285714285714,
    0.11428571428571428,
    0.17142857142857143,
    0.2,
    0.2,
    0.2857142857142857,
    0.5428571428571428,
]

df = pd.DataFrame(
    {
        "half_features": HALF_FEATS_ACCS,
        "all_features": ALL_FEATS_ACCS,
        "bin": bin_nums,
    }
)

# The first two columns hold the accuracy series; each one is drawn as its own line.
fig = px.line(
    df,
    x="bin",
    y=df.columns[0:2],
    template="plotly_dark",
    markers=True,
)
# Kept as the cell's last expression so the notebook renders the result.
fig.update_layout(title_text="Accuracy per bin", title_x=0.5)



# Summary stats

- Mean/std \# of tokens per document
- Mean/std \# of tokens per author
- Mean/std \# of documents per author

In [2]:
# Load the preprocessed corpus. The functions below treat it as a mapping of
# author id -> list of document strings (they iterate its keys and call .split() on each document).
data = utils.load_json("data/pan/preprocessed/fixed_sorted_author.json")

def get_document_token_counts(data:dict[str, list]):
    """
    Counts the whitespace-separated tokens of every document.

    Returns one count per document, ordered by author and then by
    document position within that author's list.
    """
    return [len(doc.split()) for docs in data.values() for doc in docs]

doc_token_counts = get_document_token_counts(data)
# A single print with a newline separator writes the same three lines
# (header, mean, std) as three separate print calls.
print(
    "Mean/std tokens per document",
    np.mean(doc_token_counts),
    np.std(doc_token_counts),
    sep="\n",
)


def get_author_token_counts(data:dict[str, list[str]]) -> list[int]:
    """
    Totals the whitespace-separated tokens across each author's documents.

    Returns one total per author, in the iteration order of `data`.
    An author with no documents contributes 0.
    """
    return [
        sum(len(doc.split()) for doc in docs)
        for docs in data.values()
    ]


author_token_counts = get_author_token_counts(data)

# The header's leading "\n" leaves a blank line before this block of output;
# the newline separator then puts mean and std on their own lines.
print(
    "\nMean/std tokens per author",
    np.mean(author_token_counts),
    np.std(author_token_counts),
    sep="\n",
)


def get_num_docs_per_author(data:dict[str, list[str]]) -> list[int]:
    """
    Counts how many documents each author has.

    Returns one count per author, in the iteration order of `data`.
    """
    return [len(docs) for docs in data.values()]

docs_per_author = get_num_docs_per_author(data)

# Header (preceded by a blank line), then mean and std, each on its own line.
print(
    "\nMean/std document frequency per author",
    np.mean(docs_per_author),
    np.std(docs_per_author),
    sep="\n",
)


Mean/std tokens per document
247.99808978032473
220.32835614956986

Mean/std tokens per author
4636.678571428572
1772.7924311172276

Mean/std document frequency per author
18.696428571428573
6.114735255503541


# The following cells are for generating my potential CSE 564 dataset

In [3]:
from gram2vec.featurizers import GrammarVectorizer


def vectorize_all_data(data:dict, g2v:"GrammarVectorizer") -> np.ndarray:
    """
    Vectorizes every document in a dict of author id -> documents.

    Each text is passed to g2v.vectorize(text, return_vector=False) and the
    results are collected in author-then-document order.

    Returns np.stack(...) of the results when they can be stacked. If stacking
    fails (e.g. there are no documents, or the results have mismatched shapes),
    the plain Python list of results is returned instead.
    """
    vectors = [
        g2v.vectorize(text, return_vector=False)
        for texts in data.values()
        for text in texts
    ]
    try:
        return np.stack(vectors)
    except Exception:
        # Best-effort fallback. Catching Exception (rather than a bare except)
        # lets KeyboardInterrupt / SystemExit propagate instead of being swallowed.
        return vectors

def get_authors(data:dict) -> list[str]:
    """
    Gets one author label per document.

    Each key of `data` is repeated once for every document in its list, in
    author-then-document order, so the result lines up index-for-index with
    anything else built by walking `data` the same way.
    Ids are returned exactly as stored in the keys (annotated as str by the
    other helpers in this notebook).
    """
    return [author_id for author_id, docs in data.items() for _ in docs]


# Same preprocessed file as the one loaded for the summary statistics; reloaded here.
data = utils.load_json("data/pan/preprocessed/fixed_sorted_author.json")
g2v = GrammarVectorizer()
# Both helpers walk `data` author by author, then document by document,
# so entry i of `authors` and entry i of `feature_vectors` refer to the same document.
authors = get_authors(data)
feature_vectors = vectorize_all_data(data, g2v)

GrammarVectorizer: Old logs detected. Clearing...
GrammarVectorizer: Done


In [31]:
import csv 
from collections import defaultdict, Counter
from more_itertools import all_unique

def get_vocab(path) -> list[str]:
    """
    Reads a vocabulary file into a list with one entry per line.

    Leading/trailing whitespace of the whole file is stripped before splitting
    on newlines. The file is decoded as UTF-8 explicitly so that non-ASCII
    entries (e.g. the emoji vocabulary) load the same way on every platform
    instead of depending on the locale's default encoding.
    """
    with open(path, "r", encoding="utf-8") as fin:
        return fin.read().strip().split("\n")

# Feature vocabularies, one list of labels per feature type.
# All are read from vocab files except doc_stats, which is hard-coded here.
pos_unigrams  = get_vocab("vocab/static/pos_unigrams.txt")
pos_bigrams   = get_vocab("vocab/non_static/pos_bigrams/pan/pos_bigrams.txt")
func_words    = get_vocab("vocab/static/function_words.txt")
punc          = get_vocab("vocab/static/punc_marks.txt")
letters       = get_vocab("vocab/static/letters.txt")
common_emojis = get_vocab("vocab/static/common_emojis.txt")
doc_stats     = ["short_words", "large_words", "word_len_avg", "word_len_std", "sent_len_avg", "sent_len_std", "hapaxes"]
deps          = get_vocab("vocab/static/dep_labels.txt")
mixed_bigrams = get_vocab("vocab/non_static/mixed_bigrams/pan/mixed_bigrams.txt")


#TODO: include discourse type as a categorical variable (do tomorrow 02/20)
# The concatenation order matters: the renaming helpers below tell apart duplicate
# labels ("i", "a", "X") by which occurrence comes first in this list.
# NOTE(review): presumably this order also matches the order of the vectorizer's count_map -- confirm.
all_features = pos_unigrams + pos_bigrams + func_words + punc + letters + common_emojis + doc_stats + deps + mixed_bigrams

#NOTE: the renaming helpers below are needed so that every feature (column) in the data viz dataset has a unique name


def convert_feature_name(feature:str, seen_i, seen_a, seen_X) -> str:
    """
    Gives the ambiguous labels "i", "a" and "X" a unique, suffixed name.

    Each of these labels has two variants: one used while its `seen_*` flag
    is still falsy (first occurrence) and one used once the flag is truthy.
    Any other label is returned unchanged.
    """
    # (label, already seen?, name for first occurrence, name for later occurrence)
    renames = (
        ("i", seen_i, "i (func_word)", "i (letter)"),
        ("a", seen_a, "a (func_word)", "a (letter)"),
        ("X", seen_X, "X (pos_unigram)", "X (letter)"),
    )
    for label, already_seen, first_name, later_name in renames:
        if feature == label:
            return later_name if already_seen else first_name
    return feature
        

def make_feature_to_counts_map(all_features:list[str]) -> dict[str,list]:
    """
    Builds a dict mapping every feature label to a fresh empty list.

    A few DIFFERENT features share the SAME label ("i", "a", "X"). So that each
    can get its own entry, they are renamed through convert_feature_name: the
    first occurrence in `all_features` gets one name and later occurrences get
    another. This only affects labels used for data visualization; the vectors
    themselves are untouched.

    The renaming therefore depends on the order in which `all_features` was
    concatenated.
    """
    # Whether each ambiguous label has already been encountered.
    seen = {"i": False, "a": False, "X": False}
    feature_to_counts = {}
    for name in all_features:
        for label in seen:
            # Once renamed, `name` no longer equals any ambiguous label,
            # so at most one rename happens per feature.
            if name == label:
                name = convert_feature_name(name, seen["i"], seen["a"], seen["X"])
                seen[label] = True
        feature_to_counts[name] = []
    return feature_to_counts

            
def populate_feature_to_counts_map(all_features:list[str], feature_vectors:list) -> dict[str,list[int]]:
    """
    Fills the feature -> counts dict with one count per feature per feature vector.

    For every feature vector, each (feature name, count) pair found in the dicts
    of its `count_map` is appended to the list stored under that feature's name.
    DIFFERENT features with the SAME label ("i", "a", "X") are renamed through
    convert_feature_name, exactly as in make_feature_to_counts_map, so their
    counts land in separate lists.
    """
    feats_to_counts = make_feature_to_counts_map(all_features)

    for vector in feature_vectors:
        # Fresh flags per feature vector: the first/later distinction for the
        # ambiguous labels spans all count dicts of ONE vector, then starts over.
        seen = {"i": False, "a": False, "X": False}
        for counts in vector.count_map.values():
            for feat_name, count in counts.items():
                for label in seen:
                    if feat_name == label:
                        feat_name = convert_feature_name(feat_name, seen["i"], seen["a"], seen["X"])
                        seen[label] = True
                feats_to_counts[str(feat_name)].append(count)

    return feats_to_counts

# Map each (uniquely named) feature to its list of counts, one entry per feature vector.
features_to_count_lists = populate_feature_to_counts_map(all_features, feature_vectors)

In [42]:

#! REMEMBER TO ADD DISCOURSE TYPES
# One column per feature; presumably one row per document, since each count list
# gets one entry per feature vector -- confirm.
# NOTE(review): `df` is also the name of the accuracy-per-bin frame in the plotting cell;
# running this cell rebinds it.
df = pd.DataFrame(features_to_count_lists)
# Put the author label first; `authors` must have exactly one entry per row.
df.insert(0, "author_id", authors)




In [62]:

from more_itertools import collapse

def get_data(path) -> list[dict]:
    """
    Reads a file containing one JSON object per line into a list.

    The file is opened in a `with` block so the handle is always closed,
    decoded as UTF-8, and blank lines are skipped instead of being passed
    to the JSON parser.
    """
    with open(path, "r", encoding="utf-8") as fin:
        return [json.loads(line) for line in fin if line.strip()]

def load_raw_data(pairs_path:str, truths_path:str) -> tuple[list[tuple], list[tuple]]:
    """
    Loads the raw pairs file and extracts the document and discourse-type pairs.

    :param pairs_path: JSON-lines file whose entries have "pair" and "discourse_types" keys
    :param truths_path: accepted for backward compatibility but not read. Author-id
                        pairs (entry["authors"]) used to be extracted from this file,
                        but nothing from it is returned, so parsing it was wasted work.
    :returns: (doc_pairs, discourse_pairs), two lists of tuples aligned entry by entry
    """
    pairs = get_data(pairs_path)
    doc_pairs = [tuple(entry["pair"]) for entry in pairs]
    discourse_pairs = [tuple(entry["discourse_types"]) for entry in pairs]

    return doc_pairs, discourse_pairs

def get_document_discourse_types(doc_pairs:list[tuple], discourse_pairs:list[tuple]) -> list[str]:
    """
    Gets the discourse type of every unique document.

    The same document can appear in several pairings; only its first occurrence
    contributes a discourse type, so the result has one entry per distinct
    document, in order of first appearance.

    Documents are tracked in a set, so they must be hashable (e.g. strings).
    """
    discourse_types = []
    # A set gives O(1) membership checks; a list would rescan every
    # previously seen document for each new one.
    seen_docs = set()
    for text_pair, discourse_pair in zip(doc_pairs, discourse_pairs):
        for document, discourse in zip(text_pair, discourse_pair):
            if document in seen_docs:
                continue
            seen_docs.add(document)
            discourse_types.append(discourse)
    return discourse_types
        


# Load the raw pairs and collect one discourse type per unique document.
doc_pairs, discourse_pairs = load_raw_data("data/pan/raw/pairs.jsonl", "data/pan/raw/truth.jsonl")
discourse_types = get_document_discourse_types(doc_pairs, discourse_pairs)

# Sanity check: the Counter's values sum to the number of collected discourse types,
# i.e. the number of unique documents seen in the pairs.
# NOTE(review): 1047 is presumably the total document count of the preprocessed corpus -- confirm.
assert sum(Counter(discourse_types).values()) == 1047
