# Visualization notebook

This notebook is a scratchpad for visualizing data and results and for testing code.


## Imports and functions

In [1]:
import json
import os
import warnings
from collections.abc import Iterator
from dataclasses import dataclass
from pathlib import Path

import jsonlines
import numpy as np
import pandas as pd
import plotly.express as px
from gram2vec.featurizers import GrammarVectorizer
warnings.filterwarnings("ignore", category=FutureWarning)


# MUD: 1,071,477 authors

### Plotting by bin

In [4]:
# Bin labels "1" .. "8", kept as strings so the x axis is treated as categorical text.
bin_nums = [str(bin_id) for bin_id in range(1, 9)]

dev_bin_avg_docs = []

# manually recorded accuracies, one value per bin
HALF_FEATS_ACCS = [0.0, 0.02857142857142857, 0.08571428571428572, 0.11428571428571428, 0.22857142857142856, 0.22857142857142856, 0.22857142857142856, 0.4]
ALL_FEATS_ACCS = [0.0, 0.05714285714285714, 0.11428571428571428, 0.17142857142857143, 0.2, 0.2, 0.2857142857142857, 0.5428571428571428]

# Wide-form frame: the first two columns are the series to plot, the third is the x axis.
df = pd.DataFrame(
    {
        "half_features": HALF_FEATS_ACCS,
        "all_features": ALL_FEATS_ACCS,
        "bin": bin_nums,
    }
)

accuracy_columns = list(df.columns[:2])
fig = px.line(df, x="bin", y=accuracy_columns, template="plotly_dark", markers=True)
# Last expression of the cell: the returned figure is what the notebook renders.
fig.update_layout(title_text="Accuracy per bin", title_x=0.5)



# Summary stats

- Mean/std \# of tokens per document
- Mean/std \# of tokens per author
- Mean/std \# of documents per author

CODE BELOW IS OUTDATED

In [5]:
def load_json(path):
    """
    Reads the JSON file at `path` and returns the decoded object.

    The file is opened as UTF-8 explicitly so that non-ASCII text is decoded the
    same way regardless of the platform's default encoding.

    :param path: location of the JSON file to read
    :returns: whatever `json.load` decodes from the file (dict, list, ...)
    :raises OSError: if the file cannot be opened
    :raises json.JSONDecodeError: if the contents are not valid JSON
    """
    with open(path, "r", encoding="utf-8") as fin:
        return json.load(fin)

# Load the preprocessed PAN author file; the path is relative to the current working directory.
# NOTE(review): the functions below index this as {author_id: [document text, ...]} - confirm the schema.
data = load_json("data/pan/preprocessed/fixed_sorted_author.json")

def get_document_token_counts(data:dict[str, list]):
    """
    Returns one whitespace-token count per document.

    Authors are visited in dict order and each author's documents in list order,
    so the result is a flat list covering every document of every author.
    """
    return [len(text.split()) for texts in data.values() for text in texts]

doc_token_counts = get_document_token_counts(data)
print("Mean/std tokens per document")
# Mean first, then standard deviation, each on its own line.
for summarize in (np.mean, np.std):
    print(summarize(doc_token_counts))


def get_author_token_counts(data:dict[str, list[str]]) -> list[int]:
    """
    Returns, for each author, the total number of whitespace tokens across all of
    that author's documents. Entries follow the dict order of `data`; an author
    with no documents contributes 0.
    """
    return [sum(len(text.split()) for text in texts) for texts in data.values()]


author_token_counts = get_author_token_counts(data)

print("\nMean/std tokens per author")
# Mean first, then standard deviation, each on its own line.
for summarize in (np.mean, np.std):
    print(summarize(author_token_counts))


def get_num_docs_per_author(data:dict[str, list[str]]) -> list[int]:
    """
    Returns the number of documents each author has, in the dict order of `data`.
    """
    return [len(texts) for texts in data.values()]

docs_per_author = get_num_docs_per_author(data)

print("\nMean/std document frequency per author")
# Mean first, then standard deviation, each on its own line.
for summarize in (np.mean, np.std):
    print(summarize(docs_per_author))


Mean/std tokens per document
247.99808978032473
220.32835614956986

Mean/std tokens per author
4636.678571428572
1772.7924311172276

Mean/std document frequency per author
18.696428571428573
6.114735255503541


# The following cells are for generating my CSE 564 dataset

In [64]:
@dataclass
class AuthorEntry:
    """One record read from an author's .jsonl file, reduced to the fields used here."""

    # Value of the record's "author_id" key.
    author_id: str
    # Value of the record's "discourse_type" key.
    discourse_type: str
    # Value of the record's "fixed_text" key; this is the text that gets vectorized.
    fixed_text: str

def iter_author_jsonls(author_files_dir:str) -> Iterator[Path]:
    """
    Yields the path of each {author_id}.jsonl file directly inside a given dir.

    Paths are yielded in sorted order. Glob results otherwise come back in
    whatever order the filesystem lists them, which would make the row order of
    anything built from these files differ between machines and runs.

    :param author_files_dir: directory containing the per-author .jsonl files
    :returns: generator of `Path` objects, one per matching file
    """
    yield from sorted(Path(author_files_dir).glob("*.jsonl"))
        
def iter_author_entries(author_file):
    """
    Yields each JSON object stored in an {author_id}.jsonl file.

    The file stays open only while the generator is being consumed.
    """
    with jsonlines.open(author_file) as reader:
        yield from reader
            
def get_all_entries(data_dir:str) -> list[AuthorEntry]:
    """
    Collects every record from every author file in `data_dir` into one flat list.

    Each record is reduced to its "author_id", "discourse_type" and "fixed_text"
    keys and wrapped in an AuthorEntry; a record missing any of them raises KeyError.
    """
    return [
        AuthorEntry(record["author_id"], record["discourse_type"], record["fixed_text"])
        for jsonl_path in iter_author_jsonls(data_dir)
        for record in iter_author_entries(jsonl_path)
    ]

# Load every author entry from the preprocessed PAN22 directory.
# The path is relative to the current working directory.
data_dir = "data/pan22/preprocessed/"
all_entries = get_all_entries(data_dir)

# Parallel lists: position k in each list comes from the same AuthorEntry.
documents = [entry.fixed_text for entry in all_entries]
authors = [entry.author_id for entry in all_entries]
discourse_types = [entry.discourse_type for entry in all_entries]

g2v = GrammarVectorizer()

# A later cell iterates over this result and reads a `count_map` attribute from each element.
# NOTE(review): presumably one vector per document, in the same order as `documents` - confirm against gram2vec.
feature_vectors = g2v.vectorize_episode(documents)




In [8]:
def get_vocab(path) -> list[str]:
    """
    Reads a vocabulary file and returns its entries, one per line.

    The file is decoded as UTF-8 explicitly: some vocabularies (e.g. emojis) are
    non-ASCII and would otherwise depend on the platform's default encoding.
    Leading/trailing whitespace around the whole file is dropped before splitting.
    A file with no content yields an empty list rather than a single empty-string entry.

    :param path: location of the vocabulary file
    :returns: the vocabulary entries in file order
    :raises OSError: if the file cannot be opened
    """
    with open(path, "r", encoding="utf-8") as fin:
        contents = fin.read().strip()
    return contents.split("\n") if contents else []

# Load each feature vocabulary from its file (paths are relative to the working directory).
pos_unigrams  = get_vocab("vocab/static/pos_unigrams.txt")
pos_bigrams   = get_vocab("vocab/non_static/pos_bigrams/pan/pos_bigrams.txt")
func_words    = get_vocab("vocab/static/function_words.txt")
punc          = get_vocab("vocab/static/punc_marks.txt")
letters       = get_vocab("vocab/static/letters.txt")
common_emojis = get_vocab("vocab/static/common_emojis.txt")
# The document-statistic feature names are hard-coded here rather than read from a file.
doc_stats     = ["short_words", "large_words", "word_len_avg", "word_len_std", "sent_len_avg", "sent_len_std", "hapaxes"]
deps          = get_vocab("vocab/static/dep_labels.txt")
mixed_bigrams = get_vocab("vocab/non_static/mixed_bigrams/pan/mixed_bigrams.txt")

# Concatenation order matters: make_feature_to_counts_map relies on it to tell apart
# distinct features that share a label ("i", "a", "X").
all_features = pos_unigrams + pos_bigrams + func_words + punc + letters + common_emojis + doc_stats + deps + mixed_bigrams

def convert_feature_name(feature:str, seen_i, seen_a, seen_X) -> str:
    """
    Gives the three ambiguous labels ("i", "a", "X") a unique, suffixed name.

    Each of these labels is used by two different features, so the name depends on
    whether the label has been encountered before (the matching `seen_*` flag).
    Any other feature name is returned unchanged.
    """
    # label -> (name on first sighting, name on later sightings, seen before?)
    ambiguous = {
        "i": ("i (func_word)", "i (letter)", seen_i),
        "a": ("a (func_word)", "a (letter)", seen_a),
        "X": ("X (pos_unigram)", "X (letter)", seen_X),
    }
    if feature not in ambiguous:
        return feature

    first_name, later_name, was_seen = ambiguous[feature]
    return later_name if was_seen else first_name
        

def make_feature_to_counts_map(all_features:list[str]) -> dict[str,list]:
    """
    Builds a dict with one empty list per feature name, in the order of all_features.

    Three labels ("i", "a", "X") each name two distinct features, so for data
    visualization purposes they are renamed to be unique: the first occurrence gets
    one suffix and any later occurrence another (see convert_feature_name). Which
    suffix is correct therefore depends on the concatenation order of all_features.
    This DOES NOT affect the vectors in any way.
    """
    already_seen = {"i": False, "a": False, "X": False}
    feature_to_counts = {}
    for name in all_features:
        if name in already_seen:
            label = name
            name = convert_feature_name(name, already_seen["i"], already_seen["a"], already_seen["X"])
            # Flip the flag only after converting, so the first occurrence sees False.
            already_seen[label] = True
        feature_to_counts[name] = []
    return feature_to_counts

            
def populate_feature_to_counts_map(all_features:list[str], feature_vectors:list) -> dict[str,list[int]]:
    """
    Fills the feature -> counts dict with the counts found in every feature vector.

    For each vector, every (feature name, count) pair in each of the dicts held in
    its `count_map` is appended to that feature's list. The ambiguous labels
    ("i", "a", "X") are renamed exactly as in make_feature_to_counts_map so the
    counts land under the matching unique key.

    A feature name that is not a key of the prepared dict raises KeyError.
    """
    feats_to_counts = make_feature_to_counts_map(all_features)

    for vector in feature_vectors:
        # Fresh flags per feature vector (not per count dict): the two features sharing
        # a label live in different count dicts of the same vector.
        already_seen = {"i": False, "a": False, "X": False}
        for counts in vector.count_map.values():
            for feat_name, count in counts.items():
                if feat_name in already_seen:
                    label = feat_name
                    feat_name = convert_feature_name(feat_name, already_seen["i"], already_seen["a"], already_seen["X"])
                    already_seen[label] = True

                feats_to_counts[str(feat_name)].append(count)

    return feats_to_counts

# Collect, for every (uniquely named) feature, the list of counts gathered across all feature vectors.
features_to_count_lists = populate_feature_to_counts_map(all_features, feature_vectors)

In [39]:
# Assemble the feature table: one column per feature, then put the author and
# discourse-type columns in front (positions 0 and 1).
# Note: this rebinds `df`, the same name the accuracy-per-bin cell uses for its frame.
df = pd.DataFrame(features_to_count_lists)
df.insert(0, "author_id", authors)
df.insert(1, "discourse_type", discourse_types)

Unnamed: 0,author_id,discourse_type,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,...,"('was', 'VERB')","('NOUN', 'is')","('INTJ', 'I')","('VERB', 'your')","('can', 'VERB')","('NOUN', 'was')","('NOUN', 'have')","('an', 'NOUN')","('ADV', 'the')","('n’t', 'VERB')"
0,en_58,email,6,19,13,35,9,19,10,43,...,1,1,1,0,0,1,0,1,1,0
1,en_58,text_message,6,6,6,5,2,2,1,17,...,0,0,0,0,0,0,0,0,0,0
2,en_58,memo,8,12,2,15,0,11,1,27,...,0,2,0,1,0,0,1,0,0,0
3,en_58,text_message,7,6,6,5,1,6,0,20,...,0,0,0,0,1,0,0,0,0,0
4,en_58,text_message,10,9,3,8,4,7,1,22,...,0,0,0,0,1,0,0,0,0,0


In [40]:
# Write the feature table into the CSE 564 project data directory when it can be reached
# from the current working directory; otherwise fall back to the current directory.
# Note: os.chdir changes the working directory for the whole kernel, so relative paths
# used by other cells resolve against the new directory after this cell has run.
try:
    os.chdir("../../cse564/project2/data/")
except OSError as err:
    # Deliberately best-effort, but report where the file ends up instead of failing silently.
    print(f"Could not change directory ({err}); writing pan22_features.csv to {os.getcwd()}")

# index=False is the documented way to leave the row index out of the CSV.
df.to_csv("pan22_features.csv", index=False)

# The following cells get statistics about discourse types

Counter({'email': 508, 'text_message': 390, 'memo': 56, 'essay': 93})

In [28]:
# Group document texts by discourse type in one pass, keeping their original order.
docs_by_type = {}
for entry in all_entries:
    docs_by_type.setdefault(entry.discourse_type, []).append(entry.fixed_text)

# A discourse type with no documents yields an empty list.
emails = docs_by_type.get("email", [])
txt_msgs = docs_by_type.get("text_message", [])
essays = docs_by_type.get("essay", [])
memos = docs_by_type.get("memo", [])

def get_token_counts(documents:list[str]) -> list[int]:
    """Returns the number of whitespace-separated tokens in each document, in order."""
    counts = []
    for text in documents:
        counts.append(len(text.split()))
    return counts

# Average whitespace-token count per discourse type, one line each.
for label, docs_of_type in (
    ("Email avg tokens: ", emails),
    ("Txt msgs avg tokens: ", txt_msgs),
    ("Essays avg tokens: ", essays),
    ("Memo avg tokens: ", memos),
):
    print(label, np.mean(get_token_counts(docs_of_type)))

    

Email avg tokens:  331.5807086614173
Txt msgs avg tokens:  109.21794871794872
Essays avg tokens:  412.86021505376345
Memo avg tokens:  182.10714285714286


In [61]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

def encode_labels(labels:list[str]) -> list[int]:
    """
    Maps the class letters "A".."E" to the integer codes 100..500 (steps of 100).

    A label outside "A".."E" raises KeyError.
    """
    label_to_code = {letter: 100 * rank for rank, letter in enumerate("ABCDE", start=1)}
    codes = []
    for label in labels:
        codes.append(label_to_code[label])
    return codes

def decode_labels(label_ints:list[int]) -> list[str]:
    """
    Maps the integer codes 100..500 (steps of 100) back to the class letters "A".."E".

    A code outside that set raises KeyError.
    """
    code_to_label = {100 * rank: letter for rank, letter in enumerate("ABCDE", start=1)}
    letters_out = []
    for code in label_ints:
        letters_out.append(code_to_label[code])
    return letters_out

def get_items_from_indices(labels:list[str], indices:np.ndarray) -> list[str]:
    """
    Returns the labels found at the given positions.

    `indices` is flattened first, so any shape works: a (1, k) neighbour array, a
    single (1, 1) neighbour (which a plain squeeze would turn into a non-iterable
    0-d array), or a (q, k) array for several queries (flattened row by row).

    :param labels: sequence to look the positions up in
    :param indices: integer positions into `labels`
    :returns: one label per index, in flattened order
    :raises IndexError: if an index is out of range for `labels`
    """
    return [labels[i] for i in np.ravel(indices)]


# Toy 1-D training set: the values 0.1, 0.2, ..., 2.0, one sample per row.
train = np.arange(1, 21).reshape(-1, 1) / 10
# One class letter per training row, in row order.
labels = list("AABABCABCDABCDEABCDE")

# Single query point.
test = np.array([[1]])

knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(train, encode_labels(labels))

# For the query point: distances to its 8 nearest training rows and the row indices of those neighbours.
distances, indices = knn.kneighbors(test)

print(distances)
# Last expression of the cell: the labels of the neighbouring rows are displayed as the cell output.
get_items_from_indices(labels, indices)

[[0.  0.1 0.1 0.2 0.2 0.3 0.3 0.4]]


['D', 'C', 'A', 'B', 'B', 'C', 'A', 'D']

array([[ 9,  8, 10,  7, 11, 12,  6, 13]])