# Visualization notebook

This notebook is used for visualizing data and testing code.

In [1]:
import numpy as np
import pandas as pd
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import json
from nltk import Tree
from spacy import displacy
import spacy
from dataclasses import dataclass
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
from featurizers import GrammarVectorizer, make_document
from contextlib import contextmanager
# Suppress every warning whose category is FutureWarning so the notebook output stays readable.
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
@contextmanager
def temp_change_dir(path: str):
    """Run the body of a ``with`` statement inside ``path``.

    The working directory that was active on entry is restored when the
    ``with`` body finishes, whether it completes normally or raises.

    Args:
        path: Directory to switch into for the duration of the ``with`` body.
    """
    starting_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        # Executed on normal exit and on exceptions alike.
        os.chdir(starting_dir)
        
def load_pan22(path: str = "../../data/pan22/preprocessed/pan22_preprocessed.jsonl") -> pd.DataFrame:
    """Load the preprocessed PAN22 data as a dataframe.

    Args:
        path: Location of the JSON-lines file to read. The default is the
            preprocessed PAN22 file, resolved relative to the current
            working directory (two directory levels up, then ``data/pan22``).

    Returns:
        The dataframe produced by ``pd.read_json(path, lines=True)``, i.e.
        one row per JSON line of the file.
    """
    # Read through the relative path directly instead of temporarily changing
    # the working directory: chdir is a process-wide side effect, and it is
    # not needed just to open one file.
    return pd.read_json(path, lines=True)

In [6]:


def create_vector_dataset(
    dataset: pd.DataFrame,
    output_path: str = "../../../authorship_analysis/data/document_vectors.csv",
) -> None:
    """Vectorize every document in ``dataset`` and write the result to a CSV file.

    Args:
        dataset: Frame with a ``fullText`` column (the documents to vectorize)
            and an ``authorIDs`` column (copied into the output as
            ``author_id``).
        output_path: Destination CSV file. Defaults to the original
            hard-coded location, resolved relative to the current working
            directory.

    Returns:
        None. The only result is the CSV written to ``output_path``.
    """
    documents = dataset["fullText"].to_list()
    # Take the author ids as a plain list as well. Inserting the Series itself
    # makes pandas align it on index labels, which yields NaN / misassigned ids
    # whenever ``dataset`` does not carry a default 0..n-1 index, while the
    # documents above are handed over purely by position.
    author_ids = dataset["authorIDs"].to_list()

    # NOTE(review): assumes create_vector_df returns one row per input
    # document, in input order -- confirm against GrammarVectorizer.
    vectors = GrammarVectorizer().create_vector_df(documents)
    vectors.insert(1, "author_id", author_ids)

    # index=False states the intent explicitly (the row index is not written).
    vectors.to_csv(output_path, index=False)

In [7]:
# Load the PAN22 data into a dataframe.
pan = load_pan22()

# Vectorize the documents in `pan` and write the vectors to a CSV file (the call returns nothing).
create_vector_dataset(pan)

Unnamed: 0,authorIDs,documentID,fullText,discourse_type,collectionNum,dateCollected,publiclyAvailable,source,deidentified,languages,lengthWords,isForeground
0,en_110,ed5ec66c-d70f-11ed-8cc6-76349838619d,"Barrett, Thank you so much, these were helpful...",email,,,False,university,False,[en],635,False
1,en_112,ed73dd7c-d70f-11ed-8cc6-76349838619d,"ill have a check now, I assumed they all start...",text_message,,,False,university,False,[en],115,False
2,en_112,ed747bba-d70f-11ed-8cc6-76349838619d,"Dear Ania, I am on my placement year of course...",email,,,False,university,False,[en],368,False
3,en_76,ed87d5ca-d70f-11ed-8cc6-76349838619d,before dinner haha Next weekend why x Well? Wh...,text_message,,,False,university,False,[en],126,False
4,en_62,ed8fa796-d70f-11ed-8cc6-76349838619d,Over the last few years EMNE’s have become mor...,essay,,,False,university,False,[en],3013,False
