# Visualization notebook



## Imports and functions

In [2]:
import utils
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import warnings
import spacy
from copy import deepcopy
warnings.filterwarnings("ignore", category=FutureWarning)
%matplotlib inline

# 1,071,477 authors

# Exploring MUD

In [28]:
import ijson
import sqlite3
import random

# Stream the raw MUD dump and collect the posts of the first `num_authors`
# authors, then save one randomly chosen author as a small example file.
num_authors = 100

i = 0        # number of "author_id" events seen so far
d = {}       # author_id value -> list of "syms.item" values collected before it
posts = []   # "syms.item" values accumulated since the previous author_id

# Use a context manager so the file handle is released even though we stop
# reading early (the original left the handle open for the kernel's lifetime).
with open("data/mud/full/raw_mud/raw_all/data.jsonl") as f:
    data = ijson.parse(f, multiple_values=True)

    for prefix, event, value in data:

        if prefix == "syms.item":
            posts.append(value)

        # NOTE(review): this attributes the accumulated posts to the author_id
        # that FOLLOWS them in the stream, i.e. it assumes "author_id" comes
        # after "syms" inside each record - TODO confirm against the data.
        if prefix.startswith("author_id"):
            d[value] = posts
            posts = []
            i += 1

        if i == num_authors:
            break


# Pick one of the collected authors at random and persist only their posts.
author = random.choice(list(d.keys()))

save = {author: d[author]}
print(len(d[author]))

utils.save_json(save, "example_author.json")



519


In [29]:
nlp = utils.load_spacy("en_core_web_md")

text = "This is a string and I eat ice cream!!!"
doc = nlp(text)

tokens = [token.text for token in doc]
pos = [token.pos_ for token in doc]
assert len(tokens) == len(pos)

# Pair every token with the POS tag of the token that follows it. The last
# token has no successor and is therefore dropped; zip() stops there on its
# own, so no try/except (and no bare `except:` hiding real errors) is needed.
# NOTE(review): if the intent was to pair each token with its OWN tag, this
# should be zip(tokens, pos) - confirm before relying on the offset.
l = list(zip(tokens, pos[1:]))

# replace open class with tags
l

[('This', 'AUX'),
 ('is', 'DET'),
 ('a', 'NOUN'),
 ('string', 'CCONJ'),
 ('and', 'PRON'),
 ('I', 'VERB'),
 ('eat', 'NOUN'),
 ('ice', 'NOUN'),
 ('cream', 'PUNCT'),
 ('!', 'PUNCT'),
 ('!', 'PUNCT')]

In [3]:
nlp = utils.load_spacy("en_core_web_md")


doc = nlp("This string has four sentences. This previous sentence is false. That next sentence is true! I like cats")

# (start, end) token indices of every sentence in `doc`.
sent_spans = [(sent.start, sent.end) for sent in doc.sents]
pos = [token.pos_ for token in doc]

# Define `tokens` from THIS doc so that later cells pairing `sent_spans` with
# `tokens` do not silently pick up a stale `tokens` list built from a
# different text earlier in the notebook (breaks on Restart & Run All).
tokens = [token.text for token in doc]
print(tokens)



['This', 'string', 'has', 'four', 'sentences', '.', 'This', 'previous', 'sentence', 'is', 'false', '.', 'That', 'next', 'sentence', 'is', 'true', '!', 'I', 'like', 'cats']


In [6]:


def insert_boundaries(indices:list[tuple], tokens:list):
    """
    Insert sentence boundary tags into a list of tokens.

    Args:
        indices: (START, END) token-index pairs, one per sentence. END is
            treated as exclusive: the "EOS" tag is emitted *before* the token
            at position END, so END of one sentence may equal START of the
            next. A pair with START == END only produces "BOS".
        tokens: the token sequence the indices refer to.

    Returns:
        A new list with "BOS" placed before each sentence's first token and
        "EOS" placed after each sentence's last token. When several tags fall
        on the same position they appear in the order of `indices`.
        `tokens` itself is not modified.

    The closing "EOS" is only added when a sentence actually ends at
    len(tokens); empty input therefore yields an empty list rather than a
    lone "EOS".
    """

    # Map each token position to the tags that must precede it, so the tokens
    # are walked once instead of rescanning every span for every token.
    tags_at = {}
    for start, end in indices:
        tags_at.setdefault(start, []).append("BOS")
        if end != start:
            tags_at.setdefault(end, []).append("EOS")

    new_tokens = []
    for i, item in enumerate(tokens):
        new_tokens.extend(tags_at.get(i, ()))
        new_tokens.append(item)

    # Close sentences that run to the very end of the token list. Only "EOS"
    # is meaningful here: a "BOS" at len(tokens) would open a sentence with
    # no tokens in it.
    new_tokens.extend(tag for tag in tags_at.get(len(tokens), ()) if tag == "EOS")

    return new_tokens
        
    

# Demo: tag the sentence boundaries of the tokenised text.
# `sent_spans` and `tokens` must come from the same spaCy doc, otherwise the
# boundary indices will not line up with the tokens.
print(insert_boundaries(sent_spans, tokens))


['BOS', 'This', 'string', 'has', 'four', 'sentences', '.', 'EOS', 'BOS', 'This', 'previous', 'sentence', 'is', 'false', '.', 'EOS', 'BOS', 'That', 'next', 'sentence', 'is', 'true', '!', 'EOS', 'BOS', 'I', 'like', 'cats', 'EOS']


[2, 3, 1, 4, 2, 1, 1]