# Visualization notebook



## Imports and functions

In [1]:
import utils
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import warnings
import spacy
from spacy import Language
warnings.filterwarnings("ignore", category=FutureWarning)
%matplotlib inline

# 1,071,477 authors

# Exploring MUD

In [28]:
import ijson
import sqlite3
import random

# Stream the raw MUD JSON-lines file and gather the "syms" items seen for the
# first `num_authors` author_id events, then save one randomly chosen author
# to "example_author.json" as a small inspection sample.
num_authors = 100

i = 0        # number of author_id events consumed so far
d = {}       # author_id value -> list of "syms" items collected for it
posts = []   # "syms" items accumulated since the previous author_id event

# Open through a context manager so the file handle is released as soon as
# the scan stops; previously the handle stayed open for the kernel's lifetime.
with open("data/mud/full/raw_mud/raw_all/data.jsonl") as f:
    # multiple_values=True: the file holds many top-level JSON values
    # rather than a single JSON document.
    data = ijson.parse(f, multiple_values=True)

    for prefix, event, value in data:
        if prefix == "syms.item":
            posts.append(value)

        # NOTE(review): this attaches the posts accumulated so far to the
        # author whose id is seen now, which presumably relies on "syms"
        # appearing before "author_id" inside each record - confirm against
        # the data; otherwise each author receives the previous record's posts.
        if prefix.startswith("author_id"):
            d[value] = posts
            posts = []
            i += 1

        # Stop early: only a small sample of authors is needed.
        if i == num_authors:
            break

# NOTE(review): the choice is unseeded, so the saved author (and the printed
# post count) changes between runs.
author = random.choice(list(d.keys()))

save = {author: d[author]}
print(len(d[author]))

utils.save_json(save, "example_author.json")



519


In [29]:
# Experiment: pair every token of a sample sentence with a coarse POS tag.
nlp = utils.load_spacy("en_core_web_md")

text = "This is a string and I eat ice cream!!!"
doc = nlp(text)

tokens = [token.text for token in doc]
pos = [token.pos_ for token in doc]
assert len(tokens) == len(pos)

# Each token is paired with the POS tag of the token that FOLLOWS it, so the
# last token has no partner and is dropped. zip() stops at the shorter
# sequence, which replaces the previous bare `except: pass` that was used to
# swallow the IndexError on the final iteration (and would also have hidden
# any unrelated error).
# NOTE(review): the one-position shift is preserved from the original
# `pos[i+1]`; if the intent was each token's own tag, use zip(tokens, pos).
l = list(zip(tokens, pos[1:]))

# replace open class with tags
l

[('This', 'AUX'),
 ('is', 'DET'),
 ('a', 'NOUN'),
 ('string', 'CCONJ'),
 ('and', 'PRON'),
 ('I', 'VERB'),
 ('eat', 'NOUN'),
 ('ice', 'NOUN'),
 ('cream', 'PUNCT'),
 ('!', 'PUNCT'),
 ('!', 'PUNCT')]

In [17]:
from spacy import displacy
# Check whether the document vector equals the element-wise mean of the
# vectors of its tokens.
nlp = utils.load_spacy("en_core_web_md")

text = """This is a sentence. I was wondering whether he and I should go to the park?"""
doc = nlp(text)

# Labels registered for the pipeline's "parser" component.
labels = nlp.pipe_labels["parser"]

# One vector per token, in document order.
vecs = [token.vector for token in doc]

# Average over the token axis, then compare component by component; the
# cell displays the resulting boolean array.
token_mean = np.mean(vecs, axis=0)
token_mean == doc.vector


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [16]:
# Display the vector of `doc`. `doc` is not created in this cell, so it must
# already exist in the kernel from a cell that ran `doc = nlp(text)`.
doc.vector

array([-2.2731838e+00,  3.6058960e+00, -3.3113937e+00, -1.8773474e+00,
        8.3632797e-01,  6.1170608e-01,  4.9491775e-01,  4.3823366e+00,
       -1.8666888e+00,  2.4825881e+00,  6.9578261e+00,  2.0415430e+00,
       -2.3739080e+00,  4.6799946e-01,  1.5287519e+00,  1.4221318e+00,
        1.7235076e+00, -2.1763837e+00, -2.4344490e+00, -2.0245967e+00,
        2.3184450e+00, -7.3679447e-01, -7.5279333e-02, -1.6977003e+00,
       -2.4081812e+00, -2.4943292e+00, -3.8142602e+00, -1.5184183e+00,
       -1.8877225e+00,  1.2409444e+00,  4.3822724e-01, -9.6723175e-01,
       -7.7480084e-01, -2.4900723e-01, -1.6841015e+00, -4.1133888e-02,
       -2.2853963e-01,  2.1446007e-01,  2.7201877e+00,  1.0776255e+00,
       -1.3120370e+00,  1.7148088e+00,  1.7439364e+00, -1.4167073e+00,
       -6.4397687e-01,  1.8910664e+00,  5.0272059e-01, -4.8849506e+00,
       -1.5670620e+00,  2.5801084e+00, -3.6327058e-01,  4.4041723e-01,
        1.2711728e+00, -4.5869837e+00, -1.0297160e+00, -1.4488143e+00,
      

In [48]:
"string"

"st", "sr", "si", "sn", "sg", "tr" ...

['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'amod', 'appos', 'attr', 'aux', 'auxpass', 'case', 'cc', 'ccomp', 'compound', 'conj', 'csubj', 'csubjpass', 'dative', 'dep', 'det', 'dobj', 'expl', 'intj', 'mark', 'meta', 'neg', 'nmod', 'npadvmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd', 'parataxis', 'pcomp', 'pobj', 'poss', 'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'relcl', 'xcomp']
