In [None]:
# Colab only
!rm -rf clas-3801 corpus
!git clone https://github.com/cwf2/clas-3801-fa23
!mv clas-3801-fa23/Week_05/corpus .
!rm -rf clas-3801-fa23

In [None]:
import os
import requests
import pandas as pd
import spacy
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

In [None]:
# Load the spaCy pipeline used to tokenize and tag every text.
nlp = spacy.load('en_core_web_sm')

# Directory that holds the corpus files. It was previously referenced without
# being defined, which raised NameError on a fresh kernel.
dir_texts = 'corpus'

# Skip hidden entries (names starting with '.'). Sorting keeps the row order
# reproducible, since os.listdir() returns entries in arbitrary order.
files = sorted(f for f in os.listdir(dir_texts) if not f.startswith('.'))
records = []

for filename in files:
    # File names are expected to look like "<author>_<title><ext>". Only the
    # first underscore separates author from title, so titles may contain more.
    stem, _ext = os.path.splitext(filename)
    author, title = stem.split('_', 1)
    path = os.path.join(dir_texts, filename)
    print(path, end='...')
    # NOTE(review): assumes the corpus files are UTF-8 encoded; confirm.
    with open(path, encoding='utf-8') as f:
        fulltext = f.read()
    # Parse after the file is closed; the handle is not needed during NLP.
    doc = nlp(fulltext)
    print(len(doc), 'tokens')
    records.append(dict(
        author=author,
        title=title,
        token=list(doc),
    ))

# One row per text; the `token` column holds the list of spaCy tokens.
corpus = pd.DataFrame(records)
display(corpus)

In [None]:
# Reshape to one row per token: each element of a text's token list becomes
# its own row, with author/title repeated and a fresh 0..n-1 index.
corpus_frame = pd.DataFrame(corpus)
tokens = corpus_frame.explode(column='token', ignore_index=True)
display(tokens)

In [None]:
# Pull plain-string attributes off each spaCy token into their own columns:
# surface form, lemma and coarse part-of-speech tag.
for column, attribute in (('text', 'text'), ('lemma', 'lemma_'), ('pos', 'pos_')):
    tokens[column] = [getattr(tok, attribute) for tok in tokens['token']]
display(tokens)

In [None]:
# Keep only rows whose POS tag is not punctuation, whitespace or proper noun,
# and whose text contains at least one ASCII letter.
excluded_pos = ['PUNCT', 'SPACE', 'PROPN']
has_allowed_pos = ~tokens['pos'].isin(excluded_pos)
has_letter = tokens['text'].str.contains(r'[A-Za-z]')
tokens = tokens.loc[has_allowed_pos & has_letter]
display(tokens)

In [None]:
# Rate of each part-of-speech tag per 1000 tokens, one row per (author, title).
# normalize='index' turns each row's counts into proportions of that row.
pos_share = pd.crosstab(
    index=[tokens['author'], tokens['title']],
    columns=tokens['pos'],
    normalize='index',
)
pos_freq = pos_share.mul(1000)
display(pos_freq)

In [None]:
# Scatter plot of two POS rates, one point per text, coloured by author.
feat_x = 'NOUN'
feat_y = 'VERB'

fig, ax = plt.subplots()
# Author labels are the first level of the (author, title) row index.
authors = pos_freq.index.get_level_values(0)
for label, group in pos_freq.groupby(authors):
    ax.plot(group[feat_x], group[feat_y], marker='o', ls='', label=label)
# Labels and legend only need to be set once, after all groups are drawn
# (they were previously rebuilt on every loop iteration).
ax.set_xlabel(feat_x)
ax.set_ylabel(feat_y)
# Trailing semicolon keeps the Legend repr out of the cell output.
ax.legend();

In [None]:
# Preview: the 30 lemmas with the highest token counts across the corpus.
(
    tokens
    .groupby('lemma')
    .agg(count=('token', 'count'))
    .sort_values('count', ascending=False)
    .head(30)
)

In [None]:
# Most frequent words: the 100 lemmas with the highest corpus-wide counts,
# kept as an array of lemma strings.
lemma_totals = tokens.groupby('lemma').agg(count=('token', 'count'))
lemma_ranked = lemma_totals.sort_values('count', ascending=False)
mfw = lemma_ranked.head(100).index.values
print(mfw)

In [None]:
# Count occurrences of each most-frequent lemma per (author, title).
# `selected` marks the token rows whose lemma is in the mfw list.
selected = tokens.lemma.isin(mfw)
mfw_tokens = tokens.loc[selected]
lemma_count = pd.crosstab(
    index=[mfw_tokens['author'], mfw_tokens['title']],
    columns=mfw_tokens['lemma'],
)
display(lemma_count)

In [None]:
# Total number of (filtered) tokens per text, used as the denominator.
# Group by the same (author, title) key that indexes `lemma_count`: grouping
# by title alone would merge the totals of two works that share a title and
# relied on implicit level matching during the division.
n_lemmas = tokens.groupby(['author', 'title']).agg(
    n=('lemma', 'count'),
)
# Convert raw counts to rates per 1000 tokens, dividing row by row.
lemma_freq = lemma_count.div(n_lemmas['n'], axis=0) * 1000
display(lemma_freq)

In [None]:
# Standardize each lemma column to z-scores: subtract the column mean and
# divide by the column (sample) standard deviation.
column_mean = lemma_freq.mean()
column_std = lemma_freq.std()
lemma_z = (lemma_freq - column_mean) / column_std
display(lemma_z)

In [None]:
# Scatter plot of the z-scores of two lemmas, one point per text, by author.
feat_x = 'and'
feat_y = 'but'

fig, ax = plt.subplots()
# Group on this frame's own first index level (author) rather than on an
# `authors` array built from a different table in an earlier cell, which was
# matched by position and broke if that cell had not been run.
for label, group in lemma_z.groupby(level=0):
    ax.plot(group[feat_x], group[feat_y], marker='o', ls='', label=label)
ax.set_xlabel(feat_x)
ax.set_ylabel(feat_y)
# Trailing semicolon keeps the Legend repr out of the cell output.
ax.legend();

In [None]:
# Three-component PCA. random_state is pinned so results stay reproducible
# if scikit-learn picks a randomized solver (it only affects the 'arpack'
# and 'randomized' solvers).
pca_model = PCA(n_components=3, random_state=0)

In [None]:
# Fit the PCA on the standardized lemma rates and project every text onto the
# principal components.
pca_scores = pca_model.fit_transform(lemma_z)
pca = pd.DataFrame(
    pca_scores,
    index=lemma_z.index,
    # Name columns PC1..PCk from the actual number of components returned, so
    # changing n_components on the model does not break this cell.
    columns=[f'PC{i}' for i in range(1, pca_scores.shape[1] + 1)],
)
display(pca)

In [None]:
# Scatter plot of the first two principal components, one point per text,
# coloured by author.
feat_x = 'PC1'
feat_y = 'PC2'

fig, ax = plt.subplots()
# Group on this frame's own first index level (author) rather than on an
# `authors` array built from a different table in an earlier cell, which was
# matched by position and broke if that cell had not been run.
for label, group in pca.groupby(level=0):
    ax.plot(group[feat_x], group[feat_y], marker='o', ls='', label=label)
ax.set_xlabel(feat_x)
ax.set_ylabel(feat_y)
# Trailing semicolon keeps the Legend repr out of the cell output.
ax.legend();