In [None]:
import configparser
import os

import numpy as np
import pandas as pd
from scipy.linalg import norm

In [4]:
import plotly_express as px
import seaborn as sns

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [6]:
# Switch on seaborn's default theme for plots drawn later in the notebook.
sns.set()

In [9]:
# Location of the settings file. The DS5001_ENV_INI environment variable
# overrides the default so the notebook is not tied to one machine's layout.
ENV_INI = os.environ.get(
    'DS5001_ENV_INI',
    "C:/Users/chris/Documents/UVA/DS_5001/env.ini",
)

config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open instead of raising and returns
# the list of files it actually parsed; an empty list means nothing was loaded.
if not config.read(ENV_INI):
    raise FileNotFoundError(f"Could not read settings file: {ENV_INI}")

# Directories used throughout the notebook, all taken from the DEFAULT section.
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [10]:
import sys

# Make the directory configured as `local_lib` importable. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
if local_lib not in sys.path:
    sys.path.append(local_lib)

In [11]:
# Path to the `gutenberg` folder under the configured data_home
# (presumably where the raw source texts live -- confirm).
source_files = f'{data_home}/gutenberg'
# data_prefix = 'eliot'

In [12]:
# Index columns of the token table, listed from book level down to token level;
# CORPUS is indexed by exactly these columns when it is loaded.
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']

In [13]:
# Reads the same DEFAULT/output_dir setting that the config cell already
# stored in `output_dir`, so this assignment does not change its value.
output_dir = config['DEFAULT']['output_dir']
# Same value as output_dir; the f-string only interpolates that one variable.
out_path = f'{output_dir}'

In [14]:
# Load the shelley-* CSV tables from output_dir and set their indexes:
# LIB by book_id, VOCAB by term_str, CORPUS by the full OHCO column list.
LIB = pd.read_csv(f'{output_dir}/shelley-LIB.csv').set_index('book_id')
VOCAB = pd.read_csv(f'{output_dir}/shelley-VOCAB.csv').set_index('term_str')
CORPUS = pd.read_csv(f'{output_dir}/shelley-CORPUS.csv').set_index(OHCO)

In [15]:
# custom BOW function
def create_BOW(corpus, ohco_level):
    """
    Count how often each term occurs inside every bag at the chosen level.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Token table that exposes the OHCO fields (as index levels or columns)
        and a `term_str` column.
    ohco_level : str
        One of 'BOOKS', 'CHAPS', 'PARAS' or 'SENTS'; any other value raises
        KeyError.

    Returns
    -------
    pandas.DataFrame
        A single column `n` of term counts, indexed by the bag's OHCO fields
        followed by `term_str`.
    """
    OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']

    # How many leading OHCO fields identify one bag at each level.
    depth_by_level = {'BOOKS': 1, 'CHAPS': 2, 'PARAS': 3, 'SENTS': 4}
    bag_fields = OHCO[:depth_by_level[ohco_level]]

    grouped = corpus.groupby(bag_fields + ['term_str'])
    term_counts = grouped['term_str'].count()

    return term_counts.to_frame('n')

In [16]:
def gather_docs(CORPUS, ohco_level, term_col='term_str'):
    """
    Join the tokens of each document into one space-separated string.

    Parameters
    ----------
    CORPUS : pandas.DataFrame
        Token table whose index levels are the OHCO hierarchy. It is not
        modified by this function.
    ohco_level : int
        Number of leading index levels that identify one document
        (e.g. 2 groups by the first two levels).
    term_col : str, default 'term_str'
        Column holding the token text.

    Returns
    -------
    pandas.DataFrame
        One row per document with a single `doc_str` column.
    """
    doc_levels = list(CORPUS.index.names)[:ohco_level]
    # Cast on a detached Series rather than assigning back into CORPUS, so the
    # caller's table keeps its original column values and dtype.
    terms = CORPUS[term_col].astype('str')
    DOC = terms.groupby(level=doc_levels).agg(' '.join).to_frame('doc_str')
    return DOC

In [22]:
def TFIDF_matrix(BOW, tf_method, CORPUS, ocholevel = 2,
                 ngram_range = (1, 2), n_terms = 4000, tf_norm_k = 0.5):
    """
    Build a TFIDF matrix from a bag of words, plus a CountVectorizer
    document-term matrix from the token table.

    Parameters
    ----------
    BOW : pandas.DataFrame
        Term counts in column `n`; the last index level is the term and the
        preceding levels identify the document.
    tf_method : str
        Term-frequency weighting, one of:
        'sum'         count / total count in the document
        'max'         count / largest count in the document
        'log'         log2(1 + count)
        'raw'         the count itself
        'double_norm' tf_norm_k + (1 - tf_norm_k) * count / largest count,
                      applied to terms that occur in the document (others stay 0)
        'binary'      1 if the term occurs in the document, else 0
    CORPUS : pandas.DataFrame
        Token table handed to `gather_docs` to build the document strings.
    ocholevel : int, default 2
        Number of leading CORPUS index levels that define one document for
        the CountVectorizer matrix.
    ngram_range : tuple, default (1, 2)
        N-gram range given to CountVectorizer.
    n_terms : int, default 4000
        `max_features` given to CountVectorizer.
    tf_norm_k : float, default 0.5
        Smoothing constant for 'double_norm'.

    Returns
    -------
    (TFIDF, DTM) : tuple of pandas.DataFrame
        TFIDF has one row per BOW document and one column per BOW term.
        DTM holds CountVectorizer counts (English stop words removed), one
        row per `gather_docs` document, columns named `term_str`.

    Raises
    ------
    ValueError
        If `tf_method` is not one of the methods listed above.
    """
    # Merge duplicate (document, term) rows, then pivot the last index level
    # (the term) into columns: one row per document, one column per term.
    DTCM = (
        BOW.n
        .groupby(level=list(range(BOW.index.nlevels)))
        .sum()
        .unstack(fill_value=0)
    )

    if tf_method == 'sum':
        TF = DTCM.div(DTCM.sum(axis=1), axis=0)
    elif tf_method == 'max':
        TF = DTCM.div(DTCM.max(axis=1), axis=0)
    elif tf_method == 'log':
        TF = np.log2(1 + DTCM)
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'double_norm':
        relative = DTCM.div(DTCM.max(axis=1), axis=0)
        # Only smooth terms that are present; absent terms keep a weight of 0
        # so the matrix stays sparse in meaning.
        TF = (tf_norm_k + (1 - tf_norm_k) * relative).where(DTCM > 0, 0.0)
    elif tf_method == 'binary':
        TF = DTCM.astype('bool').astype('int')
    else:
        raise ValueError(
            f"Unknown tf_method {tf_method!r}; expected one of "
            "'sum', 'max', 'log', 'raw', 'double_norm', 'binary'"
        )

    # Inverse document frequency: log2(number of documents / documents containing the term).
    DF = DTCM.astype('bool').sum()
    N = DTCM.shape[0]
    IDF = np.log2(N / DF)
    TFIDF = TF * IDF

    # Independent document-term matrix built by scikit-learn from the joined
    # document strings.
    DOC = gather_docs(CORPUS, ocholevel)
    count_engine = CountVectorizer(
        stop_words = 'english',
        ngram_range = ngram_range,
        max_features = n_terms)
    X = count_engine.fit_transform(DOC.doc_str)
    DTM = pd.DataFrame(
        X.toarray(),
        columns=count_engine.get_feature_names_out(),
        index=DOC.index)
    # Label the column axis explicitly so the header reads `term_str`.
    DTM.columns.name = 'term_str'

    return TFIDF, DTM

In [17]:
# Count term occurrences per (book_id, chap_id) bag.
BOW = create_BOW(CORPUS, 'CHAPS')

In [26]:
# Display the chapter-level term counts.
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_id,term_str,Unnamed: 3_level_1
134,14,appendix,1
134,17,1,1
134,17,2,2
134,17,a,26
134,17,abortion,1
...,...,...,...
41445,23,your,36
41445,23,yours,1
41445,23,yourself,3
41445,23,yourselves,1


In [23]:
# Build the TFIDF matrix with 'sum' term-frequency weighting (count / document
# total) together with the CountVectorizer document-term matrix.
TFIDF, DTM = TFIDF_matrix(BOW, 'sum', CORPUS)

In [27]:
# Display the TFIDF matrix (documents as rows, terms as columns).
TFIDF

Unnamed: 0_level_0,term_str,1,10,100,102,103,105,10th,11,116,11th,...,zephyrs,zest,zone,à,æolian,æra,être,œdipus,œrta,ας
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
134,14,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134,17,0.005314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134,18,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134,19,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134,20,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41445,19,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41445,20,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41445,21,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41445,22,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Display the CountVectorizer document-term matrix.
DTM

Unnamed: 0_level_0,term_str,1819,abandoned,abhorrence,abilities,abject,able,abode,abroad,absence,absent,...,yielding,yoke,young,young man,young people,young woman,younger,youth,youthful,zeal
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
134,14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
134,17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
134,18,0,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
134,19,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
134,20,0,0,0,1,0,0,1,0,1,0,...,0,0,1,1,0,0,0,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41445,19,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41445,20,0,0,0,0,0,0,2,0,0,0,...,0,0,2,2,0,0,0,0,1,0
41445,21,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
41445,22,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [31]:

# Keep the 1000 terms with the highest dfidf among those whose max_pos tag is
# NN or NNS (a tag in that set can never be NNP, so no separate NNP exclusion
# is needed), then restrict the TFIDF matrix to those term columns.
VOCAB_REDUCED = (
    VOCAB.loc[VOCAB.max_pos.isin(['NN', 'NNS'])]
    .sort_values('dfidf', ascending=False)
    .head(1000)
)
TFIDF_REDUCED = TFIDF[VOCAB_REDUCED.index]
TFIDF_REDUCED

Unnamed: 0_level_0,term_str,conversation,passions,point,months,word,door,understanding,peace,solitude,order,...,gates,faith,trade,sailors,falsehood,bounds,brain,thousands,ice,plains
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
134,14,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
134,17,0.000000,0.001628,0.000000,0.001628,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
134,18,0.000663,0.000663,0.000663,0.000663,0.000000,0.000000,0.001325,0.000000,0.000651,0.0,...,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.001474,0.0,0.000000,0.000000
134,19,0.000865,0.000865,0.000000,0.000000,0.000000,0.000865,0.000000,0.000000,0.000000,0.0,...,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
134,20,0.001122,0.001682,0.000000,0.000000,0.000000,0.000000,0.001122,0.000000,0.000000,0.0,...,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.001248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41445,19,0.000000,0.000418,0.000418,0.000835,0.000000,0.002088,0.000000,0.000821,0.000000,0.0,...,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
41445,20,0.000000,0.000000,0.000393,0.001571,0.000000,0.000786,0.000393,0.000000,0.000000,0.0,...,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
41445,21,0.000872,0.000436,0.000000,0.000436,0.000436,0.000000,0.000000,0.002572,0.000429,0.0,...,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000970,0.000000
41445,22,0.000000,0.000000,0.000000,0.001147,0.000000,0.000000,0.000000,0.001128,0.000000,0.0,...,0.0,0.00000,0.0,0.000000,0.001276,0.0,0.000000,0.0,0.001276,0.000000


In [32]:
# Write the derived tables to output_dir as shelley-* CSV files.
BOW.to_csv(f'{output_dir}/shelley-BOW.csv')
TFIDF.to_csv(f'{output_dir}/shelley-TFIDF.csv')
DTM.to_csv(f'{output_dir}/shelley-DTM.csv')
VOCAB_REDUCED.to_csv(f'{output_dir}/shelley-VOCAB-REDUCED.csv')
TFIDF_REDUCED.to_csv(f'{output_dir}/shelley-TFIDF-REDUCED.csv')