# Metadata

```yaml
Course:    DS 5001 
Email:     tdj5xk@virginia.edu
Author:    Chris Longchamp
Date:      2 May 2023
```


## Preprocessing Data

In [1]:
# Notebook configuration: every path below is relative to the notebook's working directory.
data_home = "../DS5001"
# Directory appended to sys.path in the import cell so `textparser` can be imported.
local_lib = "../DS5001/lib"
# Folder holding the Project Gutenberg plain-text files that make up the corpus.
source_files = f'{data_home}/NLPProjectGutenberg/final-set'
# NOTE(review): data_prefix is not referenced anywhere in the visible notebook — confirm it is still needed.
data_prefix = 'final'

### Importing Necessary Libraries

In [47]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import sys
sys.path.append(local_lib)
from textparser import TextParser
from sklearn.decomposition import PCA
from scipy.linalg import norm
from scipy.linalg import eigh
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from gensim.models import word2vec
from sklearn.manifold import TSNE
import plotly_express as px

### Setting Chapter Patterns

In [3]:
# Regexes marking the start and end of the Project Gutenberg boilerplate; they are
# handed to TextParser as `clip_pats` (tokenize_collection redefines the same list locally).
clip_pats = [
    r"\*\*\*\s*START OF",
    r"\*\*\*\s*END OF"
]

# Per-book chapter-heading regexes, keyed by Gutenberg book id.
# Every pattern is later wrapped as ('chap', <regex>, 'm') in tokenize_collection.
roman = '[IVXLCM]+'
# NOTE(review): `caps` is not used by any pattern below.
caps = "[A-Z';, -]+"
ohco_pat_list = [
    # NOTE(review): the spaces around `|` are literal unless TextParser compiles with re.VERBOSE — confirm.
    (805,   rf"^\s*CHAPTER\s\d+\. | INTERLUDE"),
    (4368,  rf"^\s*CHAPTER\s+{roman}$"),
    (64317,  rf"^\s*{roman}$"),
    # NOTE(review): the doubled `^^` is redundant; a single anchor matches the same lines.
    (6695, rf"^^\s*CHAPTER\s+{roman}"),
    (144, rf"^\s*CHAPTER\s+{roman}"),
    (1245, rf"^\s*CHAPTER\s+{roman}"),
    (5670, rf"^\s*CHAPTER\s"),
    (29220, rf"^\b[A-Z\s]+\b$"),
    (61085, rf"^\s*chapter\s*\d+\s*"),
    (63022, rf"^\s*Chapter\s\d+"),
    # NOTE(review): the brackets make this a character class, so it matches any line built
    # only from these letters and whitespace, not just the literal title — confirm intent.
    (63107, rf"^[MRS DALLOWAY IN BOND STREET\s]+$"),
    (67138, rf"^\s*CHAPTER\s+\d+$"),
    (69683, rf"^[A-Z\s]+$")
]

In [4]:
# Every file in the source folder whose name contains a dot, sorted so runs are deterministic.
source_file_list = sorted(glob(f"{source_files}/*.*"))

In [5]:
# Build (book_id, path, raw_title) records from file names shaped like
# AUTHOR_NAME_TITLE_WORDS-pg<ID>.txt
book_data = []
for source_file_path in source_file_list:
    # Strip the directory on either separator: glob can return '/' or '\\' (or a mix)
    # depending on the platform, and splitting on '\\' alone leaves the whole
    # directory inside the title on POSIX paths.
    file_name = re.split(r'[\\/]', source_file_path)[-1]
    # The numeric id sits after the last '-' as 'pg<ID>.<ext>'.
    book_id = int(file_name.split('-')[-1].split('.')[0].replace('pg', ''))
    # Everything before the first '-' is AUTHOR + TITLE with '_' for spaces.
    book_title = file_name.split('-')[0].replace('_', ' ')
    book_data.append((book_id, source_file_path, book_title))

### Creating LIB Table

In [6]:
# Library table: one row per book, keyed and ordered by Gutenberg id.
LIB = (
    pd.DataFrame(book_data, columns=['book_id', 'source_file_path', 'raw_title'])
    .set_index('book_id')
    .sort_index()
)

In [7]:
LIB.shape

(13, 2)

In [8]:
# Split raw_title ("FIRST LAST TITLE WORDS ...") into author and title.
# The explicit column check keeps the cell re-runnable (raw_title is dropped on the
# first run) without hiding unrelated AttributeErrors behind a bare `pass`.
if 'raw_title' in LIB.columns:
    title_words = LIB['raw_title'].str.split()
    # First two words are the author, joined as "FIRST, LAST" (author_key relies on ', ').
    LIB['author'] = title_words.str[:2].str.join(', ')
    LIB['title'] = title_words.str[2:].str.join(' ')
    LIB = LIB.drop('raw_title', axis=1)

In [9]:
# Attach each book's chapter regex by id; ids without a pattern get NaN.
chap_regex_by_id = pd.Series(dict(ohco_pat_list))
LIB['chap_regex'] = LIB.index.map(chap_regex_by_id)

In [10]:
LIB

Unnamed: 0_level_0,source_file_path,author,title,chap_regex
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
144,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",THE VOYAGE OUT,^\s*CHAPTER\s+[IVXLCM]+
805,../DS5001/NLPProjectGutenberg/final-set\F.SCOT...,"F.SCOTT, FITZGERALD",THIS SIDE OF PARADISE,^\s*CHAPTER\s\d+\. | INTERLUDE
1245,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",NIGHT AND DAY,^\s*CHAPTER\s+[IVXLCM]+
4368,../DS5001/NLPProjectGutenberg/final-set\F.SCOT...,"F.SCOTT, FITZGERALD",FLAPPERS AND PHILOSOPHERS,^\s*CHAPTER\s+[IVXLCM]+$
5670,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",JACOBS ROOM,^\s*CHAPTER\s
6695,../DS5001/NLPProjectGutenberg/final-set\F.SCOT...,"F.SCOTT, FITZGERALD",TALES OF THE JAZZ AGE,^^\s*CHAPTER\s+[IVXLCM]+
29220,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",MONDAY OR TUESDAY,^\b[A-Z\s]+\b$
61085,../DS5001/NLPProjectGutenberg/final-set\ERNEST...,"ERNEST, HEMINGWAY",IN OUR TIME,^\s*chapter\s*\d+\s*
63022,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",MR BENNETT AND MRS BROWN,^\s*Chapter\s\d+
63107,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",MRS DALLOWAY IN BOND STREET,^[MRS DALLOWAY IN BOND STREET\s]+$


In [46]:
# author_key is the lower-cased text before the first ', ' in `author`
# (i.e. the first word of the file name, e.g. 'virginia').
LIB['author_key'] = LIB['author'].str.partition(', ')[0].str.lower()
# Sorted list of distinct keys; later used as column names in the topic tables.
AUTHORS = sorted(LIB['author_key'].dropna().unique().tolist())

In [11]:
def tokenize_collection(LIB):
    """
    Parse every book listed in LIB with TextParser and stack the token tables.

    INPUTS:

    LIB: DataFrame indexed by book_id with `title`, `chap_regex` and
    `source_file_path` columns.

    OUTPUTS:

    CORPUS: the concatenation of each parser's TOKENS table, indexed by
    book_id followed by the parser's OHCO levels, sorted by index.
    """
    # Markers delimiting the Project Gutenberg boilerplate.
    boilerplate_pats = [
        r"\*\*\*\s*START OF",
        r"\*\*\*\s*END OF"
    ]

    token_frames = []
    for book_id in LIB.index:
        record = LIB.loc[book_id]

        print("Tokenizing", book_id, record.title)

        # One milestone-style ('m') chapter pattern per book.
        parser = TextParser(
            record.source_file_path,
            ohco_pats=[('chap', record.chap_regex, 'm')],
            clip_pats=boilerplate_pats,
            use_nltk=True,
        )
        parser.verbose = True
        parser.strip_hyphens = True
        parser.strip_whitespace = True

        parser.import_source().parse_tokens()

        # Prefix the parser's own OHCO index with the book id.
        parser.TOKENS['book_id'] = book_id
        parser.TOKENS = parser.TOKENS.reset_index().set_index(['book_id'] + parser.OHCO)

        token_frames.append(parser.TOKENS)

    CORPUS = pd.concat(token_frames).sort_index()

    print("Done")

    return CORPUS

### Tokenizing the Corpus

In [12]:
CORPUS = tokenize_collection(LIB)

Tokenizing 144 THE VOYAGE OUT
Importing  ../DS5001/NLPProjectGutenberg/final-set\VIRGINIA_WOOLF_THE_VOYAGE_OUT-pg144.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*CHAPTER\s+[IVXLCM]+
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 805 THIS SIDE OF PARADISE
Importing  ../DS5001/NLPProjectGutenberg/final-set\F.SCOTT_FITZGERALD_THIS_SIDE_OF_PARADISE-pg805.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*CHAPTER\s\d+\. | INTERLUDE
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 1245 NIGHT AND DAY
Importing  ../DS5001/NLPProjectGutenberg/final-set\VIRGINIA_WOOLF_NIGHT_AND_DAY-pg1245.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*CHAPTER\

In [13]:
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
144,1,1,0,0,"(As, IN)",IN,As,as
144,1,1,0,1,"(the, DT)",DT,the,the
144,1,1,0,2,"(streets, NNS)",NNS,streets,streets
144,1,1,0,3,"(that, WDT)",WDT,that,that
144,1,1,0,4,"(lead, VBP)",VBP,lead,lead
...,...,...,...,...,...,...,...,...
69683,14,89,10,15,"(it, PRP)",PRP,it,it
69683,14,89,10,16,"(would, MD)",MD,would,would
69683,14,89,10,17,"(fix, VB)",VB,fix,fix
69683,14,89,10,18,"(up, RP)",RP,up,up


## Vocabulary Table

In [14]:
# Vocabulary table: one row per distinct term_str.
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()          # relative frequency
VOCAB['i'] = -np.log2(VOCAB.p)                # self-information in bits

# Build the term x POS count table once (it was previously recomputed per column).
term_pos_counts = CORPUS[['term_str', 'pos']].value_counts().unstack(fill_value=0)
VOCAB['max_pos'] = term_pos_counts.idxmax(1)        # most frequent tag for the term
VOCAB['n_pos'] = (term_pos_counts > 0).sum(1)       # number of distinct tags seen

### Adding POS Group

In [15]:
# Load the tagset file: first whitespace-delimited field is the tag id,
# the rest of the line is its definition.
pos_info = 'upenn_tagset.txt'
with open(pos_info, 'r') as tagset_file:   # context manager closes the handle
    tag_rows = [
        (fields[0], ' '.join(fields[1:]))
        for fields in (line.split() for line in tagset_file)
        if fields                           # skip blank lines instead of raising IndexError
    ]
POS = pd.DataFrame(tag_rows, columns=['pos_id', 'pos_def']).set_index('pos_id')
# Coarse group = first two characters of the tag (e.g. NN, VB).
POS['pos_group'] = POS.index.str[:2]

### Adding Max POS Group

In [16]:
# Coarse POS group = first two characters of the tag; the vectorised .str slice
# leaves missing tags as NaN instead of raising inside a row-wise lambda.
VOCAB['max_pos_group'] = VOCAB.max_pos.str[:2]
CORPUS['pos_group'] = CORPUS.pos.str[:2]

### Adding Stop Words

In [17]:
# Stop-word lookup: `sw` is indexed by term_str with a constant `dummy` column of 1.
english_stops = nltk.corpus.stopwords.words('english')
sw = pd.DataFrame({'dummy': 1}, index=pd.Index(english_stops, name='term_str'))

# Flag vocabulary terms found in the list (1) versus not found (0).
stop_flags = pd.Series(VOCAB.index.map(sw.dummy), index=VOCAB.index)
VOCAB['stop'] = stop_flags.fillna(0).astype('int')

## Bag of Words and TFIDF

In [20]:
def BOW(corpus, bag):
    '''
    Build a bag-of-words table for a chosen OHCO level.

    INPUTS:

    corpus: A tokens DataFrame (CORPUS or any subset of it) that exposes
    `term_str` and the levels named in `bag`.

    bag: List of OHCO level names defining what counts as one document,
    e.g. book, chapter, paragraph, or sentence.

    OUTPUTS:

    A DataFrame indexed by `bag + ['term_str']` with a single column `n`
    holding the number of occurrences of each term in each bag.
    '''
    group_keys = bag + ['term_str']
    term_counts = corpus.groupby(group_keys)['term_str'].count()
    return term_counts.to_frame('n')

In [30]:
def TFIDF(BOW, kind, measure, tf_norm_k=0.5):
    '''
    Compute TFIDF (per bag and term) or DFIDF (per term) from a bag-of-words table.

    INPUTS:
    BOW: Bag-of-words DataFrame indexed by (bag levels..., term_str) with a count column `n`.

    kind: Term-frequency scheme, used only when measure == 'tfidf':
        'sum'         count / total count in the bag
        'max'         count / largest count in the bag
        'log'         log2(1 + count)
        'raw'         the raw count
        'double_norm' tf_norm_k + (1 - tf_norm_k) * count / largest count in the bag
        'binary'      1 for every term present in the bag

    measure: 'tfidf' to return the BOW table with `tf` and `tfidf` columns added,
    'dfidf' to return a Series of DF * IDF per term (`kind` is ignored).

    tf_norm_k: Smoothing constant for the 'double_norm' scheme (default 0.5).

    OUTPUTS:
    For 'tfidf': a copy of BOW with `tf` and `tfidf` columns.
    For 'dfidf': a Series indexed by term_str.

    RAISES:
    ValueError if `measure` is not recognised, or if measure == 'tfidf' and
    `kind` is not one of the schemes listed above.
    '''

    # Rows are bags (documents), columns are terms; absent terms are NaN.
    DTCM = BOW.n.unstack()

    DF = DTCM.count()              # number of bags containing each term
    N = DTCM.shape[0]              # number of bags
    IDF = np.log2(N / DF)

    if measure == 'dfidf':
        return DF * IDF
    if measure != 'tfidf':
        raise ValueError(f"Unknown measure {measure!r}; expected 'tfidf' or 'dfidf'")

    counts = DTCM.T                # terms x bags, so column-wise ops are per bag
    if kind == 'sum':
        TF = counts / counts.sum()
    elif kind == 'max':
        TF = counts / counts.max()
    elif kind == 'log':
        TF = np.log2(1 + counts)
    elif kind == 'raw':
        TF = counts
    elif kind == 'double_norm':
        # Previously identical to 'max'; apply the smoothing term that defines this scheme.
        TF = tf_norm_k + (1 - tf_norm_k) * (counts / counts.max())
    elif kind == 'binary':
        TF = counts.astype('bool').astype('int')
    else:
        # Covers 'none' and typos: TF used to be left undefined and failed later.
        raise ValueError(
            f"Unknown TF kind {kind!r}; expected one of "
            "'sum', 'max', 'log', 'raw', 'double_norm', 'binary'"
        )

    TF = TF.T                      # back to bags x terms so it lines up with IDF
    weighted = TF * IDF
    result = BOW.copy()
    # stack() restores the (bag..., term_str) index; assignment aligns on it.
    result['tf'] = TF.stack()
    result['tfidf'] = weighted.stack()
    return result

### Setting OHCO Index

In [31]:
# Full hierarchy from book down to token; each bag level is a prefix of it.
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']
BOOKS, CHAPS, PARAS, SENTS = (OHCO[:depth] for depth in (1, 2, 3, 4))

### Getting Bag of Words by Book

In [32]:
# Book-level bag: each book_id is treated as one document.
bag = BOOKS
BOW_1 = BOW(CORPUS, bag)
BOW_1

Unnamed: 0_level_0,Unnamed: 1_level_0,n
book_id,term_str,Unnamed: 2_level_1
144,,15
144,112,1
144,1580,1
144,1660,1
144,1852,1
...,...,...
69683,yucatan,1
69683,yuh,1
69683,zigzag,1
69683,zurito,88


In [34]:
tfidf_max = TFIDF(BOW_1, 'max', 'tfidf')
tfidf_max

Unnamed: 0_level_0,Unnamed: 1_level_0,n,tf,tfidf
book_id,term_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
144,,15,0.002088,0.000503
144,112,1,0.000139,0.000515
144,1580,1,0.000139,0.000515
144,1660,1,0.000139,0.000515
144,1852,1,0.000139,0.000515
...,...,...,...,...
69683,yucatan,1,0.000323,0.001195
69683,yuh,1,0.000323,0.000872
69683,zigzag,1,0.000323,0.000683
69683,zurito,88,0.028424,0.105180


In [35]:
DFIDF = TFIDF(BOW_1, 'max', 'dfidf')

In [36]:
# Mean TFIDF of each term across books. Select the column before aggregating:
# the first positional argument of GroupBy.mean is `numeric_only`, not a column name.
VOCAB['tfidf_mean'] = tfidf_max.groupby('term_str')['tfidf'].mean()
VOCAB['dfidf'] = DFIDF
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,n_pos,max_pos_group,stop,tfidf_mean,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,473,0,0.000607,10.686917,NN,12,NN,0,0.003575,2.651089
03,1,2,0.000001,19.572613,NNP,1,NN,0,0.001366,3.700440
1,5,1,0.000006,17.250685,CD,2,CD,0,0.004580,5.400879
10,1,2,0.000001,19.572613,CD,1,CD,0,0.000914,3.700440
1030,1,4,0.000001,19.572613,CD,1,CD,0,0.001195,3.700440
...,...,...,...,...,...,...,...,...,...,...
τὰ,1,2,0.000001,19.572613,NNP,1,NN,0,0.000515,3.700440
χειμερίῳ,1,8,0.000001,19.572613,NNP,1,NN,0,0.000515,3.700440
χωρεῖ,1,5,0.000001,19.572613,NNP,1,NN,0,0.000515,3.700440
ἀν,1,2,0.000001,19.572613,NNP,1,NN,0,0.000515,3.700440


In [40]:
# Wide book x term matrix of TFIDF weights; terms absent from a book are filled with 0.
TFIDF_TABLE = tfidf_max['tfidf'].unstack(fill_value=0) 
# Feature selection: keep the 1000 terms with the highest DFIDF whose most frequent tag is NN or NNS.
VSHORT = VOCAB[VOCAB.max_pos.isin(['NN', 'NNS'])].sort_values('dfidf', ascending=False).head(1000)
TFIDF_TABLE = TFIDF_TABLE[VSHORT.index]
TFIDF_TABLE

term_str,salad,architecture,cock,spray,eyelashes,repair,paving,extravagance,spun,cocktail,...,rustle,scales,insect,infirmary,saints,infantry,saloon,indoors,individuals,salute
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
144,0.000192,0.0,0.000192,0.0,0.0,0.0,0.000192,0.000192,0.000384,0.0,...,0.0,0.000473,0.000947,0.0,0.000473,0.0,0.000947,0.00071,0.000473,0.0
805,0.00034,0.001021,0.0,0.0,0.00034,0.00034,0.0,0.000681,0.00034,0.000681,...,0.00084,0.0,0.0,0.00042,0.0021,0.00126,0.0,0.00042,0.00084,0.0
1245,0.0,0.000343,0.000686,0.000686,0.0,0.000343,0.000343,0.000171,0.0,0.0,...,0.000211,0.0,0.000211,0.0,0.0,0.0,0.0,0.000423,0.00148,0.000423
4368,0.000509,0.000509,0.0,0.0,0.000509,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.002511,0.0,0.0,0.0,0.0
5670,0.0,0.001055,0.0,0.000703,0.0,0.0,0.000352,0.000352,0.001406,0.0,...,0.0,0.0,0.002602,0.0,0.0,0.0,0.0,0.002602,0.000434,0.000434
6695,0.00121,0.0,0.0,0.000605,0.000303,0.000908,0.0,0.000303,0.0,0.000303,...,0.000746,0.0,0.0,0.0,0.000373,0.000373,0.0,0.0,0.0,0.0
29220,0.0,0.0,0.001065,0.001065,0.0,0.001065,0.0,0.0,0.001065,0.0,...,0.0,0.003942,0.002628,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61085,0.0,0.0,0.0,0.0,0.0,0.0,0.00458,0.0,0.0,0.0,...,0.0,0.0,0.0,0.005649,0.0,0.0,0.0,0.0,0.0,0.0
63022,0.0,0.003314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Document table: author and title for every book present in TFIDF_TABLE,
# in TFIDF_TABLE's row order.
DOC = (
    pd.DataFrame(index=TFIDF_TABLE.index)
    .join(LIB[['author', 'title']], how='inner')
)
DOC

Unnamed: 0_level_0,author,title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
144,"VIRGINIA, WOOLF",THE VOYAGE OUT
805,"F.SCOTT, FITZGERALD",THIS SIDE OF PARADISE
1245,"VIRGINIA, WOOLF",NIGHT AND DAY
4368,"F.SCOTT, FITZGERALD",FLAPPERS AND PHILOSOPHERS
5670,"VIRGINIA, WOOLF",JACOBS ROOM
6695,"F.SCOTT, FITZGERALD",TALES OF THE JAZZ AGE
29220,"VIRGINIA, WOOLF",MONDAY OR TUESDAY
61085,"ERNEST, HEMINGWAY",IN OUR TIME
63022,"VIRGINIA, WOOLF",MR BENNETT AND MRS BROWN
63107,"VIRGINIA, WOOLF",MRS DALLOWAY IN BOND STREET


## Modeling 

### PCA

In [38]:
def PCA(matrix, k, norm_docs, center_by_mean, center_by_variance, doc=None):
    '''
    Compute the top-k principal components of a document-term matrix by
    eigendecomposition of its covariance matrix.

    NOTE: this function's name shadows the `PCA` class imported from
    sklearn.decomposition at the top of the notebook.

    INPUTS:
    matrix: DataFrame with one row per document and one column per term.
    k: Number of components to keep.
    norm_docs: If truthy, scale each row to unit L2 length first.
    center_by_mean: If truthy, subtract each column's mean.
    center_by_variance: If truthy, divide each column by its standard deviation.
    doc: Optional DataFrame of document metadata (must contain `title`) indexed
    like `matrix`; defaults to the notebook-level DOC table.

    OUTPUTS:
    LOADINGS: terms x components DataFrame.
    DCM: documents x components DataFrame joined with `doc`, plus a `doc` label column.
    COMPS: components table (eig_val, eigenvector entries, exp_var), indexed PC0..PC{k-1}.
    '''
    if doc is None:
        doc = DOC

    if norm_docs:
        row_norms = norm(matrix, 2, axis=1)
        # An all-zero document has norm 0; dividing by it would inject NaN and make eigh fail.
        row_norms = np.where(row_norms == 0, 1.0, row_norms)
        matrix = matrix.div(row_norms, axis=0)

    if center_by_mean:
        matrix = matrix - matrix.mean()

    if center_by_variance:
        matrix = matrix / matrix.std()

    COV = matrix.T.dot(matrix) / (matrix.shape[0] - 1)

    eig_vals, eig_vecs = eigh(COV)

    # eigh returns eigenvectors as columns; transpose so each row pairs with its eigenvalue.
    EIG_VEC = pd.DataFrame(eig_vecs, index=COV.index, columns=COV.index)
    EIG_VAL = pd.DataFrame(eig_vals, index=COV.index, columns=['eig_val'])
    EIG_VAL.index.name = 'term_str'
    EIG_PAIRS = EIG_VAL.join(EIG_VEC.T)
    EIG_PAIRS['exp_var'] = np.round((EIG_PAIRS.eig_val / EIG_PAIRS.eig_val.sum()) * 100, 2)

    # Rank on the exact eigenvalue; the rounded exp_var can tie and scramble the order.
    COMPS = EIG_PAIRS.sort_values('eig_val', ascending=False).head(k).reset_index(drop=True)
    COMPS.index = ["PC{}".format(i) for i in COMPS.index.tolist()]
    COMPS.index.name = 'pc_id'

    LOADINGS = COMPS[COV.index].T
    LOADINGS.index.name = 'term_str'

    DCM = matrix.dot(LOADINGS)
    DCM = pd.concat([DCM, doc], axis=1)

    def _doc_label(row):
        # A (book, chapter, ...) index gets a chapter suffix; a book-level index
        # has a scalar key, which cannot be subscripted.
        key = row.name
        if isinstance(key, tuple) and len(key) > 1:
            return f"{row['title']} CH.{str(key[1]).zfill(2)}"
        return f"{row['title']}"

    DCM['doc'] = DCM.apply(_doc_label, axis=1)

    return LOADINGS, DCM, COMPS

In [None]:
# Top 10 components of the L2-normalised TFIDF matrix, with no centering or variance scaling.
loadings, dcm, comps = PCA(TFIDF_TABLE, 10, norm_docs=True, center_by_mean=False, center_by_variance=False)

In [None]:
def vis_pcs(M, a, b, label='author', hover_name='doc', symbol=None, size=None):
    """
    Show a scatter plot of documents on components PC{a} (x) and PC{b} (y).

    M is the document-component table; `label`, `hover_name`, `symbol` and `size`
    name columns of M used for colour, hover title, marker symbol and marker size.
    The figure is displayed and nothing is returned.
    """
    scatter_options = dict(
        x=f"PC{a}",
        y=f"PC{b}",
        color=label,
        hover_name=hover_name,
        symbol=symbol,
        size=size,
        marginal_x='box',
        height=800,
    )
    px.scatter(M, **scatter_options).show()
def vis_loadings(a=0, b=1, hover_name='term_str'):
    """
    Return a scatter plot of term loadings on components PC{a} (x) and PC{b} (y).

    Reads the notebook-level `loadings` table and joins it with VOCAB so that
    marker size reflects `i` and colour reflects `max_pos`. `hover_name` names
    the column used as the hover title (it was previously accepted but ignored).
    """
    X = loadings.join(VOCAB)
    return px.scatter(X.reset_index(), f"PC{a}", f"PC{b}",
                      text='term_str', size='i', color='max_pos',
                      hover_name=hover_name,
                      marginal_x='box', height=800)

In [None]:
# Colour by `author`: dcm carries the DOC columns (author, title); it has no `author_id` column.
vis_pcs(dcm, 0, 1, label="author")

In [None]:
vis_loadings(1,2)

### LDA

In [None]:
class topic_model:
    """
    Fit an LDA topic model on the nouns (NN / NNS) of a corpus and expose the result tables.

    Attributes set by __init__:
        theta:  documents x topics weight table.
        phi:    topics x terms weight table.
        topics: per-topic summary (top terms, label, weight sums, mean weight per author key).
        vocab:  terms retained by the vectorizer with their document counts.

    Relies on the notebook-level LIB (with an `author_key` column) and AUTHORS list.
    """

    def __init__(self, corpus, bag, n_terms=4000, ngram_range=(1, 2), n_topics=40,
                 max_iter=20, n_top_terms=9):
        """
        corpus:      tokens DataFrame with `pos` and `term_str` columns.
        bag:         list of OHCO level names defining one document.
        n_terms:     maximum vocabulary size kept by CountVectorizer.
        ngram_range: n-gram range passed to CountVectorizer.
        n_topics:    number of LDA topics.
        max_iter:    LDA iterations.
        n_top_terms: number of top terms listed per topic.

        These five settings were previously read from undefined notebook globals;
        the defaults are starting points — tune them for the corpus at hand.
        """
        # One string of nouns per bag.
        DOCS = corpus[corpus.pos.str.match(r'^NNS?$')]\
        .groupby(bag).term_str\
        .apply(lambda x: ' '.join(x))\
        .to_frame()\
        .rename(columns={'term_str':'doc_str'})

        count_engine = CountVectorizer(max_features=n_terms, ngram_range=ngram_range, stop_words='english')
        count_model = count_engine.fit_transform(DOCS.doc_str)
        TERMS = count_engine.get_feature_names_out()

        DTM = pd.DataFrame(count_model.toarray(), index=DOCS.index, columns=TERMS)

        # Local vocabulary of the vectorizer (kept separate from the notebook-level VOCAB).
        model_vocab = pd.DataFrame(index=TERMS)
        model_vocab.index.name = 'term_str'
        model_vocab['doc_count'] = DTM.astype('bool').astype('int').sum()
        DOCS['term_count'] = DTM.sum(1)

        self.vocab = model_vocab

        lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)
        lda_model = lda_engine.fit_transform(count_model)

        # Zero-padded topic names, e.g. T00..T39.
        TNAMES = [f"T{str(x).zfill(len(str(n_topics)))}" for x in range(n_topics)]
        THETA = pd.DataFrame(lda_model, index=DOCS.index)
        THETA.columns.name = 'topic_id'
        THETA.columns = TNAMES

        self.theta = THETA

        PHI = pd.DataFrame(lda_engine.components_, columns=TERMS, index=TNAMES)
        PHI.index.name = 'topic_id'
        PHI.columns.name  = 'term_str'

        self.phi = PHI

        # For each topic, the n_top_terms highest-weighted terms, one column per rank.
        TOPICS = PHI.stack().to_frame('topic_weight').groupby('topic_id')\
        .apply(lambda x: x.sort_values('topic_weight', ascending=False)\
        .head(n_top_terms).reset_index().drop('topic_id', axis=1)['term_str'])

        TOPICS['label'] = TOPICS.apply(lambda x: x.name + ' ' + ', '.join(x[:n_top_terms]), 1)
        TOPICS['doc_weight_sum'] = THETA.sum()
        TOPICS['term_freq'] = PHI.sum(1) / PHI.sum(1).sum()

        # Mean topic weight per author key, one column per entry of AUTHORS.
        TOPICS[AUTHORS] = THETA.join(LIB, on='book_id').groupby('author_key')[TNAMES].mean().T

        self.topics = TOPICS

#### LDA with Book as Bag

In [None]:
book_model = topic_model(corpus=CORPUS, bag=BOOKS)

In [None]:
book_model.theta

In [None]:
book_model.phi

In [None]:
book_model.topics

In [None]:
#book_model.topics[['Detective','Gothic','label']].style.background_gradient(cmap=colors, axis=None)

#### LDA with Chapters as Bag

In [None]:
chap_model = topic_model(corpus=CORPUS, bag=CHAPS)

In [None]:
chap_model.theta

In [None]:
chap_model.phi

In [None]:
chap_model.topics

In [None]:
#chap_model.topics[['Detective','Gothic','label']].style.background_gradient(cmap=colors, axis=None)

### word2vec

In [None]:
# Word2Vec settings: context window, embedding size, minimum term frequency, worker threads.
w2v_params = {
    'window': 2,
    'vector_size': 256,
    'min_count': 50,
    'workers': 4,
}

In [None]:
# Same settings as w2v_params but with a higher minimum term frequency.
w2v_params_2 = {
    'window': 2,
    'vector_size': 256,
    'min_count': 80,
    'workers': 4,
}

In [None]:
class wordvec:
    """
    Train a word2vec model on the nouns and verbs of a token table and project
    the learned vectors to 2-D with t-SNE.

    Attributes set by __init__:
        docs:   list of term lists fed to Word2Vec (one per bag, length > 1).
        model:  the trained gensim Word2Vec model.
        coords: per-term table with the vector, t-SNE `x`/`y`, and the VOCAB columns.
    """

    def __init__(self, TOKENS, VOCAB, w2v_params, bag=None):
        """
        TOKENS:     tokens DataFrame with `pos` and `term_str`.
        VOCAB:      vocabulary DataFrame indexed by term_str, merged into `coords`.
        w2v_params: mapping of Word2Vec keyword arguments (e.g. the notebook's
                    `w2v_params`). A string NAME is still accepted and resolved to a
                    notebook-level dict called `NAME_w2v_params`.
        bag:        list of OHCO level names defining one training sentence. When
                    omitted, the notebook-level BAG is used if it exists, otherwise
                    the sentence level (book, chapter, paragraph, sentence).
        """
        if bag is None:
            try:
                bag = BAG
            except NameError:
                bag = ['book_id', 'chap_id', 'para_num', 'sent_num']

        if isinstance(w2v_params, str):
            params_name = f"{w2v_params}_w2v_params"
            if params_name not in globals():
                raise ValueError(
                    f"No parameter dict named {params_name!r}; pass a dict of Word2Vec arguments instead"
                )
            params = dict(globals()[params_name])
        else:
            params = dict(w2v_params)

        DOCS = TOKENS[TOKENS.pos.isin(['NN','NNS','VB','VBD','VBG', 'VBN','VBP','VBZ'])]\
        .groupby(bag)\
        .term_str.apply(lambda  x:  x.tolist())\
        .reset_index()['term_str'].tolist()

        # A single-term "sentence" carries no context for word2vec.
        DOCS = [doc for doc in DOCS if len(doc) > 1]

        self.docs = DOCS

        model = word2vec.Word2Vec(DOCS, **params)

        self.model = model

        coords = pd.DataFrame(
            dict(
                vector = [model.wv.get_vector(w) for w in model.wv.key_to_index],
                term_str = model.wv.key_to_index.keys()
            )).set_index('term_str')

        # The iteration count is left at the library default (1000): the keyword was
        # renamed from n_iter to max_iter in newer scikit-learn, so passing either
        # name explicitly breaks on some versions.
        tsne_engine = TSNE(perplexity=20, learning_rate=200, init='random', random_state=42)
        tsne_model = tsne_engine.fit_transform(np.array(coords.vector.to_list()))

        coords['x'] = tsne_model[:,0]
        coords['y'] = tsne_model[:,1]

        # Attach the vocabulary statistics to every embedded term.
        coords = coords.merge(VOCAB.reset_index(), on='term_str')
        coords = coords.set_index('term_str')

        self.coords = coords

In [None]:
#austen = wordvec(AUSTEN_CORPUS, AUSTEN_VOCAB, 'austen')

In [None]:
# NOTE(review): `austen` is never created in the visible notebook (the cell that would
# build it is commented out), so this cell raises NameError on a fresh kernel — confirm
# which corpus subset it should plot.
# Plots each term at its t-SNE (x, y), coloured by `pos_group` and sized by `n`.
px.scatter(austen.coords.reset_index(), 'x', 'y', 
           text='term_str', 
           color='pos_group', 
           hover_name='term_str',          
           size='n',
           height=1000).update_traces(
                mode='markers+text', 
                textfont=dict(color='black', size=14, family='Arial'),
                textposition='top center')

In [None]:
#melville = wordvec(MELVILLE_CORPUS, MELVILLE_VOCAB, 'melville')

In [None]:
# NOTE(review): `melville` is never created in the visible notebook (the cell that would
# build it is commented out), so this cell raises NameError on a fresh kernel — confirm
# which corpus subset it should plot.
# Plots each term at its t-SNE (x, y), coloured by `pos_group` and sized by `n`.
px.scatter(melville.coords.reset_index(), 'x', 'y', 
           text='term_str', 
           color='pos_group', 
           hover_name='term_str',          
           size='n',
           height=1000).update_traces(
                mode='markers+text', 
                textfont=dict(color='black', size=14, family='Arial'),
                textposition='top center')

#### Analogies

In [None]:
def _complete_analogy(A, B, C, n=2):
    try:
        cols = ['term', 'sim']
        return pd.DataFrame(.model.wv.most_similar(positive=[B, C], negative=[A])[0:n], columns=cols)
    except KeyError as e:
        print('Error:', e)
        return None
    
def _get_most_similar(positive, negative=None):
    return pd.DataFrame(.model.wv.most_similar(positive, negative), columns=['term', 'sim'])

In [None]:
_complete_analogy('man', 'give', 'woman', 3)

In [None]:
def _complete_analogy(A, B, C, n=2):
    try:
        cols = ['term', 'sim']
        return pd.DataFrame(.model.wv.most_similar(positive=[B, C], negative=[A])[0:n], columns=cols)
    except KeyError as e:
        print('Error:', e)
        return None
    
def _get_most_similar(positive, negative=None):
    return pd.DataFrame(.model.wv.most_similar(positive, negative), columns=['term', 'sim'])

In [None]:
_complete_analogy('man', 'give', 'woman', 3)