# Creating LIB, CORPUS, and VOCAB tables

Charlie Perez (cwp5xyj)

Note: clipping the text files and creating the LIB table is largely done by hand. I want to preserve things like the Table of Contents for later work (given Martin's writing style, the exact POV of each chapter is important).

In [3]:
import numpy as np
import pandas as pd

from glob import glob
import re
import nltk
import plotly_express as px
import configparser

import os

In [53]:
from collections import defaultdict

In [36]:
# Think I will have to remove Fire and Blood - it wouldn't match well anyways, and would be much more of a novelty

# One record per novel. `clip_range` and `TOC_range` are (first, last) line
# positions inside the raw text file; `chap_regex` is the pattern used to
# recognise chapter-heading lines (the same pattern for every book).
metadata = [
    dict(
        ID=book_id,
        file=file,
        title=title,
        clip_range=clip_range,
        chap_regex=r'^[A-Z ]+$',
        TOC_range=TOC_range,
    )
    for book_id, file, title, clip_range, TOC_range in [
        (1, 'agot.txt', 'A Game of Thrones', (212, 14145), (20, 165)),
        (2, 'acok.txt', 'A Clash of Kings', (382, 16150), (55, 334)),
        (3, 'asos.txt', 'A Storm of Swords', (451, 20253), (59, 386)),
        (4, 'affc.txt', 'A Feast for Crows', (343, 13963), (87, 271)),
        (5, 'adwd.txt', 'A Dance with Dragons', (344, 18874), (104, 250)),
    ]
]

#### Create LIB table

With formatted table of contents

In [8]:
# The book text files are looked up in a `data` folder under the current working directory.
data_dir = os.getcwd() + '/data'
data_dir

'/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/DS5001/final-project/data'

In [37]:
# Library table: one row per book, keyed by ID, with `file` expanded to a
# full path under data_dir.
LIB = (
    pd.DataFrame(metadata)
    .assign(file=lambda frame: data_dir + '/' + frame['file'])
    .set_index('ID')
)
LIB

Unnamed: 0_level_0,file,title,clip_range,chap_regex,TOC_range
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/D...,A Game of Thrones,"(212, 14145)",^[A-Z ]+$,"(20, 165)"
2,/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/D...,A Clash of Kings,"(382, 16150)",^[A-Z ]+$,"(55, 334)"
3,/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/D...,A Storm of Swords,"(451, 20253)",^[A-Z ]+$,"(59, 386)"
4,/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/D...,A Feast for Crows,"(343, 13963)",^[A-Z ]+$,"(87, 271)"
5,/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/D...,A Dance with Dragons,"(344, 18874)",^[A-Z ]+$,"(104, 250)"


In [63]:
# Sanity check: print the parsing parameters stored for each book.
for _, book in LIB.iterrows():
    src = book.file
    clip_range = book.clip_range
    chap_regex = book.chap_regex
    TOC_range = book.TOC_range
    print(src, clip_range, chap_regex, TOC_range)

/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/DS5001/final-project/data/agot.txt (212, 14145) ^[A-Z ]+$ (20, 165)
/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/DS5001/final-project/data/acok.txt (382, 16150) ^[A-Z ]+$ (55, 334)
/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/DS5001/final-project/data/asos.txt (451, 20253) ^[A-Z ]+$ (59, 386)
/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/DS5001/final-project/data/affc.txt (343, 13963) ^[A-Z ]+$ (87, 271)
/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/DS5001/final-project/data/adwd.txt (344, 18874) ^[A-Z ]+$ (104, 250)


In [88]:
def read_lines(src):
    """Read a UTF-8 text file into a one-column DataFrame of raw lines.

    Parameters
    ----------
    src : str or path-like
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, in file order. The single column
        ``line_str`` holds the line text (line terminators are kept) and the
        default integer index is named ``line_id``.
    """
    # Use a context manager so the file handle is closed deterministically,
    # even if reading or decoding fails part-way through.
    with open(src, 'r', encoding='utf-8') as handle:
        text_lines = handle.readlines()
    LINES = pd.DataFrame({'line_str': text_lines})
    LINES.index.name = 'line_id'
    return LINES

def extract_TOC(LINES, TOC_range):
    """Build a numbered chapter list from the table-of-contents lines.

    Parameters
    ----------
    LINES : pandas.DataFrame
        Frame with a ``line_str`` column holding the raw lines of one book.
    TOC_range : tuple
        ``(first, last)`` index labels of the table of contents in ``LINES``;
        both ends are included in the slice.

    Returns
    -------
    list of str
        One entry per non-blank TOC line, in order. ``'Prologue'`` and
        ``'Epilogue'`` are kept as-is; every other entry gets a running
        per-title counter appended (``'BRAN 1'``, ``'BRAN 2'``, ...).
    """
    TOC_lines = LINES.loc[TOC_range[0]:TOC_range[1], 'line_str']
    contents = []
    chapter_counts = defaultdict(int)
    # Strip all surrounding whitespace, not just '\n': otherwise lines that
    # only contain spaces or '\r' become bogus entries, and 'ARYA ' would be
    # counted separately from 'ARYA'.
    for entry in TOC_lines.str.strip():
        if not entry:
            continue
        if entry in ('Prologue', 'Epilogue'):
            contents.append(entry)
        else:
            chapter_counts[entry] += 1
            contents.append(f'{entry} {chapter_counts[entry]}')
    return contents

def parse_tokens(LINES, clip_range, chap_regex, book_id):
    """Turn the raw lines of one book into a token table.

    The lines are clipped to ``clip_range``, split into chapters at lines
    matching ``chap_regex``, then into paragraphs, sentences (nltk
    ``sent_tokenize``) and POS-tagged tokens (nltk ``word_tokenize`` +
    ``pos_tag``).

    Parameters
    ----------
    LINES : pandas.DataFrame
        Frame with a ``line_str`` column of raw lines (e.g. as returned by
        ``read_lines``).
    clip_range : tuple
        ``(first, last)`` index labels of the body text in ``LINES``; the
        label-based slice includes both ends.
    chap_regex : str
        Pattern identifying chapter-heading lines. It is matched
        case-insensitively against each stripped line.
    book_id
        Value written into the ``book_id`` index level of every row.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(book_id, chap_num, para_num, sent_num, token_num)`` with
        columns ``pos_tuple``, ``pos``, ``token_str`` and ``term_str``.
        ``chap_num`` starts at 1; the other positional levels come from the
        zero-based labels produced by the split/stack steps.
    """
    LINES = LINES.loc[clip_range[0]:clip_range[1]].copy()
    OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']
    
    # Collapse line terminators and trim each line; blank source lines become
    # empty strings, which later act as paragraph separators.
    LINES.line_str = LINES.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()
    # Boolean mask of chapter-heading lines (case=False makes the match
    # case-insensitive regardless of how chap_regex is written).
    chap_lines = LINES.line_str.str.match(chap_regex, case=False)

    # Number the heading lines 1..k in order, then carry each number down to
    # the lines that follow it.
    LINES.loc[chap_lines, 'chap_num'] = [i+1 for i in range(LINES.loc[chap_lines].shape[0])]
    LINES.chap_num = LINES.chap_num.ffill()

    # Lines before the first heading have no chapter number and are dropped;
    # the heading lines themselves are removed as well. chap_lines was
    # computed before the dropna, so it is still indexed by the clipped lines.
    LINES = LINES.dropna(subset=['chap_num'])
    LINES = LINES.loc[~chap_lines]
    LINES.chap_num = LINES.chap_num.astype('int')

    # One string per chapter: lines joined with newlines, so an empty line in
    # the source shows up as consecutive newlines in chap_str.
    CHAPS = LINES.groupby(OHCO[:1])\
        .line_str.apply(lambda x: '\n'.join(x))\
        .to_frame('chap_str')

    CHAPS['chap_str'] = CHAPS.chap_str.str.strip()

    # A paragraph break is a run of two or more newlines.
    para_pat = r'\n\n+'

    PARAS = CHAPS['chap_str'].str.split(para_pat, expand=True).stack()\
        .to_frame('para_str').sort_index()
    PARAS.index.names = OHCO[:2]

    # Flatten remaining single newlines inside a paragraph, trim, and discard
    # paragraphs that are empty or whitespace-only.
    PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True)
    PARAS['para_str'] = PARAS['para_str'].str.strip()
    PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')]

    # Sentence-split each paragraph; stack() turns the per-paragraph columns
    # into a new index level.
    SENTS = PARAS.para_str\
                    .apply(lambda x: pd.Series(nltk.sent_tokenize(x), dtype='string'))\
                    .stack()\
                    .to_frame('sent_str')

    SENTS.index.names = OHCO[:3]

    # Tokenize and POS-tag each sentence; every cell is a (token, tag) tuple.
    TOKENS = SENTS.sent_str\
                    .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))
    TOKENS = TOKENS.stack().to_frame('pos_tuple')
    TOKENS['pos'] = TOKENS.pos_tuple.apply(lambda x: x[1])
    TOKENS['token_str'] = TOKENS.pos_tuple.apply(lambda x: x[0])
    # Provisional value: term_str is overwritten at the end of the function.
    TOKENS['term_str'] = TOKENS.token_str.str.lower()

    TOKENS.index.names = OHCO[:4]
    
    # Prepend book_id to the index so tables from several books can be stacked.
    TOKENS.reset_index(inplace=True)
    TOKENS['book_id'] = book_id
    OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
    TOKENS.set_index(OHCO, inplace=True)
    
    # Final term_str: for tokens whose tag is not in punc_pos, remove every
    # non-word character and underscore, then lowercase. Rows with a tag in
    # punc_pos are absent from the right-hand side, so index alignment leaves
    # their term_str missing (NaN). Tokens that consist only of non-word
    # characters but carry another tag end up as the empty string.
    punc_pos = ['$', "''", '(', ')', ',', '--', '.', ':', '``']
    TOKENS['term_str'] = TOKENS[~TOKENS.pos.isin(punc_pos)].token_str\
                        .str.replace(r'[\W_]+', '', regex=True).str.lower()  
    return TOKENS
    
    
def parse_corpus(LIB):
    """Parse every book listed in LIB into TOC lists and one token table.

    Parameters
    ----------
    LIB : pandas.DataFrame
        One row per book, indexed by book id, with columns ``file``,
        ``clip_range``, ``chap_regex`` and ``TOC_range``.

    Returns
    -------
    contents : list of list of str
        One chapter list per book, in LIB row order (see ``extract_TOC``).
    TOKENS : pandas.DataFrame
        The per-book token tables from ``parse_tokens`` stacked in LIB row
        order; an empty DataFrame when LIB has no rows.
    """
    contents = []
    token_frames = []
    for book_id, row in LIB.iterrows():
        LINES = read_lines(row.file)
        contents.append(extract_TOC(LINES, row.TOC_range))
        token_frames.append(
            parse_tokens(LINES, row.clip_range, row.chap_regex, book_id)
        )

    # Concatenate once at the end instead of re-copying the growing table on
    # every iteration. pd.concat raises on an empty list, so an empty library
    # still yields an empty frame.
    TOKENS = pd.concat(token_frames) if token_frames else pd.DataFrame()
    return contents, TOKENS

In [65]:
import time

In [89]:
%%time

# Parse every book in LIB: `contents` gets one chapter list per book and
# `TOKENS` the combined token table.
contents, TOKENS = parse_corpus(LIB)

CPU times: user 1min 43s, sys: 649 ms, total: 1min 44s
Wall time: 1min 45s


In [90]:
# Inspect the first rows of the combined token table.
TOKENS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,0,0,"(“, IN)",IN,“,
1,1,0,0,1,"(We, PRP)",PRP,We,we
1,1,0,0,2,"(should, MD)",MD,should,should
1,1,0,0,3,"(start, VB)",VB,start,start
1,1,0,0,4,"(back, RB)",RB,back,back


In [91]:
# Inspect the last rows of the combined token table.
TOKENS.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5,70,140,0,3,"(hands, NNS)",NNS,hands,hands
5,70,140,0,4,"(,, ,)",",",",",
5,70,140,0,5,"(the, DT)",DT,the,the
5,70,140,0,6,"(daggers, NNS)",NNS,daggers,daggers
5,70,140,0,7,"(., .)",.,.,


In [1]:
# Show the per-book chapter lists returned by parse_corpus. The cell that
# calls parse_corpus must already have been run in the current kernel,
# otherwise `contents` is undefined.
print(contents)

NameError: name 'contents' is not defined

TODO: The AFFC and ADWD chapter lists need to be adjusted by hand - this will be a bit of a pain.

#### Create Vocabulary from TOKENS table

In [None]:
# Build the vocabulary from the normalized terms. term_str is missing for
# punctuation-tagged tokens and is the empty string for tokens made up only of
# non-word characters (e.g. a curly quote); neither is a real term, so both
# are excluded before counting. Otherwise '' would appear as a vocabulary
# entry and inflate the total used for the probabilities.
VOCAB = (
    TOKENS.term_str
    .dropna()
    .loc[lambda terms: terms != '']
    .value_counts()
    .to_frame('n')
)
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB['n'] / VOCAB['n'].sum()  # relative frequency of the term
VOCAB['s'] = 1 / VOCAB['p']                 # inverse probability
VOCAB['i'] = np.log2(VOCAB['s'])            # log2 of the inverse probability
VOCAB['h'] = VOCAB['p'] * VOCAB['i']        # p * log2(1/p)

In [None]:
# Write each table to data/<NAME>.csv as pipe-delimited text.
for table_name, table in (('LIB', LIB), ('TOKENS', TOKENS), ('VOCAB', VOCAB)):
    table.to_csv(f'data/{table_name}.csv', sep='|')