# Creating LIB, CORPUS, and VOCAB tables

Charlie Perez (cwp5xyj)

A note that the process of clipping the text files and creating the LIB table will be largely done by hand - I really want to preserve things like the Table of Contents for later work (given Martin's writing style, the exact POV of the chapter is important).

In [1]:
import numpy as np
import pandas as pd

from glob import glob
import re
import nltk
import plotly_express as px
import configparser

import os

In [2]:
from collections import defaultdict

In [3]:
# Think I will have to remove Fire and Blood - it wouldn't match well anyways, and would be much more of a novelty

metadata = [
    {
        'ID': 1,
        'file': 'agot.txt',
        'title': 'A Game of Thrones',
        'clip_range': (212, 14145),
        'chap_regex': r'^[A-Z ]+$',
        'TOC_range': (20, 165)
    },
    {
        'ID': 2,
        'file': 'acok.txt',
        'title': 'A Clash of Kings',
        'clip_range': (382, 16150),
        'chap_regex': r'^[A-Z ]+$',
        'TOC_range': (55, 334)
    },
    {
        'ID': 3,
        'file': 'asos.txt',
        'title': 'A Storm of Swords',
        'clip_range': (451, 20253),
        'chap_regex': r'^[A-Z ]+$',
        'TOC_range': (59, 386)
    },
    {
        'ID': 4,
        'file': 'affc.txt',
        'title': 'A Feast for Crows',
        'clip_range': (343, 13963),
        'chap_regex': r'^[A-Z ]+$',
        'TOC_range': (87, 271)
    },
    {
        'ID': 5,
        'file': 'adwd.txt',
        'title': 'A Dance with Dragons',
        'clip_range': (344, 18874),
        'chap_regex': r'^[A-Z ]+$',
        'TOC_range': (104, 250)
    }
]

#### Create LIB table

With formatted table of contents

In [4]:
data_dir = f'{os.getcwd()}/data'
data_dir

'/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/DS5001/final-project/data'

In [5]:
LIB = pd.DataFrame(metadata)
LIB.file = data_dir + '/' + LIB.file
LIB.set_index('ID', inplace=True)
LIB

Unnamed: 0_level_0,file,title,clip_range,chap_regex,TOC_range
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/D...,A Game of Thrones,"(212, 14145)",^[A-Z ]+$,"(20, 165)"
2,/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/D...,A Clash of Kings,"(382, 16150)",^[A-Z ]+$,"(55, 334)"
3,/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/D...,A Storm of Swords,"(451, 20253)",^[A-Z ]+$,"(59, 386)"
4,/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/D...,A Feast for Crows,"(343, 13963)",^[A-Z ]+$,"(87, 271)"
5,/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/D...,A Dance with Dragons,"(344, 18874)",^[A-Z ]+$,"(104, 250)"


In [6]:
for idx, row in LIB.iterrows():
    src, clip_range, chap_ragex, TOC_range = row.file, row.clip_range, row.chap_regex, row.TOC_range
    print(src, clip_range, chap_ragex, TOC_range)

/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/DS5001/final-project/data/agot.txt (212, 14145) ^[A-Z ]+$ (20, 165)
/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/DS5001/final-project/data/acok.txt (382, 16150) ^[A-Z ]+$ (55, 334)
/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/DS5001/final-project/data/asos.txt (451, 20253) ^[A-Z ]+$ (59, 386)
/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/DS5001/final-project/data/affc.txt (343, 13963) ^[A-Z ]+$ (87, 271)
/sfs/gpfs/tardis/home/cwp5xyj/Documents/MSDS/DS5001/final-project/data/adwd.txt (344, 18874) ^[A-Z ]+$ (104, 250)


In [8]:
def read_lines(src):
    text_lines = open(src,'r', encoding= 'utf-8').readlines()
    LINES = pd.DataFrame({'line_str':text_lines})
    LINES.index.name = 'line_id'
    return LINES

def extract_TOC(LINES, TOC_range):
    TOC_lines = LINES.loc[TOC_range[0]:TOC_range[1]].copy()
    contents = []
    chapter_counts = defaultdict(int)
    for i in TOC_lines.line_str.str.strip('\n'):
        if i != '':
            if i != 'Prologue' and i != 'Epilogue':
                chapter_counts[i] += 1
                contents.append(f'{i} {chapter_counts[i]}')
            else:
                contents.append(i)
    return contents

def parse_tokens(LINES, clip_range, chap_regex, book_id):
    LINES = LINES.loc[clip_range[0]:clip_range[1]].copy()
    OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']
    
    LINES.line_str = LINES.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()
    chap_lines = LINES.line_str.str.match(chap_regex, case=False)

    LINES.loc[chap_lines, 'chap_num'] = [i+1 for i in range(LINES.loc[chap_lines].shape[0])]
    LINES.chap_num = LINES.chap_num.ffill()

    LINES = LINES.dropna(subset=['chap_num'])
    LINES = LINES.loc[~chap_lines]
    LINES.chap_num = LINES.chap_num.astype('int')

    CHAPS = LINES.groupby(OHCO[:1])\
        .line_str.apply(lambda x: '\n'.join(x))\
        .to_frame('chap_str')

    CHAPS['chap_str'] = CHAPS.chap_str.str.strip()

    para_pat = r'\n\n+'

    PARAS = CHAPS['chap_str'].str.split(para_pat, expand=True).stack()\
        .to_frame('para_str').sort_index()
    PARAS.index.names = OHCO[:2]

    PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True)
    PARAS['para_str'] = PARAS['para_str'].str.strip()
    PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')]

    SENTS = PARAS.para_str\
                    .apply(lambda x: pd.Series(nltk.sent_tokenize(x), dtype='string'))\
                    .stack()\
                    .to_frame('sent_str')

    SENTS.index.names = OHCO[:3]

    TOKENS = SENTS.sent_str\
                    .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))
    TOKENS = TOKENS.stack().to_frame('pos_tuple')
    TOKENS['pos'] = TOKENS.pos_tuple.apply(lambda x: x[1])
    TOKENS['token_str'] = TOKENS.pos_tuple.apply(lambda x: x[0])
    TOKENS['term_str'] = TOKENS.token_str.str.lower()

    TOKENS.index.names = OHCO[:4]
    
    TOKENS.reset_index(inplace=True)
    TOKENS['book_id'] = book_id
    OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
    TOKENS.set_index(OHCO, inplace=True)
    
    punc_pos = ['$', "''", '(', ')', ',', '--', '.', ':', '``']
    TOKENS['term_str'] = TOKENS[~TOKENS.pos.isin(punc_pos)].token_str\
                        .str.replace(r'[\W_]+', '', regex=True).str.lower()  
    return TOKENS
    
    
def parse_corpus(LIB):
    contents = []
    TOKENS = pd.DataFrame()
    for idx, row in LIB.iterrows():
        book_id, src, clip_range, chap_regex, TOC_range = idx, row.file, row.clip_range, row.chap_regex, row.TOC_range
        LINES = read_lines(src)
        ind_contents = extract_TOC(LINES, TOC_range)
        ind_TOKENS = parse_tokens(LINES, clip_range, chap_regex, book_id)
        
        contents.append(ind_contents)
        TOKENS = pd.concat([TOKENS, ind_TOKENS])
    
    return contents, TOKENS

In [9]:
import time

In [10]:
%%time

contents, TOKENS = parse_corpus(LIB)

CPU times: user 1min 44s, sys: 1.24 s, total: 1min 45s
Wall time: 1min 45s


In [11]:
TOKENS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,0,0,"(“, IN)",IN,“,
1,1,0,0,1,"(We, PRP)",PRP,We,we
1,1,0,0,2,"(should, MD)",MD,should,should
1,1,0,0,3,"(start, VB)",VB,start,start
1,1,0,0,4,"(back, RB)",RB,back,back


In [12]:
TOKENS.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5,70,140,0,3,"(hands, NNS)",NNS,hands,hands
5,70,140,0,4,"(,, ,)",",",",",
5,70,140,0,5,"(the, DT)",DT,the,the
5,70,140,0,6,"(daggers, NNS)",NNS,daggers,daggers
5,70,140,0,7,"(., .)",.,.,


In [13]:
print(contents)

[['Prologue', 'Bran 1', 'Catelyn 1', 'Daenerys 1', 'Eddard 1', 'Jon 1', 'Catelyn 2', 'Arya 1', 'Bran 2', 'Tyrion 1', 'Jon 2', 'Daenerys 2', 'Eddard 2', 'Tyrion 2', 'Catelyn 3', 'Sansa 1', 'Eddard 3', 'Bran 3', 'Catelyn 4', 'Jon 3', 'Eddard 4', 'Tyrion 3', 'Arya 2', 'Daenerys 3', 'Bran 4', 'Eddard 5', 'Jon 4', 'Eddard 6', 'Catelyn 5', 'Sansa 2', 'Eddard 7', 'Tyrion 4', 'Arya 3', 'Eddard 8', 'Catelyn 6', 'Eddard 9', 'Daenerys 4', 'Bran 5', 'Tyrion 5', 'Eddard 10', 'Catelyn 7', 'Jon 5', 'Tyrion 6', 'Eddard 11', 'Sansa 3', 'Eddard 12', 'Daenerys 5', 'Eddard 13', 'Jon 6', 'Eddard 14', 'Arya 4', 'Sansa 4', 'Jon 7', 'Bran 6', 'Daenerys 6', 'Catelyn 8', 'Tyrion 7', 'Sansa 5', 'Eddard 15', 'Catelyn 9', 'Jon 8', 'Daenerys 7', 'Tyrion 8', 'Catelyn 10', 'Daenerys 8', 'Arya 5', 'Bran 7', 'Sansa 6', 'Daenerys 9', 'Tyrion 9', 'Jon 9', 'Catelyn 11', 'Daenerys 10'], ['PROLOGUE 1', 'ARYA 1', 'SANSA 1', 'TYRION 1', 'BRAN 1', 'ARYA 2', 'JON 1', 'CATELYN 1', 'TYRION 2', 'ARYA 3', 'DAVOS 1', 'THEON 1', 'DAE

Need to manually mess with the AFFC and ADWD chapters - gonna be a bit of a pain

Session died. Tears. Not gonna reload everything right now.

#### Create Vocabulary from TOKENS table

In [17]:
CORPUS = TOKENS # forgot it was supposed to be called CORPUS
CORPUS = CORPUS[CORPUS.term_str != ''].copy() # get rid of random punctuation
CORPUS['pos_group'] = CORPUS.pos.str[:2] # also didn't realize this was needed too

VOCAB = CORPUS.term_str.value_counts().to_frame('n')
VOCAB.index.name = 'term_str'
VOCAB['p'] = VOCAB['n'] / VOCAB['n'].sum()
VOCAB['s'] = 1 / VOCAB['p']
VOCAB['i'] = np.log2(VOCAB['s']) 
VOCAB['max_pos'] = CORPUS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)
VOCAB['max_pos_group'] = CORPUS[['term_str','pos_group']].value_counts().unstack(fill_value=0).idxmax(1)

In [59]:
# add features
LIB['date'] = [1996, 1998, 2000, 2005, 2011]
LIB['chap_labels'] = contents

I understand that I'm short some metadata (I contemplated branching out into other book series, or a different type of ASOIAF books). But I like what I have going on here and am disinclined to change it.

In [60]:
# save these two as they likely won't be changed again

LIB.to_csv('output/LIB.csv', sep='|')
CORPUS.to_csv('output/CORPUS.csv', sep='|')

#### Getting into BOW, TFIDF, and DFIDF

No point keeping these separate. Need DFIDF in the VOCAB table, so may as well do it here.

Chapter is the unit of observation most interesting to me, so that's what we're gonna go with here. TF method is 'sum' and IDF method is 'standard'.

In [19]:
bag = ['book_id', 'chap_num']
BOW = CORPUS.groupby(bag+['term_str']).term_str.count().to_frame('n')
DTCM = BOW.n.unstack(fill_value=0)
N = DTCM.shape[0]
TF = (DTCM.T / DTCM.T.sum()).T
DF = DTCM.astype('bool').sum()
IDF = np.log2(N / DF)
TFIDF = TF * IDF

VOCAB['df'] = DF
VOCAB['dfidf'] = VOCAB.df * np.log2(len(TFIDF)/VOCAB.df)

In [22]:
VOCAB.sort_values(by='dfidf', ascending=False).head(20)

Unnamed: 0_level_0,n,p,s,i,max_pos,max_pos_group,df,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
leaves,244,0.000138,7233.221311,12.820423,NNS,NN,126,181.512147
leaving,167,9.5e-05,10568.299401,13.367456,VBG,VB,126,181.512147
maybe,291,0.000165,6064.969072,12.566285,RB,RB,126,181.512147
stupid,253,0.000143,6975.913043,12.768166,JJ,JJ,126,181.512147
peace,272,0.000154,6488.625,12.663697,NN,NN,126,181.512147
laughing,183,0.000104,9644.295082,13.23546,VBG,VB,126,181.512147
woke,199,0.000113,8868.874372,13.114535,VBD,VB,126,181.512147
fist,231,0.000131,7640.285714,12.899411,NN,NN,126,181.512147
doors,235,0.000133,7510.238298,12.874643,NNS,NN,126,181.512147
sorry,189,0.000107,9338.126984,13.188917,JJ,JJ,125,181.508529


In [23]:
# now for stemming and stopwords

sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [25]:
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['porter_stem'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)
VOCAB.head()

Unnamed: 0_level_0,n,p,s,i,max_pos,max_pos_group,df,dfidf,stop,porter_stem
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
the,101702,0.057625,17.3537,4.117171,DT,DT,342,0.0,1,the
and,51224,0.029024,34.45467,5.106628,CC,CC,342,0.0,1,and
to,39497,0.022379,44.684558,5.481704,TO,TO,342,0.0,1,to
a,39269,0.02225,44.944002,5.490057,DT,DT,342,0.0,1,a
of,35195,0.019942,50.146498,5.648077,IN,IN,342,0.0,1,of


In [55]:
VOCAB.to_csv('output/VOCAB.csv', sep='|')

In [29]:
# taking a look at the BOW, DCTM, and TFIDF matrices

BOW['tfidf'] = TFIDF.stack()

BOW.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,tfidf
book_id,chap_num,term_str,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,a,111,0.0
1,1,abandoned,2,0.001167
1,1,about,4,5.8e-05
1,1,above,1,0.00012
1,1,accustomed,1,0.000876
1,1,acquiescence,1,0.002186
1,1,across,1,2.6e-05
1,1,adjusted,1,0.001407
1,1,admitted,1,0.000464
1,1,aemon,2,0.001256


In [41]:
DTCM.head()

Unnamed: 0_level_0,term_str,10th,15th,16th,23rd,57th,61st,a,aaaaaaarrreeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee,aaaaaaoooooooooooooooooooooooo,aaaaahoooooooooooooooooooo,...,zekko,zenith,zhak,zharaq,zigged,zo,zollo,zorse,zorses,zzzs
book_id,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0,0,0,0,0,0,111,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,63,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,0,0,0,0,0,41,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,0,0,0,0,0,78,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,0,0,0,86,0,0,0,...,0,0,0,0,0,0,0,0,0,0


It may look like there's an issue with the source material, but there are actually just several different horn sounds in one particular chapter in *A Feast for Crows*.

In [45]:
TFIDF.head()

Unnamed: 0_level_0,term_str,10th,15th,16th,23rd,57th,61st,a,aaaaaaarrreeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee,aaaaaaoooooooooooooooooooooooo,aaaaahoooooooooooooooooooo,...,zekko,zenith,zhak,zharaq,zigged,zo,zollo,zorse,zorses,zzzs
book_id,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


So I am guessing that "reduced and normalized TFIDF_L2" means I have to reduce it to some specific feature size, probably by DFIDF.

I think I want to take out proper nouns (names) but probably keep verbs and adjectives? Is 5000 too many?

In [50]:
VSHORT = VOCAB[VOCAB.max_pos_group.isin(['NN', 'VB', 'JJ']) & ~VOCAB.max_pos.isin(['NNP'])].sort_values('dfidf', ascending=False).head(5000)

TFIDF_5000 = TFIDF[VSHORT.index]

In [47]:
from scipy.linalg import norm

In [51]:
# create TFIDF_L2

TFIDF_L2 = (TFIDF_5000.T / norm(TFIDF_5000, 2, axis=1)).T

a = len(TFIDF_L2)
TFIDF_L2 = TFIDF_L2.dropna()
b = len(TFIDF_L2)
bag_loss = a - b
bag_loss

0

In [52]:
TFIDF_L2

Unnamed: 0_level_0,term_str,woke,peace,leaves,stupid,laughing,doors,fist,leaving,recall,ate,...,cogs,washerwomen,shortsword,droll,reigned,wanton,dragonbone,flanks,fords,lair
book_id,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0.000000,0.000000,0.043317,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
1,2,0.000000,0.000000,0.000000,0.000000,0.033562,0.000000,0.000000,0.000000,0.016915,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
1,3,0.000000,0.000000,0.025442,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
1,4,0.015683,0.000000,0.031367,0.000000,0.000000,0.031367,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.102715,0.000000,0.0,0.000000
1,5,0.000000,0.019001,0.000000,0.000000,0.000000,0.000000,0.000000,0.019001,0.019153,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,66,0.000000,0.020986,0.000000,0.010493,0.000000,0.031479,0.000000,0.000000,0.010577,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
5,67,0.000000,0.000000,0.000000,0.000000,0.000000,0.100146,0.000000,0.000000,0.000000,0.010095,...,0.0,0.0,0.032795,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
5,68,0.000000,0.044976,0.000000,0.000000,0.014992,0.022488,0.029984,0.000000,0.007556,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.049094,0.0,0.049094
5,69,0.034008,0.008502,0.008502,0.008502,0.008502,0.000000,0.017004,0.000000,0.017140,0.025710,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.055683


In [54]:
BOW.to_csv('output/BOW.csv', sep='|')
DTCM.to_csv('output/DTM.csv', sep='|')
TFIDF.to_csv('output/TFIDF.csv', sep='|')
TFIDF_L2.to_csv('output/TFIDF_L2.csv', sep='|')