In [1]:
import pandas as pd
import numpy as np
from glob import glob
import os
import re
import nltk

In [2]:
OHCO = ['artist', 'title', 'verse_num', 'line_num', 'token_num']
songs = [song for song in sorted(glob('data/*.txt'))]

## Pipeline Functions

In [279]:
def buildLIBDOC(songs, OHCO=OHCO):
    lib = []
    doc = []
    
    for song in songs:
        
        # get artist and title
        artist, title = song[5:-4].split('_')
        
        # import song  lyrics
        with open(song, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            df = pd.DataFrame(lines, columns=['line'])

            # assign verse numbers
            verse_stop = df.line.str.match('\n| ')
            verse_num = [i+1 for i in range(df.loc[verse_stop].shape[0])]
            df.loc[verse_stop, 'verse_num'] = verse_num
            try:
                df.verse_num = df.verse_num.bfill().fillna(verse_num[-1:][0]+1)
            except:
                df.verse_num = 1
            df = df.replace('\n','')
            df = df.loc[-(df.line == '')]
            df['line'] = df['line'].str.strip()

            df['line'] = df['line'].apply(lambda x: re.sub(r'[^A-Za-z0-9 ]+', '', x))

            # group together, assign new index
            df = df.groupby(OHCO[2:3]).line.apply(lambda x: '\n'.join(x)).to_frame()
            df['title'] = title  
            df['artist'] = artist  
            df = df.reset_index().set_index(OHCO[:3]).rename(columns={'line':'verse'})

            lib.append((artist, title, song))
            doc.append(df)
        
    DOC = pd.concat(doc)
    LIB = pd.DataFrame(lib, columns=['artist', 'title', 'song_file']).set_index('artist')
    return LIB, DOC

def buildTOKEN(doc, OHCO=OHCO):
    
    # Convert verses to lines
    df = doc.verse.apply(lambda x: pd.Series(x.split('\n'))).stack().to_frame()
    df = df.rename(columns={0:'line'})
    
    # Convert sentences to tokens
    df = df.line.apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)))).stack().to_frame()
    df = df.rename(columns={0:'temp'})
    df['token'] = df['temp'].apply(lambda x: x[0])
    df['pos'] = df['temp'].apply(lambda x: x[1])
    
    df.index.names = OHCO
    
    return df

def buildVOCAB(token):
    
    # get count of each token
    df = token['token'].value_counts().to_frame().rename(columns={'index':'token', 'token':'count'})
    
    # sort tokens by alphabetic order
    df = df.sort_index().reset_index().rename(columns={'index':'token'})
    
    # assign alphabetic order as ID
    df.index.name = 'token_id'
    
    # identify numbers
    df['num'] = df['token'].str.match("\d+").astype('int')
    
    # add stop word flag
    stopwords = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['token']) \
        .set_index('token').rename(columns={'index':'temp'})
    stopwords['temp'] = 1
    df['stop'] = df['token'].map(stopwords['temp']).fillna(0).astype('int')

    # add token stems
    stemmer = nltk.stem.porter.PorterStemmer()
    df['p_stem'] = df['token'].apply(stemmer.stem)
    
    return df

def updateTOKENVOCAB(token, vocab):
    
    # add token id to TOKEN
    token['token_id'] = token['token'].map(vocab.reset_index().set_index('token')['token_id'])
    
    # add most frequent POS to VOCAB
    vocab['pos_max'] = token.groupby(['token_id', 'pos'])['pos'].count().unstack().idxmax(1)
    return token, vocab    

def buildZIPF(vocab):
    
    # add count rank
    vocab = vocab.sort_values('count', ascending=False).reset_index()
    vocab['count_rank'] = vocab.reset_index().index + 1
    vocab = vocab.set_index('token_id')
    
    # add capped count rank
    alt_rank = vocab['count'].value_counts().sort_index(ascending=False) \
                .reset_index().reset_index() \
                .rename(columns={'level_0':'alt_rank', 'index':'count', 'count':'nn'}) \
                .set_index('count')
    vocab['alt_rank'] = vocab['count'].map(alt_rank['alt_rank']) + 1
    
    # straight probability of seeing the token
    vocab['p'] = vocab['count'] / vocab.shape[0]
    
    # marginal probability of seeing the token
    vocab['p2'] = vocab['count'] / vocab['count'].sum()

    # calculate Zipf values
    vocab['zipf_k'] = vocab['count'] * vocab['count_rank']
    vocab['zipf_k2'] = vocab['count'] * vocab['alt_rank']
    vocab['zipf_k3'] = vocab['p'] * vocab['alt_rank']
    
    # assign the self entropy of each token
    vocab['h'] = vocab['p2'] * np.log2(1/vocab['p2'])
    
    return vocab

def buildTFIDF(token, vocab, bag, count_type, term_freq_type, inverse_doc_type):
    bag = bag
    count_method = count_type
    tf_method = term_freq_type
    idf_method = inverse_doc_type

    # generate bag of words at set bag level
    BOW = token.groupby(bag+['token_id'])['token_id'].count().to_frame().rename(columns={'token_id':'n'})
    BOW['c'] = BOW.n.astype('bool').astype('int')
    
    # create document term count matrix
    DTCM = BOW[count_method].unstack().fillna(0).astype('int')
    
    # compute TF and transpose
    if tf_method == 'sum': TF = DTCM.T / DTCM.T.sum()
    elif tf_method == 'max': TF = DTCM.T / DTCM.T.max()
    elif tf_method == 'log': TF = np.log10(1 + DTCM.T)
    elif tf_method == 'raw': TF = DTCM.T
    elif tf_method == 'double_norm':
        TF = DTCM.T / DTCM.T.max()
        TF = 0.5 + (1 - 0.5) * TF[TF > 0]
    elif tf_method == 'binary':
        TF = DTCM.T.astype('bool').astype('int')
    TF = TF.T
        
    # compute DF and IDF
    DF = DTCM[DTCM > 0].count()
    N = DTCM.shape[0]
    
    # idf method selection
    if idf_method == 'standard': IDF = np.log10(N / DF)
    elif idf_method == 'max': IDF = np.log10(DF.max() / DF) 
    elif idf_method == 'smooth': IDF = np.log10((1 + N) / (1 + DF)) + 1
    
    # compute word context matrix entropy
    WCM = DTCM / DTCM.sum()
    WCMh = WCM * np.log2(1/WCM)
    
    # compute TFIDF
    TFIDF = TF * IDF
    
    # assign values to the VOCAB df
    vocab['h2'] = WCMh.sum()
    vocab['DF'] = DF
    vocab['IDF'] = IDF
    vocab['TFIDF_sum'] = TFIDF.sum()

    return TFIDF, vocab

## Pipeline

In [289]:
%%time
LIB, DOC = buildLIBDOC(songs)
TOKEN = buildTOKEN(DOC)
VOCAB = buildVOCAB(TOKEN)
TOKEN, VOCAB = updateTOKENVOCAB(TOKEN, VOCAB)
VOCAB = buildZIPF(VOCAB)
TFIDF, VOCAB = buildTFIDF(TOKEN, VOCAB, OHCO[:2], 'n', 'sum', 'standard')

  df = df.line.apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)))).stack().to_frame()


Wall time: 3min 16s
