In [6]:
import pandas as pd
import numpy as np
from glob import glob
import os
import re
import nltk

In [352]:
# Index level names shared by the corpus tables, listed from outermost to innermost.
OHCO = ['artist', 'title', 'verse_num', 'line_num', 'token_num']

# Paths of the lyric files to load, in sorted order.
songs = sorted(glob('data/*.txt'))

## Create LIB and DOC

In [391]:
def buildLIBDOC(songs, OHCO=OHCO):
    """Build the LIB (song catalogue) and DOC (verse text) tables.

    Parameters
    ----------
    songs : iterable of str
        Paths to UTF-8 lyric files whose base name is ``<artist>_<title>``
        followed by an extension. Only the first underscore separates artist
        from title, so titles may themselves contain underscores.
    OHCO : list of str
        Index level names; the first three are used as the DOC index and the
        third is the verse-number grouping key.

    Returns
    -------
    LIB : pandas.DataFrame
        One row per song, indexed by ``artist``, with columns ``title`` and
        ``song_file``.
    DOC : pandas.DataFrame
        One row per verse, indexed by ``OHCO[:3]``, with a single column
        ``verse`` holding that verse's cleaned lines joined by newline
        characters.

    Raises
    ------
    ValueError
        If a file's base name contains no underscore, or if ``songs`` is empty
        (``pd.concat`` has nothing to concatenate).
    """
    lib = []
    doc = []

    for song in songs:

        # Derive artist and title from the file name itself rather than from
        # fixed character offsets, so the directory prefix does not matter.
        stem = os.path.splitext(os.path.basename(song))[0]
        artist, title = stem.split('_', 1)

        # Read the lyrics; the context manager closes the file handle.
        with open(song, 'r', encoding='utf-8') as lyrics:
            df = pd.DataFrame(lyrics.readlines(), columns=['line'])

        # A verse break is any line that starts with a newline (i.e. a blank
        # line) or with a space. Every line belongs to the verse closed by the
        # next break at or after it; lines after the last break form one more
        # verse. That is "number of breaks strictly before this line, plus 1".
        # Kept as float to match the dtype the NaN-based numbering produced.
        is_break = df['line'].str.match('\n| ').astype(int)
        df['verse_num'] = (is_break.cumsum() - is_break + 1).astype(float)

        # Remove punctuation but keep whitespace so words stay separable for
        # tokenisation, then trim and discard lines that end up empty.
        df['line'] = (
            df['line']
            .str.replace(r'[^A-Za-z0-9\s]+', '', regex=True)
            .str.strip()
        )
        df = df.loc[df['line'] != '']

        # Collapse each verse's lines into one newline-joined string and
        # attach the song identifiers as index levels.
        df = df.groupby(OHCO[2:3])['line'].agg('\n'.join).to_frame()
        df['title'] = title
        df['artist'] = artist
        df = df.reset_index().set_index(OHCO[:3]).rename(columns={'line': 'verse'})

        lib.append((artist, title, song))
        doc.append(df)

    DOC = pd.concat(doc)
    LIB = pd.DataFrame(lib, columns=['artist', 'title', 'song_file']).set_index('artist')
    return LIB, DOC

In [None]:
# Build the song catalogue (LIB) and the per-verse text table (DOC) from the lyric files.
LIB, DOC = buildLIBDOC(songs)

## Create TOKEN

In [368]:
# Split each verse on newlines so every line gets its own row under the verse's index.
df = (
    DOC['verse']
    .apply(lambda verse: pd.Series(verse.split('\n')))
    .stack()
    .to_frame()
    .rename(columns={0: 'line'})
)

In [370]:
# Tokenise and POS-tag each line; stacking yields one (token, tag) tuple per row.
df = (
    df['line']
    .apply(lambda text: pd.Series(nltk.pos_tag(nltk.word_tokenize(text))))
    .stack()
    .to_frame()
)

  df = df.line.apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x)))).stack().to_frame()


In [376]:
# Name the single unnamed column (0) left by to_frame(); it holds the (token, tag) tuples.
df = df.rename(columns={0:'temp'})

In [382]:
# Inspect the (token, tag) tuples; appending `.str[0]` should select just the token text.
df.temp

artist    title         verse_num        
50 Cent   21 Questions  1.0        0   0      (New, NNP)
                                       1     (York, NNP)
                                       2     (City, NNP)
                                   1   0      (You, PRP)
                                       1      (are, VBP)
                                                ...     
Yung Joc  ’Bout It      7.0        23  8     ('bout, IN)
                                       9       (it, PRP)
                                       10         ((, ()
                                       11    (Woah, NNP)
                                       12         (), ))
Name: temp, Length: 2005906, dtype: object

In [None]:
def buildTOKEN(doc, OHCO=OHCO):
    """Explode a verse table into one row per POS-tagged token.

    Parameters
    ----------
    doc : pandas.DataFrame
        Table with a ``verse`` column whose values hold a verse's lines
        separated by newline characters.
    OHCO : list of str
        Names assigned to the levels of the resulting index. Its length must
        equal the number of index levels of ``doc`` plus two (one level is
        added for the line position and one for the token position).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``OHCO`` with columns ``pos`` (the tag) and ``token_str``
        (the token text).

    Raises
    ------
    ValueError
        If ``len(OHCO)`` does not match the number of index levels produced.
    """
    # Convert verses to lines: one row per line, adding a line-position level.
    lines = (
        doc.verse
        .apply(lambda verse: pd.Series(verse.split('\n'), dtype=object))
        .stack()
        .dropna()  # discard the NaN padding of shorter verses if stack() kept it
    )

    # Convert lines to (token, tag) tuples, adding a token-position level.
    # The explicit dtype keeps lines with no tokens from yielding a float Series.
    tagged = (
        lines
        .apply(lambda line: pd.Series(nltk.pos_tag(nltk.word_tokenize(line)), dtype=object))
        .stack()
        .dropna()  # discard the NaN padding of shorter lines if stack() kept it
    )

    # Unpack each tuple into its own column; the tuple itself is not returned.
    token = pd.DataFrame({
        'pos': tagged.map(lambda pair: pair[1]),
        'token_str': tagged.map(lambda pair: pair[0]),
    })

    token.index.names = OHCO
    return token

In [None]:
def tokenize(doc_df, OHCO=OHCO, remove_pos_tuple=False, ws=False, text_col='para_str'):
    """Split a text column into sentences and POS-tagged tokens.

    Parameters
    ----------
    doc_df : pandas.DataFrame
        Table holding one text string per row in ``text_col``.
    OHCO : list of str
        Names assigned to the levels of the resulting index. Its length must
        equal the number of index levels of ``doc_df`` plus two (one level is
        added for the sentence position and one for the token position).
    remove_pos_tuple : bool
        If true, drop the raw ``pos_tuple`` column from the result.
    ws : bool
        If true, split sentences on whitespace with
        ``nltk.WhitespaceTokenizer``; otherwise use ``nltk.word_tokenize``.
    text_col : str
        Name of the column of ``doc_df`` that holds the text. Defaults to
        ``'para_str'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``OHCO`` with columns ``pos_tuple`` (unless removed),
        ``pos`` and ``token_str``.

    Raises
    ------
    ValueError
        If ``len(OHCO)`` does not match the number of index levels produced.
    """
    # Paragraphs to Sentences
    df = (
        doc_df[text_col]
        .apply(lambda x: pd.Series(nltk.sent_tokenize(x), dtype=object))
        .stack()
        .dropna()  # discard the NaN padding of shorter rows if stack() kept it
        .to_frame('sent_str')
    )

    # Sentences to Tokens
    # Pick the word splitter once instead of rebuilding it for every sentence.
    split_words = nltk.WhitespaceTokenizer().tokenize if ws else nltk.word_tokenize

    def word_tokenize(x):
        # Explicit dtype keeps token-less sentences from yielding a float Series.
        return pd.Series(nltk.pos_tag(split_words(x)), dtype=object)

    df = (
        df.sent_str
        .apply(word_tokenize)
        .stack()
        .dropna()  # discard the NaN padding of shorter sentences if stack() kept it
        .to_frame('pos_tuple')
    )

    # Grab info from tuple
    df['pos'] = df.pos_tuple.apply(lambda x: x[1])
    df['token_str'] = df.pos_tuple.apply(lambda x: x[0])
    if remove_pos_tuple:
        # Keyword form: recent pandas no longer accepts `axis` positionally.
        df = df.drop(columns='pos_tuple')

    # Add index
    df.index.names = OHCO

    return df