Christian Schroeder (dbn5eu@virginia.edu)

DS 5001: Exploratory Text Analytics

15 December 2021

# Build Tables

In [1]:
import pandas as pd
import numpy as np
import os, re, nltk



## Pipeline Functions

In [75]:
# index names for the data
OHCO = ['artist', 'song_id', 'verse_num', 'line_num', 'token_num']

# build the LIB and DOC tables
def buildLIBDOC(songs, OHCO=OHCO):
    """Build the library (LIB) and document (DOC) tables from a songs frame.

    Parameters
    ----------
    songs : pandas.DataFrame
        One row per song with 'artist', 'title' and 'lyrics' columns. The
        row label of each song is used as its ``song_id``.
    OHCO : list of str
        Index level names; the first three are used as the DOC index.

    Returns
    -------
    (LIB, DOC) : tuple of pandas.DataFrame
        LIB is indexed by 'artist' and has 'song_id' and 'title' columns.
        DOC has one row per verse, indexed by ``OHCO[:3]``, with the verse
        text in the 'verse' column. Both are empty (but correctly shaped)
        when ``songs`` has no rows.
    """
    lib_rows = []
    verse_rows = []

    for song_id, song in songs.iterrows():
        artist = song['artist']
        lib_rows.append([artist, song_id, song['title']])

        # verses are separated by a blank line; verse numbering starts at 1
        for verse_num, verse in enumerate(song['lyrics'].split('\n\n'), start=1):
            verse_rows.append([artist, song_id, verse_num, verse])

    # a single frame built from flat rows also covers the zero-song case,
    # where concatenating an empty list of per-song frames would raise
    DOC = pd.DataFrame(
        verse_rows, columns=['artist', 'song_id', 'verse_num', 'verse']
    ).set_index(OHCO[:3])
    LIB = pd.DataFrame(
        lib_rows, columns=['artist', 'song_id', 'title']
    ).set_index('artist')
    return LIB, DOC

# build the TOKEN table
def buildTOKEN(doc, OHCO=OHCO):
    """Explode the verse-level DOC table into one row per token.

    Each verse is split into lines on newlines, every character that is not
    an ASCII letter, digit or space is removed from each line, and the line
    is whitespace-tokenized and POS-tagged with nltk.

    Parameters
    ----------
    doc : pandas.DataFrame
        Verse-level frame with a 'verse' text column.
    OHCO : list of str
        Names assigned to the index levels of the result.

    Returns
    -------
    pandas.DataFrame
        One row per token with columns 'temp' (the (token, tag) pair as
        returned by the tagger), 'token' and 'pos'. The line and token index
        levels are the positional labels produced by ``stack``.
    """

    def split_into_lines(verse):
        return pd.Series(verse.split('\n'))

    def strip_non_alnum(text):
        return re.sub(r'[^A-Za-z0-9 ]+', '', text)

    def tag_tokens(text):
        words = nltk.WhitespaceTokenizer().tokenize(text)
        return pd.Series(nltk.pos_tag(words))

    # verses -> lines (one row per line after stacking the wide frame)
    lines = doc.verse.apply(split_into_lines).stack().to_frame()
    lines = lines.rename(columns={0: 'line'})
    lines['line'] = lines['line'].apply(strip_non_alnum)

    # lines -> tagged tokens (one row per token after stacking)
    tokens = lines.line.apply(tag_tokens).stack().to_frame()
    tokens = tokens.rename(columns={0: 'temp'})
    tokens['token'] = tokens['temp'].apply(lambda pair: pair[0])
    tokens['pos'] = tokens['temp'].apply(lambda pair: pair[1])

    tokens.index.names = OHCO

    return tokens

# build the VOCAB table
def buildVOCAB(token):
    """Build the vocabulary table from the TOKEN table.

    Parameters
    ----------
    token : pandas.DataFrame
        Token-level frame with a 'token' column of strings.

    Returns
    -------
    pandas.DataFrame
        One row per distinct token, sorted alphabetically, with the row
        position as the 'token_id' index and the columns:
        'token', 'count' (occurrences), 'num' (1 if the token starts with a
        digit), 'stop' (1 if the token is in nltk's English stop-word list)
        and 'p_stem' (Porter stem).
    """

    # count each token; naming the axis and the values explicitly keeps the
    # column names stable regardless of how value_counts labels its result
    df = (
        token['token']
        .value_counts()
        .rename_axis('token')
        .rename('count')
        .sort_index()      # alphabetic order
        .reset_index()
    )

    # assign alphabetic order as ID
    df.index.name = 'token_id'

    # identify numbers; str.match anchors at the start only, so any token
    # that begins with a digit is flagged
    df['num'] = df['token'].str.match(r"\d+").astype('int')

    # add stop word flag; tokens are compared as-is (no case folding)
    stopwords = set(nltk.corpus.stopwords.words('english'))
    df['stop'] = df['token'].isin(stopwords).astype('int')

    # add token stems
    stemmer = nltk.stem.porter.PorterStemmer()
    df['p_stem'] = df['token'].apply(stemmer.stem)

    return df

# update TOKEN and VOCAB with the token ids and selected part of speech
def updateTOKENVOCAB(token, vocab):
    """Cross-link the TOKEN and VOCAB tables (both are modified in place).

    Adds a 'token_id' column to ``token`` by looking each token string up in
    ``vocab``, then adds a 'pos_max' column to ``vocab`` holding the POS tag
    that occurs most often for each token id.

    Parameters
    ----------
    token : pandas.DataFrame
        Needs 'token' and 'pos' columns.
    vocab : pandas.DataFrame
        Indexed by 'token_id' with a 'token' column.

    Returns
    -------
    (token, vocab) : the same two objects that were passed in.
    """
    # lookup series: token string -> token id
    id_by_token = vocab.reset_index().set_index('token')['token_id']
    token['token_id'] = token['token'].map(id_by_token)

    # rows: token ids, columns: POS tags, cells: occurrence counts
    pos_counts = token.groupby(['token_id', 'pos'])['pos'].count().unstack()
    vocab['pos_max'] = pos_counts.idxmax(axis=1)

    return token, vocab

# calculate Zipf values and assign to the VOCAB table
def buildZIPF(vocab):
    """Add rank, probability, Zipf and entropy columns to the VOCAB table.

    Parameters
    ----------
    vocab : pandas.DataFrame
        Indexed by 'token_id' with a numeric 'count' column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified), ordered by descending
        count and indexed by 'token_id', with these added columns:
        'count_rank' (1..n position in that ordering),
        'alt_rank' (dense rank: tokens with equal counts share a rank),
        'p' (count divided by the number of rows in vocab),
        'p2' (count divided by the total count),
        'zipf_k', 'zipf_k2', 'zipf_k3' (count or p times a rank) and
        'h' (p2 * log2(1 / p2)).
    """

    # add count rank; a stable sort keeps tied tokens in their input order
    vocab = vocab.sort_values('count', ascending=False, kind='stable').reset_index()
    vocab['count_rank'] = np.arange(1, len(vocab) + 1)
    vocab = vocab.set_index('token_id')

    # add capped count rank: 1 for the highest count, 2 for the next
    # distinct count, and so on
    vocab['alt_rank'] = vocab['count'].rank(method='dense', ascending=False).astype('int')

    # count relative to the number of distinct tokens (rows of vocab)
    vocab['p'] = vocab['count'] / vocab.shape[0]

    # marginal probability of seeing the token
    vocab['p2'] = vocab['count'] / vocab['count'].sum()

    # calculate Zipf values
    vocab['zipf_k'] = vocab['count'] * vocab['count_rank']
    vocab['zipf_k2'] = vocab['count'] * vocab['alt_rank']
    vocab['zipf_k3'] = vocab['p'] * vocab['alt_rank']

    # assign the self entropy of each token
    vocab['h'] = vocab['p2'] * np.log2(1/vocab['p2'])

    return vocab

# build the TFIDF table and update VOCAB
def buildTFIDF(token, vocab, bag, count_type, term_freq_type, inverse_doc_type):
    """Compute a TF-IDF matrix for a chosen bag level and annotate VOCAB.

    Parameters
    ----------
    token : pandas.DataFrame
        Token table with a 'token_id' column; the names in ``bag`` must be
        resolvable as grouping keys (columns or index levels).
    vocab : pandas.DataFrame
        Indexed by 'token_id'. Modified in place: gains the columns 'h2',
        'DF', 'IDF' and 'TFIDF_sum'.
    bag : list of str
        Grouping keys that define a "document".
    count_type : str
        'n' for occurrence counts or 'c' for presence (0/1).
    term_freq_type : str
        One of 'sum', 'max', 'log', 'raw', 'double_norm', 'binary'.
    inverse_doc_type : str
        One of 'standard', 'max', 'smooth'.

    Returns
    -------
    (TFIDF, vocab)
        TFIDF has one row per bag and one column per token id.

    Raises
    ------
    ValueError
        If ``term_freq_type`` or ``inverse_doc_type`` is not a known method.
    """
    tf_methods = ('sum', 'max', 'log', 'raw', 'double_norm', 'binary')
    idf_methods = ('standard', 'max', 'smooth')

    # reject unknown methods up front; otherwise they fall through the
    # branches below and fail later with an unrelated unbound-name error
    if term_freq_type not in tf_methods:
        raise ValueError(
            "unknown term_freq_type %r; expected one of %s" % (term_freq_type, tf_methods)
        )
    if inverse_doc_type not in idf_methods:
        raise ValueError(
            "unknown inverse_doc_type %r; expected one of %s" % (inverse_doc_type, idf_methods)
        )

    # generate bag of words at set bag level
    BOW = token.groupby(bag + ['token_id'])['token_id'].count().to_frame().rename(columns={'token_id': 'n'})
    BOW['c'] = BOW.n.astype('bool').astype('int')

    # create document term count matrix (rows: bags, columns: token ids)
    DTCM = BOW[count_type].unstack().fillna(0).astype('int')

    # compute TF on the transposed matrix so that the per-document
    # normalisers broadcast across columns, then transpose back
    counts_T = DTCM.T
    if term_freq_type == 'sum':
        TF = counts_T / counts_T.sum()
    elif term_freq_type == 'max':
        TF = counts_T / counts_T.max()
    elif term_freq_type == 'log':
        TF = np.log10(1 + counts_T)
    elif term_freq_type == 'raw':
        TF = counts_T
    elif term_freq_type == 'double_norm':
        TF = counts_T / counts_T.max()
        # the boolean mask leaves NaN where a term is absent from a document
        TF = 0.5 + (1 - 0.5) * TF[TF > 0]
    else:  # 'binary'
        TF = counts_T.astype('bool').astype('int')
    TF = TF.T

    # compute DF (number of bags containing each token) and N (number of bags)
    DF = DTCM[DTCM > 0].count()
    N = DTCM.shape[0]

    # idf method selection
    if inverse_doc_type == 'standard':
        IDF = np.log10(N / DF)
    elif inverse_doc_type == 'max':
        IDF = np.log10(DF.max() / DF)
    else:  # 'smooth'
        IDF = np.log10((1 + N) / (1 + DF)) + 1

    # compute word context matrix entropy; zero cells yield NaN (0 * inf)
    # and are skipped by the column sum below
    WCM = DTCM / DTCM.sum()
    WCMh = WCM * np.log2(1/WCM)

    # compute TFIDF
    TFIDF = TF * IDF

    # assign values to the VOCAB df
    vocab['h2'] = WCMh.sum()
    vocab['DF'] = DF
    vocab['IDF'] = IDF
    vocab['TFIDF_sum'] = TFIDF.sum()

    return TFIDF, vocab

## Pipeline

In [76]:
# run all table building
def pipeline(create_new = True):
    """Build the corpus tables from scratch or reload them from disk.

    Parameters
    ----------
    create_new : bool
        If True, build every table from the module-level ``archive`` frame
        and export them as CSV files under 'data/tables' (the directory is
        created if missing). If False, read the previously exported CSVs.

    Returns
    -------
    (LIB, DOC, TOKEN, VOCAB, TFIDF) : tuple of pandas.DataFrame
        Returned in both modes so the caller can bind the tables; without a
        return value they would be lost when the function exits.
    """

    if create_new:

        # run the pipeline
        LIB, DOC = buildLIBDOC(archive)
        TOKEN = buildTOKEN(DOC)
        VOCAB = buildVOCAB(TOKEN)
        TOKEN, VOCAB = updateTOKENVOCAB(TOKEN, VOCAB)
        VOCAB = buildZIPF(VOCAB)
        TFIDF, VOCAB = buildTFIDF(TOKEN, VOCAB, OHCO[:1], 'n', 'sum', 'standard')

        # export; TOKEN is written as two files, split at the midpoint row
        os.makedirs('data/tables', exist_ok=True)
        half = len(TOKEN) // 2
        LIB.to_csv('data/tables/LIB.csv')
        DOC.to_csv('data/tables/DOC.csv')
        TOKEN[:half].to_csv('data/tables/TOKEN1.csv')
        TOKEN[half:].to_csv('data/tables/TOKEN2.csv')
        VOCAB.to_csv('data/tables/VOCAB.csv')
        TFIDF.to_csv('data/tables/TFIDF.csv')

    else:

        # import previously created tables, restoring the same indexes the
        # freshly built tables carry
        LIB = pd.read_csv('data/tables/LIB.csv').set_index('artist')
        DOC = pd.read_csv('data/tables/DOC.csv').set_index(OHCO[:3])
        TOKEN = pd.concat([
            pd.read_csv('data/tables/TOKEN1.csv').set_index(OHCO),
            pd.read_csv('data/tables/TOKEN2.csv').set_index(OHCO),
        ])
        VOCAB = pd.read_csv('data/tables/VOCAB.csv').set_index('token_id')
        # NOTE(review): after a CSV round trip the TFIDF column labels are
        # read back as strings rather than integer token ids - confirm
        # whether downstream code relies on integer labels
        TFIDF = pd.read_csv('data/tables/TFIDF.csv').set_index('artist')

    return LIB, DOC, TOKEN, VOCAB, TFIDF

In [None]:
# import song lyrics dataset; 'lyrics' is read with the pandas string dtype
# and the 'Unnamed: 0' column (presumably a saved row index - TODO confirm)
# is dropped
archive = pd.read_csv('data/songs_archive.csv', dtype={'lyrics':'string'}).drop('Unnamed: 0', axis=1)

# run the pipeline; create_new=False selects the branch that reloads the
# previously exported CSV tables instead of rebuilding them from `archive`
# NOTE(review): nothing is bound from this call here, yet later cells use
# DOC - confirm where that name is defined
pipeline(False)

In [44]:
# list the distinct artist names in DOC (the 'artist' index level becomes a
# column after reset_index), in order of first appearance
DOC.reset_index().artist.unique()

array(['Aesop Rock', 'Busdriver', 'Jedi Mind Tricks', 'GZA',
       'Wu-Tang Clan', 'MF DOOM', 'RZA', 'Immortal Technique', 'Canibus',
       'Ghostface Killah', 'Del The Funky Homosapien', 'The Roots',
       'Blackalicious', 'Jean Grae', 'Killah Priest', 'Kool Keith',
       'Kool G Rap', 'CunninLynguists', 'Sage Francis', 'Raekwon',
       'Watsky', 'Action Bronson', 'Redman', 'Das EFX', 'Common',
       'K.A.A.N.', 'E-40', 'Goodie Mob', 'Nas', 'Brother Ali',
       'Method Man', 'Flatbush Zombies', 'Joey Badass',
       'A Tribe Called Quest', 'Yasiin Bey', 'De La Soul', 'Xzibit',
       'Murs', 'Rittz', 'Atmosphere', 'Talib Kweli', 'Big Daddy Kane',
       'Lupe Fiasco', 'Cypress Hill', 'LL Cool J', 'Beastie Boys',
       'Fat Joe', 'K-Rino', 'Busta Rhymes', 'Gang Starr', 'Mac Dre',
       'Ludacris', 'KRS-One', 'OutKast', 'Brand Nubian', 'Ab-Soul',
       'Joe Budden', 'Twista', 'Eminem', 'Tyler, The Creator',
       'Denzel Curry', 'Biz Markie', 'AsAP Rocky', 'Tech N9ne',
      

In [60]:
# load the lyric award rankings; the merge further down joins on its
# 'artist' column
artists = pd.read_csv('data/lyric_award_rankings.csv')

In [49]:
# put the distinct DOC artist names next to the rankings for eyeballing.
# The assignment aligns on row labels (0, 1, 2, ...), not on artist name, so
# row i simply receives the i-th distinct DOC artist.
# NOTE(review): the cell numbers suggest this ran before the read_csv cell
# above was last executed - confirm the intended order
artists['artist2'] = pd.Series(DOC.reset_index().artist.unique())

In [67]:
# outer-join the rankings with the distinct DOC artist names so that names
# present on only one side survive as rows with missing values
test = artists.merge(
    pd.Series(DOC.reset_index().artist.unique(), name='artist').to_frame(),
    on='artist',
    how='outer',
)

In [74]:
# show the merged rows that have no 'noms' value; presumably these are DOC
# artists with no name match in the rankings file (assumes 'noms' is never
# missing in the rankings themselves - TODO confirm)
test.loc[test.noms.isna()]

Unnamed: 0,artist,lyrical_rank,recalc,wins,noms,win_rate
160,Yasiin Bey,,,,,
161,"Tyler, The Creator",,,,,
162,Cam’ron,,,,,
163,Lil’ Kim,,,,,
164,​cupcakKe,,,,,
165,Lil Peep,,,,,


Unnamed: 0,artist
0,Aesop Rock
1,Busdriver
2,Jedi Mind Tricks
3,GZA
4,Wu-Tang Clan
...,...
153,A Boogie wit da Hoodie
154,YoungBoy Never Broke Again
155,Rich The Kid
156,Lil Uzi Vert
