In [3]:
import pandas as pd
import numpy as np
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Use matplotlib's 'fivethirtyeight' style sheet for every plot in this notebook.
plt.style.use('fivethirtyeight')

# Render figures inline in the notebook output, at 'retina' (high-DPI) resolution.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [27]:
# Load the scraped lyrics for each coast. The cells below treat each dictionary as
# artist name -> list of song lyrics.
# Context managers guarantee the file handles are closed once parsing is done.
with open('lyrics_east_HUGE.json') as east_file:
    lyrics_east = json.load(east_file)

with open('lyrics_west_HUGE.json') as west_file:
    lyrics_west = json.load(west_file)

In [29]:
# Convert the two lyrics dictionaries into DataFrames (one per coast) with one
# column per artist and one row per song slot.

# Artists have different numbers of songs, so find the largest song count over
# every artist on both coasts; each artist's list is padded to this length so
# that both frames share the same number of rows.
max_len = max(
    len(songs)
    for coast_lyrics in (lyrics_east, lyrics_west)
    for songs in coast_lyrics.values()
)

# Build one NaN-padded row per artist, then transpose so artists become columns.
east_lyrics_frame = pd.DataFrame(
    [songs + [np.nan] * (max_len - len(songs)) for songs in lyrics_east.values()],
    index=list(lyrics_east),
).T

west_lyrics_frame = pd.DataFrame(
    [songs + [np.nan] * (max_len - len(songs)) for songs in lyrics_west.values()],
    index=list(lyrics_west),
).T

Create a new dataframe in which each row contains the rapper name, the lyrics of one song, and the relevant coast.

In [32]:
# Flatten the wide east-coast frame into long format: one [artist, lyrics, coast]
# record per song, skipping the NaN padding in each artist's column.
entries = [
    [artist_name, song_lyrics, 'east']
    for artist_name in east_lyrics_frame.columns
    for song_lyrics in east_lyrics_frame[artist_name].dropna()
]

east_frame = pd.DataFrame(entries, columns=['artist', 'lyrics', 'coast'])

In [33]:
# Flatten the wide west-coast frame into long format: one [artist, lyrics, coast]
# record per song, skipping the NaN padding in each artist's column.
entries = [
    [artist_name, song_lyrics, 'west']
    for artist_name in west_lyrics_frame.columns
    for song_lyrics in west_lyrics_frame[artist_name].dropna()
]

west_frame = pd.DataFrame(entries, columns=['artist', 'lyrics', 'coast'])

In [34]:
# Stack the east and west frames on top of each other, then replace the
# duplicated per-coast row labels with a fresh 0..n-1 index.
stacked_frames = [east_frame, west_frame]
whole_frame = pd.concat(stacked_frames, axis=0, join='outer').reset_index(drop=True)

In [35]:
# Preview the last rows of the combined frame to sanity-check the concatenation.
whole_frame.tail()

Unnamed: 0,artist,lyrics,coast
3017,Hieroglyphics,"Yes, yes, yes; yesHa, ha, ha, ha, ha, ha, ha\n...",west
3018,Hieroglyphics,"Pep Love\nHieroglyphics y'all, what?\nThe mic'...",west
3019,Hieroglyphics,"Yeah, yeah\nWhat's goin downS.O.M., where we a...",west
3020,Hieroglyphics,"Del meister, 'bout to heist ya, hijack ya, com...",west
3021,Hieroglyphics,Drop some new shit on 'em them\nLet's drop som...,west


**Apply the basic clean function**

In [6]:
def basic_clean(lyric):
    """Return a normalised copy of a song lyric for text analysis.

    The steps are applied in this order:
      1. remove every ASCII punctuation character (``string.punctuation``),
      2. remove numbers and any word that contains a digit,
      3. replace newline characters with spaces and lower-case the text.

    Parameters
    ----------
    lyric : str
        Raw lyric text. Non-string values (e.g. NaN or None) are returned
        unchanged, so the function can be applied to a column that still
        contains missing values.

    Returns
    -------
    str
        The cleaned lyric, or the original object when ``lyric`` is not a string.
    """
    # Only strings can be cleaned. Pass anything else through untouched rather
    # than hiding every possible error behind a bare ``except``.
    if not isinstance(lyric, str):
        return lyric

    # re.escape is required here: left unescaped, the backslash in
    # string.punctuation escapes the following ']' inside the character class,
    # so backslashes themselves would never be removed.
    lyric = re.sub('[%s]' % re.escape(string.punctuation), '', lyric)
    lyric = re.sub(r'\w*\d\w*', '', lyric)  # numbers and words containing digits
    lyric = re.sub(r'\n', ' ', lyric).lower()  # newlines -> spaces, then lower-case

    return lyric

In [36]:
# Build a cleaned copy of the data: same rows and columns as whole_frame, with the
# lyrics column replaced by its basic_clean-ed version. whole_frame is left untouched.
cleaned_lyrics = whole_frame['lyrics'].apply(basic_clean)
clean_frame = whole_frame.assign(lyrics=cleaned_lyrics)

In [37]:
# Collect the index labels of the entries whose cleaned lyrics are shorter than
# 400 characters. These entries will be dropped, since many of them are undesirable.
short_index = clean_frame.loc[clean_frame['lyrics'].map(len) < 400].index

In [41]:
# Inspect the lyrics of the short (< 400 character) entries before dropping them.
clean_frame[clean_frame.lyrics.apply(lambda x: len(x)) < 400].lyrics.values

array(['lyrics will come up in a bit',
       'a gun is made in america every  seconds  of families with children keep loaded guns in the home in the united states more than ten children are killed by handguns every day guns are weapons dont destroy our childrens lifes',
       ' do you want to get high man i see em does pinochio have wooden balls man well yo i got a joint ive been saving here for a special occasion ahh niggas bitches welcome a full tank of gas a pound of weed a bird called pinky to the east driver to the east funk doctor and the phino by the way this is protected by the red the track and tical with the key oh shit where the keys at',
       'ladies and gentlemen i am your host clark dick we are live from times square in new york city and we are approaching the final seconds of  ladies and gentlemen the millennium is on its way  happy new explosion',
       'in order to keep the broad youre gonna have to not do it to her body you got to do it to her mind man',
       '

In [38]:
# Index labels of the rows to remove: the short entries collected in short_index.
to_drop = short_index

In [23]:
# Remove every short entry in one vectorised call; dropping a single label per
# loop iteration rebuilds the frame once per row and is needlessly slow.
clean_frame = clean_frame.drop(index=to_drop)

In [25]:
# Row count now vs. row count once duplicate lyrics are removed, to see how many repeats exist.
len(clean_frame), len(clean_frame.drop_duplicates('lyrics'))

(2904, 2694)

In [178]:
# Remove repeated song entries: rows with identical cleaned lyrics are collapsed
# to their first occurrence.
clean_frame = clean_frame.drop_duplicates(subset='lyrics')

In [179]:
# Number of remaining songs per coast, as (east, west), to check the class balance.
len(clean_frame[clean_frame.coast == 'east']), len(clean_frame[clean_frame.coast == 'west'])

(1512, 1179)

In [183]:
# Keep only the rows of whole_frame whose index labels are still present in clean_frame,
# i.e. apply the same short-entry and duplicate filtering to the uncleaned lyrics.
whole_frame = whole_frame.loc[clean_frame.index]

In [187]:
# Renumber the surviving rows 0..n-1, discarding the old (now gappy) index labels.
whole_frame = whole_frame.reset_index(drop=True)

In [7]:
# Run basic_clean over every lyric and store the result back in the lyrics column.
whole_frame['lyrics'] = whole_frame['lyrics'].map(basic_clean)

In [8]:
# Summarise the frame: number of rows, column dtypes and non-null counts.
whole_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2690 entries, 0 to 2689
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  2690 non-null   object
 1   lyrics  2690 non-null   object
 2   coast   2690 non-null   object
dtypes: object(3)
memory usage: 84.1+ KB


In [9]:
# List the artists that have fewer than 10 songs left after cleaning.
# The per-artist counts are computed once and then filtered with a callable.
whole_frame.groupby('artist').count().loc[lambda counts: counts.lyrics < 10]

Unnamed: 0_level_0,lyrics,coast
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
Bush Babees,7,7
People Under The Stairs,6,6
Westside Connection,9,9


In [11]:
# Look up the lyrics stored for 'Grandmaster Caz'; this artist's entry is to be dropped.
whole_frame[whole_frame.artist == 'Grandmaster Caz'].lyrics

Series([], Name: lyrics, dtype: object)

In [None]:
# Index labels of every row belonging to 'Grandmaster Caz'.
to_drop = whole_frame.loc[whole_frame['artist'] == 'Grandmaster Caz'].index

In [None]:
# Remove the selected rows and renumber the remaining ones 0..n-1.
whole_frame = whole_frame.drop(index=to_drop).reset_index(drop=True)

In [12]:
# Persist the frame to disk. With to_csv's default settings the row index is
# written as an extra, unnamed leading column.
whole_frame.to_csv('whole_frame.csv')
# NOTE(review): the saved lyrics have already been passed through basic_clean above, so
# presumably it does not need to be re-applied once this CSV is reloaded — TODO confirm.

**^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^**

**^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^**