In [1]:
import csv
import numpy as np 
import pandas as pd 

## Genius Lyrics Cleaning 

In [2]:
metal = pd.read_csv("genius_data/metal.csv")
rap = pd.read_csv("genius_data/rappers.csv")
rock = pd.read_csv("genius_data/rockers.csv")
jazz = pd.read_csv("genius_data/jazz.csv")
folk = pd.read_csv("genius_data/folk.csv")
pop = pd.read_csv("genius_data/pop.csv")
rb = pd.read_csv("genius_data/rb.csv")
soul = pd.read_csv("genius_data/soul.csv")

In [3]:
frames = [metal, rap, rock, jazz, folk, pop, rb, soul]
genius_df = pd.concat(frames)
genius_df.to_csv("raw-genius-data.csv")
del genius_df['Unnamed: 0']
genius_df.head(5)

Unnamed: 0,artist,genre,title,lyrics
0,Brad Paisley,rap,Whiskey Lullaby,She put him out\nLike the burning end of a mid...
1,Brad Paisley,rap,Accidental Racist,To the man that waited on me at the Starbucks ...
2,Brad Paisley,rap,Last Time for Everything,Using a fake ID at a college bar\nGetting caug...
3,Brad Paisley,rap,Perfect Storm,If she was a drink\nShe'd be a single-barrelle...
4,Brad Paisley,rap,She’s Everything,She's a yellow pair of running shoes\nA holey ...


In [4]:
def clean_genius(genius_df): 
    #take out observations that have NaN for ‘genre’ or ‘lyrics’
    subset = genius_df[['genre','lyrics']]
    genius_df.dropna(subset=['genre', 'lyrics'], inplace=True)
    
    #replace new line with space
    genius_df = genius_df.replace({'\n': ' '}, regex=True)
    
    #get word count
    genius_df['word_num'] = genius_df['lyrics'].str.split().str.len()
    
    #observe that there are lots of songs w/ 1 word
    genius_df.sort_values(by = "word_num").head(100)
    
    #remove entries where only 1 word in song
    genius_df['word_num'].astype('int32')
    genius_df = genius_df[genius_df.word_num != 1]
    
    #anything with rap genius comments is way to long and includes comments
    genius_df = genius_df[~genius_df['lyrics'].str.contains("RAP GENIUS")]
    
    #notice that 124/130 of the songs with "Lyrics are just some string saying how there are no lyrics"
    genius_df = genius_df[genius_df['word_num'] !=18]
    
    #anything less than 10 seems to be junk lyrics
    genius_df = genius_df[genius_df['word_num'] > 10]



In [5]:
clean_genius(genius_df)
genius_df.head(5)

Unnamed: 0,artist,genre,title,lyrics
0,Brad Paisley,rap,Whiskey Lullaby,She put him out\nLike the burning end of a mid...
1,Brad Paisley,rap,Accidental Racist,To the man that waited on me at the Starbucks ...
2,Brad Paisley,rap,Last Time for Everything,Using a fake ID at a college bar\nGetting caug...
3,Brad Paisley,rap,Perfect Storm,If she was a drink\nShe'd be a single-barrelle...
4,Brad Paisley,rap,She’s Everything,She's a yellow pair of running shoes\nA holey ...


## Metrolyrics Cleaning

In [6]:
metro = pd.read_csv("lyrics-metrolyrics.csv")

In [7]:
def clean_metro(lyrics):
    #replace new line with space
    lyrics = lyrics.replace({'\n': ' '}, regex=True)
    
    #get word count
    lyrics['word_num'] = lyrics['lyrics'].str.split().str.len()

    #Take out index
    lyrics = lyrics.drop(columns = ['index'])

    #See what values are present for years
    column_values = lyrics[["year"]].values.ravel()
    unique_values =  pd.unique(column_values)
                                    
    #remove the dates that do not make any sense
    bad_year = ['702', '112', '67']
    lyrics = lyrics[~lyrics['year'].isin(bad_year)]

    #See what values are present for genre
    column_values = lyrics[["genre"]].values.ravel()
    unique_values =  pd.unique(column_values)

    #get the counts of each 
    index = pd.Index(lyrics['genre'])
    index.value_counts()
                                    
    #Remove all instrumental songs
    lyrics = lyrics[~lyrics.lyrics.str.contains("instrumental", na=False)]
    lyrics = lyrics[~lyrics.lyrics.str.contains("INSTRUMENTAL", na=False)]
    lyrics = lyrics[~lyrics.lyrics.str.contains("[Instrumental]", na=False)]

In [8]:
clean_metro(metro)
del metro['index']
del metro['year']
metro.head(5)

Unnamed: 0,song,artist,genre,lyrics
0,ego-remix,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,then-tell-me,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,honesty,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,you-are-my-rock,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,black-culture,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [9]:
metro.rename(columns = {"song": "title"}, inplace=True)
# reordering columns 
metro = metro[['artist', 'genre', 'title', 'lyrics']]
metro['title'] = metro['title'].str.replace('-',' ')
metro.head(5)

Unnamed: 0,artist,genre,title,lyrics
0,beyonce-knowles,Pop,ego remix,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,beyonce-knowles,Pop,then tell me,"playin' everything so easy,\nit's like you see..."
2,beyonce-knowles,Pop,honesty,If you search\nFor tenderness\nIt isn't hard t...
3,beyonce-knowles,Pop,you are my rock,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,beyonce-knowles,Pop,black culture,"Party the people, the people the party it's po..."


# Combining datasets into BIG

In [10]:
frames_1 = [metro, genius_df]
master_df = pd.concat(frames_1)

In [11]:
master_df.shape
master_df.to_csv("master-data.csv")

In [12]:
master_df = master_df.replace(np.nan, 'null', regex=True)

In [13]:
master_df['t-lyric'] = master_df['title'] + " @@@ " + master_df['lyrics']
master_df['t-lyric'] = master_df['t-lyric'].str.lower()

master_df.head()

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,beyonce-knowles,Pop,ego remix,"Oh baby, how you doing?\nYou know I'm gonna cu...","ego remix @@@ oh baby, how you doing?\nyou kno..."
1,beyonce-knowles,Pop,then tell me,"playin' everything so easy,\nit's like you see...","then tell me @@@ playin' everything so easy,\n..."
2,beyonce-knowles,Pop,honesty,If you search\nFor tenderness\nIt isn't hard t...,honesty @@@ if you search\nfor tenderness\nit ...
3,beyonce-knowles,Pop,you are my rock,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote...","you are my rock @@@ oh oh oh i, oh oh oh i\n[v..."
4,beyonce-knowles,Pop,black culture,"Party the people, the people the party it's po...","black culture @@@ party the people, the people..."


In [14]:
stopChars = [',','(',')','.','-','[',']','"']
# preprocessing the corpus by converting all letters to lowercase, 
# replacing blank lines with blank string and removing special characters
def preprocessText(text):
#     text = text.replace('\n', ' ').replace('\t','')
    processedText = text.lower()
    for char in stopChars:
        processedText = processedText.replace(char,'')
    return processedText

# tokenization 
def corpusToList(corpus):
    corpusList = [w for w in corpus.split(' ')] 
    corpusList = [i for i in corpusList if i] #removing empty strings from list
    return corpusList

In [15]:
# Preprocess 
master_df['lyrics'] = master_df['lyrics'].apply(preprocessText)
master_df['t-lyric'] = master_df['t-lyric'].apply(preprocessText)
master_df.head(5)

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,beyonce-knowles,Pop,ego remix,oh baby how you doing?\nyou know i'm gonna cut...,ego remix @@@ oh baby how you doing?\nyou know...
1,beyonce-knowles,Pop,then tell me,playin' everything so easy\nit's like you seem...,then tell me @@@ playin' everything so easy\ni...
2,beyonce-knowles,Pop,honesty,if you search\nfor tenderness\nit isn't hard t...,honesty @@@ if you search\nfor tenderness\nit ...
3,beyonce-knowles,Pop,you are my rock,oh oh oh i oh oh oh i\nverse 1:\nif i wrote a ...,you are my rock @@@ oh oh oh i oh oh oh i\nver...
4,beyonce-knowles,Pop,black culture,party the people the people the party it's pop...,black culture @@@ party the people the people ...


In [None]:
# todo(dlee): getting rid of lyrics that have foreign language characters in them 

In [16]:
master_df['t-lyric'] = master_df['t-lyric'].apply(corpusToList)
master_df.head(5)

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,beyonce-knowles,Pop,ego remix,oh baby how you doing?\nyou know i'm gonna cut...,"[ego, remix, @@@, oh, baby, how, you, doing?\n..."
1,beyonce-knowles,Pop,then tell me,playin' everything so easy\nit's like you seem...,"[then, tell, me, @@@, playin', everything, so,..."
2,beyonce-knowles,Pop,honesty,if you search\nfor tenderness\nit isn't hard t...,"[honesty, @@@, if, you, search\nfor, tendernes..."
3,beyonce-knowles,Pop,you are my rock,oh oh oh i oh oh oh i\nverse 1:\nif i wrote a ...,"[you, are, my, rock, @@@, oh, oh, oh, i, oh, o..."
4,beyonce-knowles,Pop,black culture,party the people the people the party it's pop...,"[black, culture, @@@, party, the, people, the,..."


In [17]:
master_df.to_csv("master-process-data.csv")