In [1]:
import csv
import numpy as np 
import pandas as pd 

## Genius Lyrics Cleaning 

In [2]:
metal = pd.read_csv("genius_data/metal.csv", index_col=0)
rap = pd.read_csv("genius_data/rappers.csv", index_col=0)
rock = pd.read_csv("genius_data/rockers.csv", index_col=0)
jazz = pd.read_csv("genius_data/jazz.csv", index_col=0)
folk = pd.read_csv("genius_data/folk.csv", index_col=0)
pop = pd.read_csv("genius_data/pop.csv", index_col=0)
rb = pd.read_csv("genius_data/rb.csv", index_col=0)
soul = pd.read_csv("genius_data/soul.csv", index_col=0)
country = pd.read_csv("genius_data/country.csv", index_col=0)

In [3]:
rap.genre.unique()

array(['rap'], dtype=object)

In [4]:
frames = [metal, rap, rock, jazz, folk, pop, rb, soul, country]
genius_df = pd.concat(frames)
genius_df = genius_df.loc[:, ~genius_df.columns.str.contains('^Unnamed')]
genius_df.to_csv("raw-genius-data.csv")

genius_df.head(5)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,artist,genre,lyrics,title
0,Iron Maiden,metal,"Woe to you, o'er Earth and Sea\nFor the Devil ...",The Number of the Beast
1,Iron Maiden,metal,I am a man who walks alone\nAnd when I'm walki...,Fear of the Dark
2,Iron Maiden,metal,You'll take my life but I'll take yours too\nY...,The Trooper
3,Iron Maiden,metal,"I'm waiting in my cold cell, when the bell beg...",Hallowed Be Thy Name
4,Iron Maiden,metal,White man came across the sea\nHe brought us p...,Run to the Hills


In [5]:
def clean_genius(genius_df): 
    #take out observations that have NaN for ‘genre’ or ‘lyrics’
    subset = genius_df[['genre','lyrics']]
    genius_df.dropna(subset=['genre', 'lyrics'], inplace=True)
    
    #replace new line with space
#     genius_df = genius_df.replace({'\n': ' '}, regex=True)
    
    #get word count
    genius_df['word_num'] = genius_df['lyrics'].str.split().str.len()
    
    #observe that there are lots of songs w/ 1 word
    genius_df.sort_values(by = "word_num").head(100)
    
    #remove entries where only 1 word in song
    genius_df['word_num'].astype('int32')
    genius_df = genius_df[genius_df.word_num != 1]
    
    #anything with rap genius comments is way to long and includes comments
    genius_df = genius_df[~genius_df['lyrics'].str.contains("RAP GENIUS")]
    
    #notice that 124/130 of the songs with "Lyrics are just some string saying how there are no lyrics"
    genius_df = genius_df[genius_df['word_num'] !=18]
    
    #anything less than 10 seems to be junk lyrics
    genius_df = genius_df[genius_df['word_num'] > 10]


In [6]:
genius_df.genre.unique()

array(['metal', 'rap', 'rock', 'jazz', 'folk', 'pop', 'rb', 'soul',
       'country'], dtype=object)

In [7]:
clean_genius(genius_df)

NameError: name 'cleaned' is not defined

In [8]:
genius_df.to_csv("cleaned_genius.csv")

## Metrolyrics Cleaning

In [9]:
metro = pd.read_csv("lyrics-metrolyrics.csv")

In [10]:
def clean_metro(lyrics):
    #replace new line with space
    lyrics = lyrics.replace({'\n': ' '}, regex=True)
    
    #get word count
    lyrics['word_num'] = lyrics['lyrics'].str.split().str.len()

    #Take out index
    lyrics = lyrics.drop(columns = ['index'])

    #See what values are present for years
    column_values = lyrics[["year"]].values.ravel()
    unique_values =  pd.unique(column_values)
                                    
    #remove the dates that do not make any sense
    bad_year = ['702', '112', '67']
    lyrics = lyrics[~lyrics['year'].isin(bad_year)]

    #See what values are present for genre
    column_values = lyrics[["genre"]].values.ravel()
    unique_values =  pd.unique(column_values)

    #get the counts of each 
    index = pd.Index(lyrics['genre'])
    index.value_counts()
                                    
    #Remove all instrumental songs
    lyrics = lyrics[~lyrics.lyrics.str.contains("instrumental", na=False)]
    lyrics = lyrics[~lyrics.lyrics.str.contains("INSTRUMENTAL", na=False)]
    lyrics = lyrics[~lyrics.lyrics.str.contains("[Instrumental]", na=False)]

In [11]:
clean_metro(metro)
del metro['index']
del metro['year']
metro.head(5)

Unnamed: 0,song,artist,genre,lyrics
0,ego-remix,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,then-tell-me,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,honesty,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,you-are-my-rock,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,black-culture,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [12]:
metro.rename(columns = {"song": "title"}, inplace=True)
# reordering columns 
metro = metro[['artist', 'genre', 'title', 'lyrics']]
metro['title'] = metro['title'].str.replace('-',' ')
metro.head(5)

Unnamed: 0,artist,genre,title,lyrics
0,beyonce-knowles,Pop,ego remix,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,beyonce-knowles,Pop,then tell me,"playin' everything so easy,\nit's like you see..."
2,beyonce-knowles,Pop,honesty,If you search\nFor tenderness\nIt isn't hard t...
3,beyonce-knowles,Pop,you are my rock,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,beyonce-knowles,Pop,black culture,"Party the people, the people the party it's po..."


# Combining datasets into BIG

In [13]:
frames_1 = [metro, genius_df]
master_df = pd.concat(frames_1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [14]:
master_df.shape
master_df.to_csv("master-data.csv")

# Cleaning Non-English Lyrics out of Dataset

In [18]:
from langdetect import detect
import pandas as pd 
import numpy as np 

In [19]:
data = pd.read_csv("master-data.csv")
del data['Unnamed: 0']
print(data.head())

            artist genre                                             lyrics  \
0  beyonce-knowles   Pop  Oh baby, how you doing?\nYou know I'm gonna cu...   
1  beyonce-knowles   Pop  playin' everything so easy,\nit's like you see...   
2  beyonce-knowles   Pop  If you search\nFor tenderness\nIt isn't hard t...   
3  beyonce-knowles   Pop  Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote...   
4  beyonce-knowles   Pop  Party the people, the people the party it's po...   

             title  word_num  
0        ego remix       NaN  
1     then tell me       NaN  
2          honesty       NaN  
3  you are my rock       NaN  
4    black culture       NaN  


In [20]:
# Get rid of all rows whose lyrics column is NaN
data = data[data['lyrics'].notnull()]
non_strings = 0
bad_indices = []
for index, value in data['lyrics'].items():
    if type(value) != str:
        non_strings = non_strings + 1
        bad_indices.append(index)
# print(data[type(data['lyrics']) != str].shape )
print(non_strings)
print(data.shape)
print(len(bad_indices))

0
(327860, 5)
0


In [21]:
print(len(data['lyrics']))

327860


In [22]:
def language_detector(string):
    global i
    try:
        res = detect(string)
    except:
        res = "undetectable"
    if i % 2000 == 0:
        print(i)
    i = i + 1
    return res

In [23]:
i = 0
data['language'] = data['lyrics'].apply(language_detector)
data = data[data['language'] == "en"]
data.to_csv("language-processed-data.csv")

0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
46000
48000
50000
52000
54000
56000
58000
60000
62000
64000
66000
68000
70000
72000
74000
76000
78000
80000
82000
84000
86000
88000
90000
92000
94000
96000
98000
100000
102000
104000
106000
108000
110000
112000
114000
116000
118000
120000
122000
124000
126000
128000
130000
132000
134000
136000
138000
140000
142000
144000
146000
148000
150000
152000
154000
156000
158000
160000
162000
164000
166000
168000
170000
172000
174000
176000
178000
180000
182000
184000
186000
188000
190000
192000
194000
196000
198000
200000
202000
204000
206000
208000
210000
212000
214000
216000
218000
220000
222000
224000
226000
228000
230000
232000
234000
236000
238000
240000
242000
244000
246000
248000
250000
252000
254000
256000
258000
260000
262000
264000
266000
268000
270000
272000
274000
276000
278000
280000
282000
284000
286000
288000
290000
292000
294000
296000
298000
300000
3