In [1]:
import csv
import numpy as np 
import pandas as pd 

## Genius Lyrics Cleaning 

In [8]:
metal = pd.read_csv("genius_data/metal.csv", index_col=0)
rap = pd.read_csv("genius_data/rappers.csv", index_col=0)
rock = pd.read_csv("genius_data/rockers.csv", index_col=0)
jazz = pd.read_csv("genius_data/jazz.csv", index_col=0)
folk = pd.read_csv("genius_data/folk.csv", index_col=0)
pop = pd.read_csv("genius_data/pop.csv", index_col=0)
rb = pd.read_csv("genius_data/rb.csv", index_col=0)
soul = pd.read_csv("genius_data/soul.csv", index_col=0)
country = pd.read_csv("genius_data/country.csv", index_col=0)

        artist  genre                    title  \
0  Iron Maiden  metal  The Number of the Beast   
1  Iron Maiden  metal         Fear of the Dark   
2  Iron Maiden  metal              The Trooper   
3  Iron Maiden  metal     Hallowed Be Thy Name   
4  Iron Maiden  metal         Run to the Hills   

                                              lyrics  
0  Woe to you, o'er Earth and Sea\nFor the Devil ...  
1  I am a man who walks alone\nAnd when I'm walki...  
2  You'll take my life but I'll take yours too\nY...  
3  I'm waiting in my cold cell, when the bell beg...  
4  White man came across the sea\nHe brought us p...  


In [6]:
frames = [metal, rap, rock, jazz, folk, pop, rb, soul, country]
genius_df = pd.concat(frames)
genius_df = genius_df.loc[:, ~genius_df.columns.str.contains('^Unnamed')]
genius_df.to_csv("raw-genius-data.csv")

genius_df.head(5)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,...,Unnamed: 23,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,artist,genre,lyrics,title
0,,,,,,,,,,,...,,,,,,,Iron Maiden,metal,"Woe to you, o'er Earth and Sea\nFor the Devil ...",The Number of the Beast
1,,,,,,,,,,,...,,,,,,,Iron Maiden,metal,I am a man who walks alone\nAnd when I'm walki...,Fear of the Dark
2,,,,,,,,,,,...,,,,,,,Iron Maiden,metal,You'll take my life but I'll take yours too\nY...,The Trooper
3,,,,,,,,,,,...,,,,,,,Iron Maiden,metal,"I'm waiting in my cold cell, when the bell beg...",Hallowed Be Thy Name
4,,,,,,,,,,,...,,,,,,,Iron Maiden,metal,White man came across the sea\nHe brought us p...,Run to the Hills


In [None]:
def clean_genius(genius_df): 
    #take out observations that have NaN for ‘genre’ or ‘lyrics’
    subset = genius_df[['genre','lyrics']]
    genius_df.dropna(subset=['genre', 'lyrics'], inplace=True)
    
    #replace new line with space
    genius_df = genius_df.replace({'\n': ' '}, regex=True)
    
    #get word count
    genius_df['word_num'] = genius_df['lyrics'].str.split().str.len()
    
    #observe that there are lots of songs w/ 1 word
    genius_df.sort_values(by = "word_num").head(100)
    
    #remove entries where only 1 word in song
    genius_df['word_num'].astype('int32')
    genius_df = genius_df[genius_df.word_num != 1]
    
    #anything with rap genius comments is way to long and includes comments
    genius_df = genius_df[~genius_df['lyrics'].str.contains("RAP GENIUS")]
    
    #notice that 124/130 of the songs with "Lyrics are just some string saying how there are no lyrics"
    genius_df = genius_df[genius_df['word_num'] !=18]
    
    #anything less than 10 seems to be junk lyrics
    genius_df = genius_df[genius_df['word_num'] > 10]



In [None]:
clean_genius.genre.unique()

In [None]:
clean_genius(genius_df)
genius_df.head(5)

## Metrolyrics Cleaning

In [None]:
metro = pd.read_csv("lyrics-metrolyrics.csv")

In [None]:
def clean_metro(lyrics):
    #replace new line with space
    lyrics = lyrics.replace({'\n': ' '}, regex=True)
    
    #get word count
    lyrics['word_num'] = lyrics['lyrics'].str.split().str.len()

    #Take out index
    lyrics = lyrics.drop(columns = ['index'])

    #See what values are present for years
    column_values = lyrics[["year"]].values.ravel()
    unique_values =  pd.unique(column_values)
                                    
    #remove the dates that do not make any sense
    bad_year = ['702', '112', '67']
    lyrics = lyrics[~lyrics['year'].isin(bad_year)]

    #See what values are present for genre
    column_values = lyrics[["genre"]].values.ravel()
    unique_values =  pd.unique(column_values)

    #get the counts of each 
    index = pd.Index(lyrics['genre'])
    index.value_counts()
                                    
    #Remove all instrumental songs
    lyrics = lyrics[~lyrics.lyrics.str.contains("instrumental", na=False)]
    lyrics = lyrics[~lyrics.lyrics.str.contains("INSTRUMENTAL", na=False)]
    lyrics = lyrics[~lyrics.lyrics.str.contains("[Instrumental]", na=False)]

In [None]:
clean_metro(metro)
del metro['index']
del metro['year']
metro.head(5)

In [None]:
metro.rename(columns = {"song": "title"}, inplace=True)
# reordering columns 
metro = metro[['artist', 'genre', 'title', 'lyrics']]
metro['title'] = metro['title'].str.replace('-',' ')
metro.head(5)

# Combining datasets into BIG

In [None]:
frames_1 = [metro, genius_df]
master_df = pd.concat(frames_1)

In [None]:
master_df.shape
master_df.to_csv("master-data.csv")

# Cleaning Non-English Lyrics out of Dataset

In [None]:
from langdetect import detect
import pandas as pd 
import numpy as np 

In [None]:
data = pd.read_csv("master-data.csv")
del data['Unnamed: 0']
print(data.head())

In [None]:
# Get rid of all rows whose lyrics column is NaN
data = data[data['lyrics'].notnull()]
non_strings = 0
bad_indices = []
for index, value in data['lyrics'].items():
    if type(value) != str:
        non_strings = non_strings + 1
        bad_indices.append(index)
# print(data[type(data['lyrics']) != str].shape )
print(non_strings)
print(data.shape)
print(len(bad_indices))

In [None]:
print(len(data['lyrics']))

In [None]:
def language_detector(string):
    global i
    try:
        res = detect(string)
    except:
        res = "undetectable"
    if i % 1000 == 0:
        print(i)
    i = i + 1
    return res

In [None]:
i = 0
data['language'] = data['lyrics'].apply(language_detector)
data = data[data['language'] == "en"]
data.to_csv("language-processed-data.csv")

# Turning into master CSV 

In [None]:
data['t-lyric'] = data['title'] + " @@@ " + master_df['lyrics']
data['t-lyric'] = data['t-lyric'].str.lower()

data.head()

In [None]:
stopChars = [',','(',')','.','-','[',']','"']
# preprocessing the corpus by converting all letters to lowercase, 
# replacing blank lines with blank string and removing special characters
def preprocessText(text):
#     text = text.replace('\n', ' ').replace('\t','')
    processedText = text.lower()
    for char in stopChars:
        processedText = processedText.replace(char,'')
    return processedText

# tokenization 
def corpusToList(corpus):
    corpusList = [w for w in corpus.split(' ')] 
    corpusList = [i for i in corpusList if i] #removing empty strings from list
    return corpusList

In [None]:
# Preprocess 
data['lyrics'] = data['lyrics'].apply(preprocessText)
data['t-lyric'] = data['t-lyric'].apply(preprocessText)
data.head(5)

In [None]:
# turning the italicized characters into regular 

In [None]:
data['t-lyric'] = data['t-lyric'].apply(corpusToList)
data.head(5)

In [None]:
data.to_csv("master-process-data.csv")