In [1]:
import pandas as pd

#### Format Lyric Data

In [2]:
# read in lyrics data
lyrics_df = pd.read_csv('lyrics-data.csv')

# preview the raw data 
lyrics_df.head(5)

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt


In [3]:
def clean_artist_name(name: str) -> str:
    """
    Formats the column with the artist's name. Ensures that this column is formatted consistently, as it will be used to merge datasets.
    Old format is '/firstname-lastname/', update to 'firstname lastname' 
    """
    name = name.lower()
    name = name.replace('-', ' ')
    name = name.replace('/', '')
    return name

In [4]:
# rename columns and drop unnecessary ones 
lyrics_df.rename(columns={'SName': 'song_name', 'Lyric': 'lyrics', 'ALink': 'artist'}, inplace = True)
lyrics_df.drop(columns=['SLink'], inplace=True) 
lyrics_df.dropna(inplace=True)

# clean the artist name
lyrics_df['artist'] = lyrics_df['artist'].apply(clean_artist_name)

# only keep songs in English
lyrics_df = lyrics_df[lyrics_df['language'] == 'en']

# print info about the cleaned lyric data 
print(lyrics_df.shape)
lyrics_df.head(5)

(191812, 4)


Unnamed: 0,artist,song_name,lyrics,language
69,ivete sangalo,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,en
86,ivete sangalo,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",en
88,ivete sangalo,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",en
111,ivete sangalo,Easy,"Know it sounds funny\nBut, I just can't stand ...",en
140,ivete sangalo,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,en


#### Format Artist Data

In [5]:
# read in artist data
artist_df = pd.read_csv('artists-data.csv')

# preview the raw data 
artist_df.head(5)

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/
2,Banda Eva,Axé; Romântico; Reggae,215.0,2.3,/banda-eva/
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/
4,Claudia Leitte,Pop; Axé; Romântico,167.0,1.5,/claudia-leitte/


In [6]:
# rename columns and drop unnecessary ones 
artist_df.rename(columns={'Artist': 'artist', 'Genres': 'genres'}, inplace = True)
artist_df.drop(columns=['Popularity', 'Link', 'Songs'], inplace = True)
artist_df.dropna(inplace=True)

# clean the artist name
artist_df['artist'] = artist_df['artist'].apply(clean_artist_name)

# print out info about cleaned data 
print(artist_df.shape)
artist_df.head(5)

(4163, 2)


Unnamed: 0,artist,genres
0,ivete sangalo,Pop; Axé; Romântico
1,chiclete com banana,Axé
2,banda eva,Axé; Romântico; Reggae
3,é o tchan,Axé
4,claudia leitte,Pop; Axé; Romântico


#### Merge Lyric and Artist Datasets

In [7]:
# merge datasets 
df = pd.merge(lyrics_df, artist_df, on='artist', how='inner')
df.dropna(inplace=True)

# turn genres into list
df['genres'] = df['genres'].apply(lambda genres: genres.split(';'))
df.reset_index(drop=True, inplace=True)   
print(df.shape)
df.head(5)

(171855, 5)


Unnamed: 0,artist,song_name,lyrics,language,genres
0,ivete sangalo,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,en,"[Pop, Axé, Romântico]"
1,ivete sangalo,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",en,"[Pop, Axé, Romântico]"
2,ivete sangalo,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",en,"[Pop, Axé, Romântico]"
3,ivete sangalo,Easy,"Know it sounds funny\nBut, I just can't stand ...",en,"[Pop, Axé, Romântico]"
4,ivete sangalo,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,en,"[Pop, Axé, Romântico]"


In [8]:
# save data as csv
df.to_csv('clean_data.csv', index=False)