In [26]:
#! pip install -r ../requirements.txt

In [4]:
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool
import fasttext

### Initial cleaning and language filtering

In [4]:
# Read the raw song/lyrics table from the project's data directory.
df = pd.read_csv(filepath_or_buffer='../data/ds2.csv')

In [5]:
# Print a structural summary of the raw frame: row count, column names/dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5913411 entries, 0 to 5913410
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   title     object
 1   tag       object
 2   artist    object
 3   year      int64 
 4   views     int64 
 5   features  object
 6   lyrics    object
 7   id        int64 
dtypes: int64(3), object(5)
memory usage: 360.9+ MB


In [6]:
# Preview the first rows of the raw frame (head() shows five rows by default).
df.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6


In [7]:
# Drop columns that are not needed for the rest of the notebook.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['views', 'features'], errors='ignore')

In [9]:
# How many songs fall under each genre tag?
df.tag.value_counts()


tag
pop        2519256
rap        1962010
rock        892220
rb          225342
misc        208714
country     105869
Name: count, dtype: int64

In [10]:
# Exclude every row tagged with the catch-all "misc" (miscellaneous) genre.
df = df.loc[df['tag'].ne('misc')]

In [15]:
# List the distinct genre tags still present in df.
df.tag.unique()

array(['rap', 'rb', 'rock', 'pop', 'country'], dtype=object)

In [17]:
# Count the missing (NaN) entries per column.
df.isna().sum()

title      407
tag          0
artist       0
year         0
lyrics    1011
id           0
dtype: int64

In [27]:
# Inspect up to 25 rows whose title is missing (NaN).
df.loc[df['title'].isna()].head(25)

Unnamed: 0,title,tag,artist,year,lyrics,id


In [26]:
# Drop rows with a missing value in lyrics, tag, or title.
# The result is rebound to `df` rather than using inplace=True, so the cell does not
# mutate a frame that earlier cells already displayed and stays chain-friendly.
df = df.dropna(subset=['lyrics', 'tag', 'title'])


In [31]:
# Preview the first rows after the column drop, genre filter, and NaN removal.
df.head()

Unnamed: 0,title,tag,artist,year,lyrics,id
0,Killa Cam,rap,Cam'ron,2004,"[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1
1,Can I Live,rap,JAY-Z,1996,"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3
2,Forgive Me Father,rap,Fabolous,2003,Maybe cause I'm eatin\nAnd these bastards fien...,4
3,Down and Out,rap,Cam'ron,2004,[Produced by Kanye West and Brian Miller]\n\n[...,5
4,Fly In,rap,Lil Wayne,2005,"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6


In [33]:
#df.to_pickle('../data/df_cleaned_1.pkl')

In [5]:
#df = pd.read_pickle('../data/df_cleaned_1.pkl')

In [13]:
# Load FastText's language identification model from a local file.
# The result is stored in the notebook-level name `model`, which detect_language()
# reads when it is called.
model = fasttext.load_model('../fasttext/lid.176.bin')

# Detect language function
def detect_language(text, lang_model=None):
    """Return the most likely language code for a piece of text.

    Parameters
    ----------
    text : str
        Text to classify; it may span several lines.
    lang_model : object, optional
        Any object exposing ``predict(text, k=1)`` that returns a
        ``(labels, probabilities)`` pair. When omitted, the notebook-level
        ``model`` is used, so existing one-argument calls keep working.

    Returns
    -------
    str or None
        The top-ranked label with its ``__label__`` prefix removed (e.g. ``'en'``),
        or ``None`` when ``text`` is not a string, is blank, or the model returns
        no label at all.
    """
    if not isinstance(text, str):
        # e.g. NaN from a missing lyrics cell; previously an AttributeError
        return None

    single_line = text.replace('\n', ' ')  # predict() works on one line at a time
    if not single_line.strip():
        # Nothing to classify, so do not let the model guess a language
        return None

    if lang_model is None:
        lang_model = model  # fall back to the model loaded by the notebook

    labels = lang_model.predict(single_line, k=1)[0]  # k=1: only the best guess
    if len(labels) == 0:
        # No prediction returned; previously an IndexError
        return None
    return labels[0].replace('__label__', '')



In [16]:
# Enable tqdm's progress_apply on pandas objects, then tag every song with the
# language detected from its lyrics.
tqdm.pandas(desc="Detecting language")
df['language'] = df.lyrics.progress_apply(detect_language)

Detecting language: 100%|██████████| 5703331/5703331 [08:17<00:00, 11472.93it/s]


In [17]:
# Preview the first rows to check the newly added `language` column.
df.head()

Unnamed: 0,title,tag,artist,year,lyrics,id,language
0,Killa Cam,rap,Cam'ron,2004,"[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en
1,Can I Live,rap,JAY-Z,1996,"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en
2,Forgive Me Father,rap,Fabolous,2003,Maybe cause I'm eatin\nAnd these bastards fien...,4,en
3,Down and Out,rap,Cam'ron,2004,[Produced by Kanye West and Brian Miller]\n\n[...,5,en
4,Fly In,rap,Lil Wayne,2005,"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en


In [19]:
# Retain only the songs whose detected language is English ('en').
df = df.loc[df['language'].eq('en')]

In [24]:
# Save df to pickle
#df.to_pickle('../data/df_cleaned_engl.pkl')

In [None]:
# Reload the English-only frame from its pickle checkpoint.
# NOTE: the cell that writes this file is commented out, so the pickle must already
# exist on disk. Only unpickle files you produced yourself, because loading a pickle
# can execute arbitrary code.
df = pd.read_pickle(filepath_or_buffer='../data/df_cleaned_engl.pkl')

In [27]:
# Print a structural summary of the frame loaded from the pickle: row count, columns/dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4042601 entries, 0 to 5913410
Data columns (total 7 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   title     object
 1   tag       object
 2   artist    object
 3   year      int64 
 4   lyrics    object
 5   id        int64 
 6   language  object
dtypes: int64(2), object(5)
memory usage: 246.7+ MB
