# Source Dataset preparation for Transfer Learning

In [60]:
import pandas as pd
import re
from num2words import num2words
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/elequaranta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load the source dataset; index_col=0 uses the first CSV column as the row index.
data = pd.read_csv('EdmondsDance.csv', index_col=0)

In [5]:
data

Unnamed: 0,Song,Artists,Lyrics,Joy,Trust,Fear,Surprise,Sadness,Disgust,Anger,Anticipation,Unnamed: 11
0,Apollo,"Hardwell, Amba Shepherd",Just one day in the life<br>So I can understan...,1,1,0,1,0,0,0,0,
1,Lullaby,"R3HAB, Mike Williams","Hypnotized, this love out of me<br>Without you...",0,0,1,0,1,0,0,0,
2,Melody (Tip Of My Tongue),Mike Williams,I stand a little too close<br>You stare a litt...,1,1,0,0,0,0,0,1,
3,Take Me Home,"Cash Cash, Bebe Rexha",I'm falling to pieces<br>But I need this<br>Ye...,0,0,0,1,1,1,0,0,
4,City of Dreams,"Dirty South, Alesso","Everything seems like a city of dreams,<br>I n...",0,0,0,1,1,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
519,Ashes To Ashes (Remix),"Tigerlily, Noah Neiman",ashes to ashes<br>we're falling down<br>so we ...,0,0,0,0,0,1,1,0,
520,Midnight,Third Party,I want to hold you<br>I want to hold you<br>I ...,0,0,0,0,0,0,0,1,
521,Chicago (Remix),"Win and Woo, Bryce Fox, SHADES",There's not enough room in here<br>For room fo...,0,0,0,1,1,0,0,0,
522,Haunted,PATAY,I see you everywhere<br>I never moved on<br>Wi...,0,0,0,0,1,0,0,0,


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 524 entries, 0 to 523
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Song          524 non-null    object 
 1   Artists       523 non-null    object 
 2   Lyrics        524 non-null    object 
 3   Joy           524 non-null    int64  
 4   Trust         524 non-null    int64  
 5   Fear          524 non-null    int64  
 6   Surprise      524 non-null    int64  
 7   Sadness       524 non-null    int64  
 8   Disgust       524 non-null    int64  
 9   Anger         524 non-null    int64  
 10  Anticipation  524 non-null    int64  
 11  Unnamed: 11   1 non-null      float64
dtypes: float64(1), int64(8), object(3)
memory usage: 53.2+ KB


In [11]:
# Keep only the lyrics and the eight emotion labels.
# errors='ignore' makes this cell safe to re-run: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
data = data.drop(columns=['Song', 'Artists', 'Unnamed: 11'], errors='ignore')

The lyrics in the destination dataset have been preprocessed as follows: lower-casing; removing superfluous white spaces, consecutive newlines, and annotations (e.g., [guitar] or [chorus]); duplicating segments (e.g., [3x] or [x2]); translating lyrics from other languages into English; replacing numbers with English words; substituting English contractions with spelled-out forms; removing special characters and stopwords; and applying lemmatization and stemming. <br>
In order to achieve reliable labeling, it is useful to perform the same operations on the lyrics contained in this dataset. 


In [13]:
# Pull the raw lyrics out of the frame as a plain Python list, one string per song.
lyrics_col = data['Lyrics'].tolist()

In [14]:
lyrics_col 

['Just one day in the life<br>So I can understand<br>Fighting just to survive<br>But you taught me I can<br>We are the lucky ones<br>We are, we are<br>Oh we are the lucky ones<br>We are, we are Just one day in the life<br>So I can understand<br>Fighting just to survive<br>But you taught me I can<br>We are the lucky ones<br>We are, we are<br>Oh we are the lucky ones<br>We are, we are<br>We are the lucky ones<br>We are, we are<br>Oh, we are the lucky ones<br>We are, we are',
 "Hypnotized, this love out of me<br>Without your air I can't even breathe<br>Lead my way out into the light<br>Sing your lullaby<br>Cherries in the ashtray<br>Take me through the day<br>I just gotta make you drunk in memory<br>See you in the puddles<br>Of my Chardonnay<br>Sleeping in my bathtub<br>But can wish you were late<br>Keep me safe up in the clouds<br>'Cause I can't come raining down<br>Make the monsters sleep in my mind<br>Sing your lullaby<br>Hypnotized, this love out of me<br>Without your air I can't even

In [66]:
def _conv_num(match):
    """Spell out a number in words using ``num2words``.

    Despite its name, ``match`` is the matched digit text (a string) as
    passed by ``numbers_to_words``, not a regex match object.
    """
    spelled_out = num2words(match)
    return spelled_out

def numbers_to_words(text):
    """Replace every standalone run of digits in ``text`` with its spelled-out form.

    Only digit runs delimited by word boundaries are converted, so digits
    glued to letters (e.g. ``x2`` or ``1st``) are left untouched.
    """
    def _spell_out(hit):
        # Hand the matched digit string to the shared conversion helper.
        return _conv_num(hit.group())

    return re.sub(r'\b\d+\b', _spell_out, text)


def lemmatization_stemming_sw(text):
    """Remove stopwords from ``text``, then lemmatize and stem what is left.

    The text is tokenized with ``word_tokenize``; tokens found in the NLTK
    English stopword list (extended with a few lyric filler syllables) are
    dropped, and every remaining token is passed through
    ``WordNetLemmatizer.lemmatize`` followed by ``PorterStemmer.stem``.

    Args:
        text: The string to process. Stopword matching is case-sensitive on
            the raw tokens, so callers should lower-case beforehand.

    Returns:
        The processed tokens joined by single spaces.

    NOTE(review): ``WordNetLemmatizer`` and ``word_tokenize`` typically need
    the NLTK 'wordnet' and 'punkt' resources; the import cell only downloads
    'stopwords' -- confirm they are installed in a fresh environment.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # Membership is tested once per token, so use a set rather than scanning
    # the list returned by stopwords.words() every time.
    sw = set(stopwords.words('english'))
    # Filler syllables common in lyrics that carry no meaning.
    sw.update(['oh', 'na', 'la', 'eh', 'ah', 'yeah'])
    filtered_list = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in word_tokenize(text)
        if word not in sw
    ]
    return ' '.join(filtered_list)


def clean_lyrics(lyr):
    """Normalise one raw lyrics string.

    Steps, in order: lower-case and strip; turn HTML line breaks and runs of
    blank lines into spaces; remove bracketed annotations such as
    ``[chorus]``; spell out standalone numbers; replace every non-alphanumeric
    character with a space; collapse repeated whitespace; finally remove
    stopwords and apply lemmatization and stemming.

    Contraction expansion and translation are not performed here.

    Args:
        lyr: Raw lyrics as a single string, with lines separated by ``<br>``.

    Returns:
        The cleaned lyrics as a single space-separated string.
    """
    lyr = lyr.lower().strip()
    # Line breaks arrive as HTML tags; also accept the <br/> and <br /> spellings
    # so they do not survive as a stray "br" token.
    lyr = re.sub(r'<br\s*/?>', ' ', lyr)
    lyr = re.sub(r'\n\n+', ' ', lyr)
    # Replace annotations with a space (not ''), so words on either side of a
    # bracket are not fused together; extra spaces are collapsed below.
    lyr = re.sub(r'\[[^\]]+\]', ' ', lyr)
    lyr = numbers_to_words(lyr)
    # Inside a character class \b means backspace, not word boundary, so the
    # previous pattern [^\w\b] kept backspaces; \w also keeps underscores.
    # [\W_] removes every character that is not a letter or digit.
    lyr = re.sub(r'[\W_]', ' ', lyr)
    lyr = re.sub(r'\s{2,}', ' ', lyr)
    lyr = lemmatization_stemming_sw(lyr)
    return lyr
    
    
    
    


In [67]:
# Clean every song in order, so the result lines up row-for-row with lyrics_col.
clean_lyrics_col = [clean_lyrics(raw_text) for raw_text in lyrics_col]

In [68]:
clean_lyrics_col

['one day life understand fight surviv taught lucki one lucki one one day life understand fight surviv taught lucki one lucki one lucki one lucki one',
 'hypnot love without air even breath lead way light sing lullabi cherri ashtray take day got ta make drunk memori see puddl chardonnay sleep bathtub wish late keep safe cloud caus come rain make monster sleep mind sing lullabi hypnot love without air even breath lead way light sing lullabi cherri ashtray take day got ta make drunk memori see puddl chardonnay sleep bathtub wish late keep safe cloud caus come rain make monster sleep mind sing lullabi',
 'stand littl close stare littl long danc everi time air like smoke lip find word trace kiss tast tast tast mind time race shot love tip tongu tip tongu',
 'fall piec need need fault weak turn cold cut bone danc soul fall piec piec piec still stay caus thing know take take home take home home home take home home home take take take take take home home home home round circl go highest high 

In [69]:
# Display-only: the result of drop() is not assigned, so `data` still holds the
# raw 'Lyrics' column here; that column is overwritten in the following cell.
data.drop('Lyrics', axis=1)

Unnamed: 0,Joy,Trust,Fear,Surprise,Sadness,Disgust,Anger,Anticipation
0,1,1,0,1,0,0,0,0
1,0,0,1,0,1,0,0,0
2,1,1,0,0,0,0,0,1
3,0,0,0,1,1,1,0,0
4,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...
519,0,0,0,0,0,1,1,0
520,0,0,0,0,0,0,0,1
521,0,0,0,1,1,0,0,0
522,0,0,0,0,1,0,0,0


In [70]:
# Overwrite the raw lyrics with their cleaned versions; clean_lyrics_col was built
# by iterating lyrics_col in order, so it lines up with the rows of `data`.
data['Lyrics'] = clean_lyrics_col

In [71]:
data

Unnamed: 0,Lyrics,Joy,Trust,Fear,Surprise,Sadness,Disgust,Anger,Anticipation
0,one day life understand fight surviv taught lu...,1,1,0,1,0,0,0,0
1,hypnot love without air even breath lead way l...,0,0,1,0,1,0,0,0
2,stand littl close stare littl long danc everi ...,1,1,0,0,0,0,0,1
3,fall piec need need fault weak turn cold cut b...,0,0,0,1,1,1,0,0
4,everyth seem like citi dream never know still ...,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...
519,ash ash fall pa burn need someon breath know b...,0,0,0,0,0,1,1,0
520,want hold want hold want hold midnight want ho...,0,0,0,0,0,0,0,1
521,enough room room error think prepar never scar...,0,0,0,1,1,0,0,0
522,see everywher never move thought run head crus...,0,0,0,0,1,0,0,0


In [72]:
# Persist the cleaned dataset next to the source file.
data.to_csv('clean_EdmondsDance.csv')