In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk import word_tokenize
import boto3
from s3 import get_file
import re
import mglearn
import sklearn
import yellowbrick
from nltk.corpus import stopwords

In [2]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

In [4]:
s3 = boto3.resource('s3')
lyrics = get_file(s3,'s3ssp',download_file='NLP_Data/rough_lyrics_audio_features.csv',rename_file='0320_2020_nlp.csv')

In [17]:
df = pd.read_csv(lyrics,sep='|',encoding='utf-8')
df.head()
df = df.drop_duplicates(['artist_uri_x','track_uri'])
df_copy = df.copy().dropna()

In [18]:
df_copy = df_copy.drop(columns=['instrumentalness','key','mode','liveness','speechiness','time_signature',
                                'tempo','loudness'])

## Clean of stop words from NLTK and Github Song Stopwords Resource

In [20]:
df_copy.head(1)

Unnamed: 0,artist_uri_x,artist_name,track_uri,track_name,lyrics,acousticness,danceability,energy,valence
0,6vWDO969PvNqNYHIOW5v0m,Beyoncé,5Ui8M6tfknhXo4MuGHt3Dy,why don't you love me,"N-n-now, honey\nYou better sit down and look a...",0.0463,0.693,0.73,0.741


In [21]:
df_lastfm_stopwords = pd.read_csv('lastfm-lyrics-analysis/stopwords.txt',header=None)

In [22]:
lyrics_list = df_copy['lyrics'].str.replace('\n',' ')
df_copy['lyrics'] = lyrics_list.to_frame()
lastfm_stopwords = df_lastfm_stopwords[0].str.replace('\n',' ')
df_lastfm_stopwords['last_fm_stopwords'] = lastfm_stopwords.to_frame()
#df_clean = .str.replace('(\[.*\])*','')

In [23]:
df_lastfm_stopwords.head()

Unnamed: 0,0,last_fm_stopwords
0,a,a
1,able,able
2,about,about
3,above,above
4,abst,abst


In [24]:
df_lastfm_stopwords_list = df_lastfm_stopwords['last_fm_stopwords'].tolist()

In [25]:
type(df_lastfm_stopwords_list)

list

In [26]:
df_english = pd.read_csv("stop-words/english.txt",header=None)

In [27]:
github_english_stopwords = df_english[0].str.replace('\n',' ')
df_english['github_english_stopwords'] = github_english_stopwords.to_frame()
github_english_stopwords_list = df_english['github_english_stopwords'].tolist()

In [28]:
type(github_english_stopwords_list)

list

In [29]:
df_spanish = pd.read_csv("stop-words/spanish.txt",header=None)
github_spanish_stopwords = df_spanish[0].str.replace('\n',' ')
df_spanish['github_spanish_stopwords'] = github_spanish_stopwords.to_frame()
github_spanish_stopwords_list = df_spanish['github_spanish_stopwords'].tolist()

In [30]:
type(github_spanish_stopwords_list)

list

## Clean from Instrumentals

In [31]:
#Clean dataset of instrumentals
df_playlist = df_copy.copy()

df_playlist['instrumental'] = df_playlist['lyrics'].apply(lambda x: x.find('instrumental'))
df_playlist = df_playlist.drop(df_playlist[df_playlist['instrumental'] != -1].index)
#df = df.drop(df[df['lyrics']=='This is an instrumental'].index)

In [32]:
df_playlist['Instrumental'] = df_playlist['lyrics'].apply(lambda x: x.find('Instrumental'))
df_playlist =df_playlist.drop(df_playlist[df_playlist['Instrumental'] != -1].index)

In [33]:
df_playlist['INSTRUMENTAL'] = df_playlist['lyrics'].apply(lambda x: x.find('INSTRUMENTAL'))
df_playlist =df_playlist.drop(df_playlist[df_playlist['INSTRUMENTAL'] != -1].index)

In [34]:
df_playlist[df_playlist['lyrics'] == 'INSTRUMENTAL']

Unnamed: 0,artist_uri_x,artist_name,track_uri,track_name,lyrics,acousticness,danceability,energy,valence,instrumental,Instrumental,INSTRUMENTAL


In [35]:
def preprocessor(text,source):
    tokens = word_tokenize(text)
    return (" ").join([word for word in tokens if word not in source])

In [36]:
df_demo = df_playlist

df_demo['git_english_lyrics'] = df_demo['lyrics'].apply(lambda x: preprocessor(x,github_english_stopwords_list))
df_demo['git_spanish_lyrics'] = df_demo['git_english_lyrics'].apply(lambda x: preprocessor(x,github_spanish_stopwords_list))
df_demo['last_fm_lyrics'] = df_demo['git_spanish_lyrics'].apply(lambda x: preprocessor(x,df_lastfm_stopwords_list))
df_demo['nltk_lyrics'] = df_demo['last_fm_lyrics'].apply(lambda x: preprocessor(x,stopwords.words('english')))




In [37]:
df_demo.head()


Unnamed: 0,artist_uri_x,artist_name,track_uri,track_name,lyrics,acousticness,danceability,energy,valence,instrumental,Instrumental,INSTRUMENTAL,git_english_lyrics,git_spanish_lyrics,last_fm_lyrics,nltk_lyrics
0,6vWDO969PvNqNYHIOW5v0m,Beyoncé,5Ui8M6tfknhXo4MuGHt3Dy,why don't you love me,"N-n-now, honey You better sit down and look ar...",0.0463,0.693,0.73,0.741,-1,-1,-1,"N-n-now , honey You sit 'Cause bumped yo ' hea...","N-n-now , honey You sit 'Cause bumped ' head A...","N-n-now , honey You sit 'Cause bumped ' head A...","N-n-now , honey You sit 'Cause bumped ' head A..."
1,6vWDO969PvNqNYHIOW5v0m,Beyoncé,5dhPqcLr5EcSd7Fe4fslCq,save the hero,I lay alone awake at night Sorrow fills my eye...,0.673,0.551,0.467,0.148,-1,-1,-1,I lay awake night Sorrow fills eyes But I 'm s...,I lay awake night Sorrow fills eyes But I 'm s...,I lay awake night Sorrow fills eyes But I 'm s...,I lay awake night Sorrow fills eyes But I 'm s...
2,6vWDO969PvNqNYHIOW5v0m,Beyoncé,5dWTQXVHdoIsSLpEyS3woy,broken-hearted girl,Youre everything I thought you never were And ...,0.503,0.336,0.424,0.159,-1,-1,-1,Youre I And I couldve But live So ? Youre I I ...,Youre I And I couldve But live So ? Youre I I ...,Youre I And I couldve But live So ? Youre I I ...,Youre I And I couldve But live So ? Youre I I ...
3,6vWDO969PvNqNYHIOW5v0m,Beyoncé,0QRxJvOohS8yiGC1n98uFM,lift ev'ry voice and sing - homecoming live,He is always laughin' and flirting with me And...,0.124,0.127,0.409,0.0782,-1,-1,-1,He laughin ' flirting And n't care As matter r...,He laughin ' flirting And n't care As matter r...,He laughin ' flirting And n't care As matter r...,He laughin ' flirting And n't care As matter r...
4,6vWDO969PvNqNYHIOW5v0m,Beyoncé,4DActPOAtak2m8meZeMt3B,no angel,"This is for my fans (Uhu, uhu) This is for my ...",0.0424,0.571,0.466,0.512,-1,-1,-1,"This fans ( Uhu , uhu ) This destiny ( Uhu , u...","This fans ( Uhu , uhu ) This destiny ( Uhu , u...","This fans ( Uhu , uhu ) This destiny ( Uhu , u...","This fans ( Uhu , uhu ) This destiny ( Uhu , u..."


In [44]:
df_topic = df_demo[['artist_name','track_name','nltk_lyrics','acousticness','danceability','energy','valence']]
df_topic = df_topic.rename(columns={'nltk_lyrics':'lyrics'})
df_topic.describe()

Unnamed: 0,acousticness,danceability,energy,valence
count,96906.0,96906.0,96906.0,96906.0
mean,0.226683,0.501299,0.683859,0.458357
std,0.298966,0.169198,0.24388,0.243246
min,0.0,0.0,0.0,0.0
25%,0.00257,0.38225,0.511,0.259
50%,0.0611,0.505,0.735,0.438
75%,0.386,0.622,0.896,0.647
max,0.996,0.985,1.0,0.996


In [45]:
#df_topic[(df_topic['artist_name']=='Arcade Fire') & (df_topic['track_name']=='here comes the night time')]

In [46]:
df_topic.to_csv('Data/master_lyrics_audio_features.csv',index=False,sep='|')