In [1]:
#general packages
import os
import string
import copy
import json
import pickle
from ast import literal_eval
import operator

#dataframe and data science packages
import pandas as pd
import numpy as np
import re

#nltk packages
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

#gensim packages
import gensim
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel

#emotion detection packages
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import text2emotion as te

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/romitbarua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/romitbarua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/romitbarua/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Execute the helper notebooks in this kernel so the names they define can be used below.
# CleanLyricsAndTabsAndKeys.ipynb supplies the lyric-cleaning functions used later in this
# notebook; the other two presumably supply the feature-reduction and emotion helpers
# (TODO confirm against those notebooks).
%run CleanLyricsAndTabsAndKeys.ipynb
%run FeatureReduction.ipynb
%run EmotionGenerator.ipynb

# Import and Combine the Data
1. Import the files
    a. Spotify Data
    b. Dirty Tab Data
    c. Dirty Lyrics Data (raw song lyrics)

## Import the Spotify & Tab Files

In [3]:
# Read the two source tables, in this order: the Spotify playlist tracks
# and the per-track tab data.
spotify_df, tab_df = (
    pd.read_csv(csv_name)
    for csv_name in ('playlist_tracks.csv', 'track_with_tabs_new.csv')
)

## Add the Tab Data to the Spotify Data


In [4]:
# Index both frames by track_id and keep only the first row for each id,
# so the two tables can be aligned one-to-one on that index.
spotify_df = (
    spotify_df
    .set_index('track_id')
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)
tab_df = (
    tab_df
    .set_index('track_id')
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)

# Inner join side by side: only tracks present in BOTH tables survive.
# The tab column is renamed to mark it as not yet cleaned, and track_id
# goes back to being an ordinary column.
df = (
    pd.concat([spotify_df, tab_df], axis=1, join='inner')
    .rename(columns={'tabs': 'dirty_tabs'})
    .reset_index()
)
print(df.shape)

(13530, 18)


## Add the Dirty Lyrics to the DF

In [5]:
# Attach the raw lyrics to each track. Each '.txt' file in lyrics_path is matched
# to a row through its file name (minus extension), which is compared to track_id.
lyrics_path = 'lyrics_text/'
# sorted() keeps the load order reproducible, so if two files resolve to the same
# track id the same one wins on every run.
for filename in sorted(os.listdir(lyrics_path)):
    # Only real .txt files; a substring test would also accept e.g. 'x.txt.bak'.
    if not filename.endswith('.txt'):
        continue
    # str.replace returns a NEW string. The original code discarded that result, so
    # the 'output2' marker was never stripped and such files could not match any
    # track_id. The id is derived from a copy; the on-disk name is still used to open.
    song_id = os.path.splitext(filename)[0].replace('output2', '')
    try:
        # `with` closes the handle even when read() fails; the explicit encoding
        # avoids depending on the platform's default text encoding.
        with open(os.path.join(lyrics_path, filename), encoding='utf-8') as lyrics_file:
            lyrics_text = lyrics_file.read()
    except (OSError, UnicodeError) as err:
        # Best effort: report the unreadable file and carry on with the rest.
        print('Failed to Upload: {} ({})'.format(filename, err))
        continue
    df.loc[df.track_id == song_id, 'dirty_lyrics'] = lyrics_text
#remove the cases where we have no lyrics
# NOTE(review): dropna() without `subset` also drops rows that have a missing value
# in ANY other column, not only 'dirty_lyrics'. Left unchanged because later cells
# may rely on it; consider df.dropna(subset=['dirty_lyrics']) if that is unintended.
df = df.dropna()
print(df.shape)

(8008, 19)


## Clean the Dirty Lyrics & Generate Words for LDA

In [6]:
print('Cleaning Lyrics -> ', end='')
# removeLongSongs, clean_lyrics and lyrics_to_words come from CleanLyricsAndTabsAndKeys.ipynb
# NOTE(review): what the threshold 92 measures is not visible in this notebook -
# confirm against removeLongSongs.
df = removeLongSongs(df, 92).assign(
    # Raw lyrics -> cleaned lyrics text.
    clean_lyrics=lambda frame: frame['dirty_lyrics'].apply(
        lambda raw: clean_lyrics(str(raw))
    ),
    # Cleaned lyrics -> the word representation later fed to LDA.
    words_for_LDA=lambda frame: frame['clean_lyrics'].apply(
        lambda cleaned: lyrics_to_words(str(cleaned))
    ),
)

print('Lyrics Cleaned')
print(df.shape)

Cleaning Lyrics -> Lyrics Cleaned
(7367, 21)


## Clean the Dirty Tabs

In [7]:
print('Cleaning Tab -> ', end='')

# Step 1: rewrite the chord names with convertToSharp
# (in the displayed frame, e.g. Bb5 appears as A#5 afterwards).
df['clean_tabs'] = df['dirty_tabs'].apply(convertToSharp)
# Step 2: adjust each row's chords for its capo value via moveForCapo.
df['clean_tabs'] = df.apply(
    lambda row: moveForCapo(row['clean_tabs'], row['capo']),
    axis=1,
)

print('Tab Cleaned')

Cleaning Tab -> Tab Cleaned


In [8]:
# Renumber the rows 0..n-1 after the filtering above. Because drop=True is not
# passed, the previous index is kept as a new 'index' column.
df = df.reset_index()

## Get the Greer, First-Chord, and Last-Chord Keys

In [9]:
# Rename the existing 'key' column to 'spotify_key' so it is not confused with the
# chord-derived *_key columns built in the next cell.
df = df.rename(columns={'key': 'spotify_key'})

In [10]:
print('Building Key Data -> ', end='')

# clean_tabs is a comma-separated chord string; each estimate below splits it first.
# Key estimated from the whole chord list by findTonicNumNo7.
df['greer_key'] = df['clean_tabs'].apply(
    lambda tabs: findTonicNumNo7(tabs.split(','))
)
# Key taken from the first chord: the first element of sepExt's result goes to ChordToNum.
df['firstNote_key'] = df['clean_tabs'].apply(
    lambda tabs: ChordToNum(sepExt(tabs.split(',')[0])[0])
)
# Same, using the final chord of the song.
df['lastNote_key'] = df['clean_tabs'].apply(
    lambda tabs: ChordToNum(sepExt(tabs.split(',')[-1])[0])
)
# pickMainKey receives the full row and chooses the key stored as bestChoice_key.
df['bestChoice_key'] = df.apply(pickMainKey, axis=1)

print('Key Data Built')

Building Key Data -> Key Data Built


In [11]:
# Show the assembled frame for a visual check (the rendered output is truncated
# to the first and last rows/columns).
df

Unnamed: 0,index,track_id,playlist_name,playlist_id,playlist_genre,track_name,track_artist_name,track_artist_id,danceability,energy,...,dirty_tabs,capo,dirty_lyrics,clean_lyrics,words_for_LDA,clean_tabs,greer_key,firstNote_key,lastNote_key,bestChoice_key
0,0,3YBZIN3rekqsKxbJc9FZko,Rock Classics,37i9dQZF1DWXRqgorJj26U,Rock,Paradise City,Guns N' Roses,3qm84nBOXUEQ2vnTfUTTFC,0.273,0.952,...,"G,C,F,C,G,G5,F5,C5,Bb5,C5,C5,Bb5,G,F,G,G,G,C,C...",0,[Chorus]\nTake me down to the Paradise City\nW...,\nTake me down to the Paradise City\nWhere the...,"[take, paradise, city, grass, green, girl, pre...","G,C,F,C,G,G5,F5,C5,A#5,C5,C5,A#5,G,F,G,G,G,C,C...",0,7,7,7
1,1,2zYzyRzz6pRmhPzyfMEC8s,Rock Classics,37i9dQZF1DWXRqgorJj26U,Rock,Highway to Hell,AC/DC,711MCceyCBcFnzjGY4Q7Un,0.574,0.913,...,"A,D/F#,G,D/F#,G,D/F#,G,D/F#,A,A,A,A,D/F#,D/F#,...",0,[Instrumental Intro]\n\n[Verse 1]\nLiving easy...,"\n\n\nLiving easy, lovin' free\nSeason ticket ...","[living, easy, lovin, free, ticket, ride, aski...","A,D/F#,G,D/F#,G,D/F#,G,D/F#,A,A,A,A,D/F#,D/F#,...",2,9,9,9
2,2,5MxNLUsfh7uzROypsoO5qe,Rock Classics,37i9dQZF1DWXRqgorJj26U,Rock,Dream On,Aerosmith,7Ey4PD4MYsKc5I2dolUwbH,0.307,0.433,...,"Fm,Cm/F,Fm6,Bbm6,Fm,C7sus,Fm,Fm,Fm7,Fm6,Bbm6,F...",0,[Verse 1]\nEvery time​ that I look in the mirr...,\nEvery time​ that I look in the mirror\nAll t...,"[time, look, mirror, line, face, getting, clea...","Fm,Cm/F,Fm6,A#m6,Fm,C7sus,Fm,Fm,Fm7,Fm6,A#m6,F...",8,5,0,8
3,3,70LcF31zb1H0PyJoS1Sx1r,Rock Classics,37i9dQZF1DWXRqgorJj26U,Rock,Creep,Radiohead,4Z8W4fKeB5YxbusRsdQVPb,0.515,0.430,...,"G,B,C,Cm,G,B,C,Cm,G,B,C,Cm,G,B,C,Cm,G,B,C,Cm,G...",0,[Verse 1]\nWhen you were here before\nCouldn't...,\nWhen you were here before\nCouldn't look you...,"[look, eye, skin, cry, float, feather, beautif...","G,B,C,Cm,G,B,C,Cm,G,B,C,Cm,G,B,C,Cm,G,B,C,Cm,G...",0,7,7,7
4,4,4bHsxqR3GMrXTxEPLuK5ue,Rock Classics,37i9dQZF1DWXRqgorJj26U,Rock,Don't Stop Believin',Journey,0rvjqX7ttXeg3mTy8Xscbt,0.500,0.748,...,"E,B,C#m,A,E,B,G#m,A,E,B,C#m,A,E,B,G#m,A,E,B,C#...",0,[Verse 1]\nJust a small-town girl\nLivin' in a...,\nJust a small-town girl\nLivin' in a lonely w...,"[small, town, girl, livin, lonely, world, took...","E,B,C#m,A,E,B,G#m,A,E,B,C#m,A,E,B,G#m,A,E,B,C#...",4,4,9,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7362,12607,5xLfQuX5YAfog2JgPz0dpA,Sirius XM Top 1000 Classic Rock,7eDSv2WJi5Loto9fIwvthi,MIsc,Moving in Stereo,The Cars,6DCIj8jNaNpBz8e5oKFPtp,0.718,0.543,...,"E,D,D,F#,E,E,D,D,E,E,D,D,F#,E,E,D,C",0,"[Verse 1]\nLife's the same, I'm moving in ster...","\nLife's the same, I'm moving in stereo\nLife'...","[life, moving, stereo, life, shoe, life, shaki...","E,D,D,F#,E,E,D,D,E,E,D,D,F#,E,E,D,C",9,4,0,4
7363,12608,4i6wwcBc3Qrqir83xBdbM6,Sirius XM Top 1000 Classic Rock,7eDSv2WJi5Loto9fIwvthi,MIsc,Saturday Night's Alright (For Fighting),Elton John,3PhoLpVuITZKcymswpck5b,0.490,0.984,...,"G/D,F/C,G,F/G,G,F/G,G/D,F/C,G,F/G,G,F/G,G,F,C,...",0,"[Verse 1]\nIt's getting late, have you seen my...","\nIt's getting late, have you seen my mates?\n...","[getting, late, seen, mate, boy, clock, wanna,...","G/D,F/C,G,F/G,G,F/G,G/D,F/C,G,F/G,G,F/G,G,F,C,...",0,2,0,0
7364,12609,6oO4MqRSMxIDFqX1OQnVAz,Sirius XM Top 1000 Classic Rock,7eDSv2WJi5Loto9fIwvthi,MIsc,Boulevard,Jackson Browne,5lkiCO9UQ8B23dZ1o0UV4m,0.541,0.855,...,"Am,Dm,F,E,Am,Am,Dm,F,E,Am,Am,Dm,F,E,Am,Am,Dm,G...",0,[Verse 1]\nDown on the boulevard they take it ...,\nDown on the boulevard they take it hard\nThe...,"[boulevard, take, hard, look, life, disregard,...","Am,Dm,F,E,Am,Am,Dm,F,E,Am,Am,Dm,F,E,Am,Am,Dm,G...",0,9,9,9
7365,13492,5J4ZkQpzMUFojo1CtAZYpn,Top 10000 songs,04SNbStKKx0pM8CF63RHgX,Misc,Love Me Harder,Ariana Grande,66CXWjxzNUsdJxJ2JdwvnR,0.472,0.714,...,"F,Am,Dm,Em,F,Am,Dm,Em,G,F,Am,Dm,Em,G,F,Am,Dm,E...",4,[Verse 1: Ariana Grande]\nTell me something I ...,\nTell me something I need to know\nThen take ...,"[need, take, breath, invade, space, take, plea...","A,C#m,F#m,G#m,A,C#m,F#m,G#m,B,A,C#m,F#m,G#m,B,...",4,9,9,9


# Extract the Emotions from the Song

## Use the Text2Emotion and VADER Packages

In [12]:
print('Extracting Emotions -> ', end='')

# Both helpers receive the string form of each words_for_LDA entry.
# NOTE(review): words_for_LDA appears to hold lists, so str() yields the list's
# repr (brackets and quotes included) - confirm that the helpers expect that.
df['text2emotion'] = df['words_for_LDA'].apply(str).apply(buildText2EmotionJson)
df['vader_emotion'] = df['words_for_LDA'].apply(str).apply(buildVaderEmotionJson)

print('Emotions Extracted')

Extracting Emotions -> Emotions Extracted


# Save the Cleaned DataFrame

In [13]:
# Persist the cleaned DataFrame to disk as a pickle.
# The CSV export is left disabled: df.to_csv('Clean_DF.csv', index=False)
# (a CSV would store list-valued columns such as words_for_LDA as plain strings).
# The context manager guarantees the file is flushed and closed even if
# pickle.dump raises part-way through.
with open("Clean_DF.pickle", "wb") as pickle_out:
    pickle.dump(df, pickle_out)
