# Data Cleaning

In [1]:
import pandas as pd
from nltk import RegexpTokenizer, PorterStemmer
import spotipy
import spotipy.oauth2 as oauth2
import time

### Data Dictionary

|Key|Value Type|Value Description|
|---|---|---|
|song|string|Title of the song.|
|danceability|float|Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.|
|energy|float|Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.|
|key|int|The estimated overall key of the track. Integers map to pitches using standard Pitch Class notation. E.g. 0 = C, 1 = C♯/D♭, 2 = D, and so on. If no key was detected, the value is -1.|
|loudness|float|The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude). Values typically range between -60 and 0 db.|
|mode|int|Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor is 0.|
|speechiness|float|Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. Values below 0.33 most likely represent music and other non-speech-like tracks.|
|acousticness|float|A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic.|
|instrumentalness|float|Predicts whether a track contains no vocals. “Ooh” and “aah” sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly “vocal”. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0.|
|liveness|float|Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live.|
|valence|float|A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).|
|tempo|float|The overall estimated tempo of a track in beats per minute (BPM).|
|duration_sec|float|The duration of the track in seconds.|
|time_signature|int|An estimated overall time signature of a track. The time signature (meter) is a notational convention to specify how many beats are in each bar (or measure).|
|hit|int|1 if a song is a hit (popular) song and 0 if not.|

### Dropped Columns

|Key|Value Type|Value Description|
|---|---|---|
|Unnamed: 0|int|Index from csv|
|type|string|The object type: “audio_features”|
|id|string|The Spotify ID for the track.|
|uri|string|The Spotify URI for the track.|
|track_href|string|A link to the Web API endpoint providing full details of the track.|
|analysis_url|string|An HTTP URL to access the full audio analysis of this track. An access token is required to access this data.|
|duration_ms|int|The duration of the track in milliseconds.

In [305]:
# Set up credentials and token for API environment
credentials = oauth2.SpotifyClientCredentials(
    client_id='ca7d1c8f1d9b4a92a6208ba4984e65f8',       # Client ID provided from developer account page
    client_secret='4fa9bd2b5c204aeab163567faf66d508')   # Client Secret ID provided from developer account page
                                                        # Client Secret ID has since been changed 
token = credentials.get_access_token()

# Create Spotify object
spotify = spotipy.Spotify(auth=token)

### Clean and Merge Audio Feature DataFrames

In [5]:
# Create a function to add song names column to dataframe
def revamp(df):
    column_contents = []                                      # Instantiate empty list
    
    for uri in df['uri']:                                     # For loop to iterate through URIs in URI column
        temp = spotify.track(uri)['name']                     # Spotify API call for track name
        column_contents.append(temp)                          # Append track name to empty list
        time.sleep(3)                                         # Wait 3 seconds before rerunning

    songs = pd.DataFrame(column_contents, columns=['title'])  # Create dataframe of tracks list
    df = df.join(songs, how='outer')                          # Outer join the songs dataframe with the original
    
    # Create a new column with duration column values converted to seconds
    df['duration_sec'] = df['duration_ms'].map(lambda x: x/1000)
    
    # Drop columns that will not be used for analysis
    df.drop(['Unnamed: 0', 'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms'], 
            axis=1, inplace=True)
    
    # Drop all duplicates
    df.drop_duplicates(inplace=True)
    
    # Reorder columns
    df = df[['title', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'key', 
        'time_signature', 'duration_sec']]
    
    # Return new dataframe
    return df

In [327]:
'''
Created a simplified revamp function to run on songfacts dataframes
which is the same as the revamp function above minus the for loop
'''

def revamp_basic(df1, df2):
    df = df1.join(df2, how='outer')
    df['duration_sec'] = df['duration_ms'].map(lambda x: x/1000)
    df.drop(['Unnamed: 0', 'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms'], 
            axis=1, inplace=True)
    df = df[['title', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'key', 
        'time_signature', 'duration_sec']]
    
    return df

In [7]:
# Create a function to clean, manipulate, and concatenate dataframes
# The inputs are 2 dataframes 
def process_master(df1, df2):
    
    # Concatenate dataframes
    master_df = pd.concat([df1, df2])
    
    # Drop all duplicates
    master_df.drop_duplicates(inplace=True)
    return master_df

### 2018 audio features

In [10]:
# Read in data
bb_18_ft = pd.read_csv('./data/2018_billboard_features')

In [11]:
# Run revamp function on data
bb_18_ft = revamp(bb_18_ft)

In [242]:
'''
Creating hit column which will be y variable in modeling
hit = 1 = hit song
hit = 0 = non-hit song
'''
bb_18_ft['hit'] = [1 for i in range(bb_18_ft.shape[0])]

In [246]:
# Check to see data is revamped the way we want it to be
bb_18_ft.head()

Unnamed: 0,title,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key,time_signature,duration_sec,hit
0,God's Plan,0.754,0.449,-9.211,1,0.109,0.0332,8.3e-05,0.552,0.357,77.169,7,4,198.973,1
1,Perfect,0.599,0.448,-6.312,1,0.0232,0.163,0.0,0.106,0.168,95.05,8,3,263.4,1
2,Meant to Be (feat. Florida Georgia Line),0.643,0.783,-6.458,1,0.0856,0.047,0.0,0.083,0.579,154.084,10,4,163.87,1
3,Havana,0.765,0.523,-4.333,1,0.03,0.184,3.6e-05,0.132,0.394,104.988,2,4,217.307,1
4,rockstar (feat. 21 Savage),0.587,0.535,-6.09,0,0.0898,0.117,6.6e-05,0.131,0.14,159.847,5,4,218.147,1


In [247]:
# Create new csv file
bb_18_ft.to_csv('./data/REVAMPED_2018_billboard_features')

In [330]:
# Check for unique values
bb_18_ft['title'].value_counts()

Shape of You                                               1
I Like It                                                  1
Trip                                                       1
Mercy                                                      1
Him & I (with Halsey)                                      1
Boo'd Up                                                   1
Nonstop                                                    1
I Like Me Better                                           1
Say Something                                              1
Meant to Be (feat. Florida Georgia Line)                   1
changes                                                    1
God is a woman                                             1
no tears left to cry                                       1
SAD!                                                       1
Havana                                                     1
Look Alive (feat. Drake)                                   1
I Get The Bag (feat. Mig

### 2018 SongFacts audio features

In [66]:
sf_18_ft = pd.read_csv('./data/2018_songfacts_features')

In [67]:
# Check inital shape of sf_18_ft dataframe
sf_18_ft.shape

(1381, 19)

In [159]:
column_contents2 = []                                     # Instantiate empty list

for ID in sf_18_ft['id']:                                 # For loop to iterate through IDs in ID column
    try:
        temp = spotify.track(ID)['name']                  # Spotify API call for track name
    except:
        temp = ['Error']                                  # Except statement, print ['Error'] when errors occur
    column_contents2.append(temp)                         # Append track name to empty list
    time.sleep(2)                                         # Wait 2 seconds before rerunning
    
songs2 = pd.DataFrame(column_contents2, columns=['title'])

In [170]:
# Drop duplicates
songs2.drop_duplicates(inplace=True)

In [332]:
# Check for unique values
songs2['title'].value_counts()

Life is Beautiful                         1
I'd Rather Have a Love                    1
Damage                                    1
Knee Deep In My Heart                     1
Carolina                                  1
Daedalus (What We Have)                   1
Happy Now                                 1
BANG!                                     1
Fluffy                                    1
Hookah                                    1
I Can't Describe (The Way I Feel)         1
Downtown                                  1
Farrah Fawcett Hair                       1
Interlude: I'm Not Angry Anymore          1
Hello                                     1
I'd Want It To Be Yours                   1
Baby Come Back to Me                      1
Introspection (Edit)                      1
Could It Be I'm Falling In Love           1
I Luv This S**t (Feat. $hamrock)          1
Instant Crush                             1
Love & Meth                               1
It's Hard To Win When You Always

In [175]:
# Run the revamp basic function
sf_18_ft = revamp_basic(sf_18_ft, songs2)

In [248]:
'''
Creating hit column which will be y variable in modeling
hit = 1 = hit song
hit = 0 = non-hit song
'''
sf_18_ft['hit'] = [0 for i in range(sf_18_ft.shape[0])]

In [250]:
sf_18_ft.head()

Unnamed: 0,title,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key,time_signature,duration_sec,hit
0,$20 Fine,0.569,0.887,-8.972,1,0.0397,0.0896,0.431,0.345,0.738,120.949,0,4,299.08,0
1,All Hell Breaks Loose,0.441,0.937,-11.576,0,0.0717,0.0354,0.0,0.0924,0.332,156.942,4,4,106.96,0
2,All I Want for Christmas Is You,0.335,0.625,-7.462,1,0.0386,0.164,0.0,0.0708,0.346,150.277,7,4,241.107,0
3,,0.335,0.625,-7.462,1,0.0386,0.164,0.0,0.0708,0.346,150.277,7,4,241.107,0
4,All I've Ever Needed,0.433,0.31,-8.941,1,0.0305,0.88,0.00405,0.118,0.164,142.845,0,4,236.347,0


In [179]:
# Drop duplicates
sf_18_ft.drop_duplicates(inplace=True)

In [251]:
sf_18_ft.to_csv('./data/REVAMPED_2018_songfacts_features')

### 2018 master audio features

In [252]:
# Run process master function on 2018 billboard and songfacts audio features
master_18_ft = process_master(bb_18_ft, sf_18_ft)

In [253]:
master_18_ft.shape

(1468, 15)

In [307]:
master_18_ft.head()

Unnamed: 0,title,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key,time_signature,duration_sec,hit
0,God's Plan,0.754,0.449,-9.211,1,0.109,0.0332,8.3e-05,0.552,0.357,77.169,7,4,198.973,1
1,Perfect,0.599,0.448,-6.312,1,0.0232,0.163,0.0,0.106,0.168,95.05,8,3,263.4,1
2,Meant to Be (feat. Florida Georgia Line),0.643,0.783,-6.458,1,0.0856,0.047,0.0,0.083,0.579,154.084,10,4,163.87,1
3,Havana,0.765,0.523,-4.333,1,0.03,0.184,3.6e-05,0.132,0.394,104.988,2,4,217.307,1
4,rockstar (feat. 21 Savage),0.587,0.535,-6.09,0,0.0898,0.117,6.6e-05,0.131,0.14,159.847,5,4,218.147,1


In [255]:
# Create master csv file for EDA and modeling
master_18_ft.to_csv('./data/MASTER_2018_audio_features')

### 2013 BillBoard audio features

In [16]:
bb_13_ft = pd.read_csv('./data/2013_billboard_features')

In [17]:
# Run revamp function 
bb_13_ft = revamp(bb_13_ft)

In [256]:
'''
Creating hit column which will be y variable in modeling
hit = 1 = hit song
hit = 0 = non-hit song
'''
bb_13_ft['hit'] = [1 for i in range(bb_13_ft.shape[0])]

In [257]:
bb_13_ft.head()

Unnamed: 0,title,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key,time_signature,duration_sec,hit
0,Thrift Shop (feat. Wanz),0.781,0.526,-6.985,0,0.293,0.0619,0.0,0.0457,0.662,94.992,6,4,235.613,1
1,Blurred Lines,0.862,0.608,-4.762,1,0.0402,0.00373,6e-06,0.0856,0.836,120.002,7,4,263.827,1
2,Radioactive,0.448,0.784,-3.686,1,0.0627,0.106,0.000108,0.668,0.236,136.245,9,4,186.813,1
3,Harlem Shake,0.452,0.794,-5.151,1,0.0483,0.0111,0.00182,0.416,0.282,137.825,0,4,196.664,1
4,Can't Hold Us - feat. Ray Dalton,0.641,0.922,-4.457,1,0.0786,0.0291,0.0,0.0862,0.847,146.078,2,4,258.343,1


In [258]:
bb_13_ft.to_csv('./data/REVAMPED_2013_billboard_features')

In [333]:
# Check for unique values
bb_13_ft['title'].value_counts()

My Songs Know What You Did In The Dark (Light Em Up)         1
Going Bad (feat. Drake)                                      1
Locked Out Of Heaven                                         1
I Love It (& Lil Pump)                                       1
Try Me                                                       1
Let Me Love You                                              1
I Need Your Love (feat. Ellie Goulding)                      1
Still Into You                                               1
Safe And Sound                                               1
Sail                                                         1
Counting Stars                                               1
Wake Me Up                                                   1
Catch My Breath                                              1
Wanted You (feat. Lil Uzi Vert)                              1
Thrift Shop (feat. Wanz)                                     1
Before I Cry                                           

### 2013 SongFacts audio features

In [188]:
sf_13_ft = pd.read_csv('./data/2013_songfacts_features')

In [189]:
# Check initial shape of dataframe
sf_13_ft.shape

(1513, 19)

In [191]:
column_contents = []                                      # Instantiate empty list

for ID in sf_13_ft['id']:                                 # For loop to iterate through IDs in ID column
    try:
        temp = spotify.track(ID)['name']                  # Spotify API call for track name
    except:
        temp = ['Error']                                  # Except statement, print ['Error'] when errors occur
    column_contents.append(temp)                          # Append track name to empty list
    time.sleep(2)                                         # Wait 2 seconds before rerunning
    
songs = pd.DataFrame(column_contents, columns=['title'])

In [192]:
# Check shape of songs dataframe
songs.shape

(1513, 1)

In [195]:
# Drop duplicates
songs.drop_duplicates(inplace=True)

In [334]:
# Check for unique values
songs['title'].value_counts()

Life is Beautiful                         1
I´m a Lover Not a Fighter                 1
Is There Somebody Who Can Watch You       1
Love & Meth                               1
Instant Crush                             1
I Luv This S**t (Feat. $hamrock)          1
Damage                                    1
Knee Deep In My Heart                     1
Carolina                                  1
Daedalus (What We Have)                   1
Happy Now                                 1
BANG!                                     1
Fluffy                                    1
Hookah                                    1
I Can't Describe (The Way I Feel)         1
Downtown                                  1
Farrah Fawcett Hair                       1
Interlude: I'm Not Angry Anymore          1
Hello                                     1
I'd Want It To Be Yours                   1
Baby Come Back to Me                      1
broken                                    1
Come Together - Remastered 2009 

In [197]:
# Run revamp basic function on sf_13_ft and songs dataframes
sf_13_ft = revamp_basic(sf_13_ft, songs)

In [259]:
'''
Creating hit column which will be y variable in modeling
hit = 1 = hit song
hit = 0 = non-hit song
'''
sf_13_ft['hit'] = [0 for i in range(sf_13_ft.shape[0])]

In [320]:
sf_13_ft.head()

Unnamed: 0,title,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key,time_signature,duration_sec,hit
0,Beautiful (feat. Camila Cabello),0.638,0.717,-4.722,1,0.0337,0.346,0.0,0.105,0.249,100.027,2,4,180.0,0
1,All Hell Breaks Loose,0.441,0.937,-11.576,0,0.0717,0.0354,0.0,0.0924,0.332,156.942,4,4,106.96,0
2,All I Want for Christmas Is You,0.335,0.625,-7.462,1,0.0386,0.164,0.0,0.0708,0.346,150.277,7,4,241.107,0
3,,0.335,0.625,-7.462,1,0.0386,0.164,0.0,0.0708,0.346,150.277,7,4,241.107,0
4,All I've Ever Needed,0.433,0.31,-8.941,1,0.0305,0.88,0.00405,0.118,0.164,142.845,0,4,236.347,0


In [200]:
# Drop duplicates
sf_13_ft.drop_duplicates(inplace=True)

In [261]:
sf_13_ft.shape

(1501, 15)

In [324]:
sf_13_ft.to_csv('./data/REVAMPED_2013_songfacts_features')

### 2013 master audio features

In [263]:
# Run process master function on 2013 billboard and songfacts audio features
master_13_ft = process_master(bb_13_ft, sf_13_ft)

In [264]:
master_13_ft.shape

(1601, 15)

In [311]:
master_13_ft.head()

Unnamed: 0,title,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key,time_signature,duration_sec,hit
0,Thrift Shop (feat. Wanz),0.781,0.526,-6.985,0,0.293,0.0619,0.0,0.0457,0.662,94.992,6,4,235.613,1
1,Blurred Lines,0.862,0.608,-4.762,1,0.0402,0.00373,6e-06,0.0856,0.836,120.002,7,4,263.827,1
2,Radioactive,0.448,0.784,-3.686,1,0.0627,0.106,0.000108,0.668,0.236,136.245,9,4,186.813,1
3,Harlem Shake,0.452,0.794,-5.151,1,0.0483,0.0111,0.00182,0.416,0.282,137.825,0,4,196.664,1
4,Can't Hold Us - feat. Ray Dalton,0.641,0.922,-4.457,1,0.0786,0.0291,0.0,0.0862,0.847,146.078,2,4,258.343,1


In [315]:
# Create master csv file for EDA and modeling
master_13_ft.to_csv('./data/MASTER_2013_audio_features')

### 2008 BillBoard audio features

In [34]:
bb_08_ft = pd.read_csv('./data/2008_billboard_features')

In [35]:
bb_08_ft = revamp(bb_08_ft)

In [267]:
'''
Creating hit column which will be y variable in modeling
hit = 1 = hit song
hit = 0 = non-hit song
'''
bb_08_ft['hit'] = [1 for i in range(bb_08_ft.shape[0])]

In [268]:
bb_08_ft.head()

Unnamed: 0,title,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key,time_signature,duration_sec,hit
0,All Time Low,0.584,0.569,-4.259,1,0.166,0.0508,0.0,0.0924,0.501,89.779,0,4,217.603,1
1,Bleeding Love,0.638,0.656,-5.886,1,0.0357,0.188,0.0,0.146,0.225,104.036,5,4,262.467,1
2,No One Compares To You,0.703,0.748,-6.047,1,0.0435,0.123,0.0,0.0642,0.625,111.943,1,4,184.08,1
3,Lollipop,0.828,0.433,-9.716,1,0.199,0.0656,0.000876,0.122,0.44,148.073,0,4,299.333,1
4,Apologize,0.591,0.718,-6.025,1,0.0368,0.348,0.000118,0.107,0.468,117.995,8,4,208.107,1


In [269]:
bb_08_ft.to_csv('./data/REVAMPED_2008_billboard_features')

In [335]:
# Check for unique values
bb_08_ft['title'].value_counts()

Hot N Cold                                                                                  1
Sexy Can I feat. Yung Berg                                                                  1
Lollipop                                                                                    1
Crush A Lot                                                                                 1
Sensual Seduction                                                                           1
Hypnotized                                                                                  1
4 Minutes (feat. Justin Timberlake & Timbaland)                                             1
No One Compares To You                                                                      1
Bubbly                                                                                      1
Don't Know What You Got (Till It's Gone)                                                    1
Crank That (Soulja Boy)                                     

### 2008 SongFacts audio features

In [209]:
sf_08_ft = pd.read_csv('./data/2008_songfacts_features')

In [210]:
# Check initial shape of sf_08_ft dataframe
sf_08_ft.shape

(1548, 19)

In [218]:
# Drop duplicates
sf_08_ft.drop_duplicates(inplace=True)

In [219]:
sf_08_ft.shape

(1478, 19)

In [220]:
column_contents3 = []                                      # Instantiate empty list

for ID in sf_08_ft['id']:                                 # For loop to iterate through IDs in ID column
    try:
        temp = spotify.track(ID)['name']                  # Spotify API call for track name
    except:
        temp = ['Error']                                  # Except statement, print ['Error'] when errors occur
    column_contents3.append(temp)                          # Append track name to empty list
    time.sleep(2)                                         # Wait 2 seconds before rerunning
    
songs3 = pd.DataFrame(column_contents3, columns=['title'])

In [336]:
songs3['title'].value_counts()

Error    1478
Name: title, dtype: int64

In [224]:
sf_08_ft = revamp_basic(sf_08_ft, songs3)

In [270]:
'''
Creating hit column which will be y variable in modeling
hit = 1 = hit song
hit = 0 = non-hit song
'''
sf_08_ft['hit'] = [0 for i in range(sf_08_ft.shape[0])]

In [227]:
sf_08_ft.drop_duplicates(inplace=True)

In [271]:
sf_08_ft.shape

(1478, 15)

In [272]:
sf_08_ft.to_csv('./data/REVAMPED_2008_songfacts_features')

### 2008 master audio features

In [273]:
# Run function on 2008 billboard and songfacts audio features
master_08_ft = process_master(bb_08_ft, sf_08_ft)

In [274]:
master_08_ft.shape

(1577, 15)

In [309]:
master_08_ft.head()

Unnamed: 0,title,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key,time_signature,duration_sec,hit
0,All Time Low,0.584,0.569,-4.259,1.0,0.166,0.0508,0.0,0.0924,0.501,89.779,0.0,4.0,217.603,1
1,Bleeding Love,0.638,0.656,-5.886,1.0,0.0357,0.188,0.0,0.146,0.225,104.036,5.0,4.0,262.467,1
2,No One Compares To You,0.703,0.748,-6.047,1.0,0.0435,0.123,0.0,0.0642,0.625,111.943,1.0,4.0,184.08,1
3,Lollipop,0.828,0.433,-9.716,1.0,0.199,0.0656,0.000876,0.122,0.44,148.073,0.0,4.0,299.333,1
4,Apologize,0.591,0.718,-6.025,1.0,0.0368,0.348,0.000118,0.107,0.468,117.995,8.0,4.0,208.107,1


In [317]:
# Create master csv file for EDA and modeling
master_08_ft.to_csv('./data/MASTER_2008_audio_features')

### Clean, Process and Merge Track Lyrics DataFrames

In [290]:
# Create a function to process and clean lyrics dataframe
def process_master_lyr(df): 
    
    # Instantiate tokenizer with specific regular expression
    tokenizer = RegexpTokenizer(r'\w+')
    # Instantiate stemmer
    stemmer = PorterStemmer()

    # List to append stemmed words
    stemmed = []        
    # List to append tokenized words
    tokenized = []
    
    # Create a for loop to iterate through all the rows in specific column
    for i in df['lyrics']:                          
        
        # Converting lyrics text to tokens
        tokens = tokenizer.tokenize(i.lower()) 
        tokenized.append(tokens)

        # Stemming all tokens
        stems = [stemmer.stem(token) for token in tokens]  
        # Appending stems to stemmed list
        stemmed.append(stems)                                         
    
    # Creating new dataframe columns
    df['tokenized_lyrics'] = [' '.join(i) for i in tokenized]    
    df['stemmed_lyrics'] = [' '.join(i) for i in stemmed]
    
    # Drop unnecessary column
    df.drop('Unnamed: 0', axis=1, inplace=True)
    # Drop duplicates
    df.drop_duplicates(inplace=True)

### 2018 track lyrics

In [42]:
bb_18_lyr = pd.read_csv('./data/2018_billboard_lyrics')

In [280]:
'''
Creating hit column which will be y variable in modeling
hit = 1 = hit song
hit = 0 = non-hit song
'''
bb_18_lyr['hit'] = [1 for i in range(bb_18_lyr.shape[0])]

In [43]:
sf_18_lyr = pd.read_csv('./data/2018_songfacts_lyrics')

In [281]:
'''
Creating hit column which will be y variable in modeling
hit = 1 = hit song
hit = 0 = non-hit song
'''
sf_18_lyr['hit'] = [0 for i in range(sf_18_lyr.shape[0])]

In [282]:
# Concatenate dataframes
master_18_lyr = pd.concat([bb_18_lyr, sf_18_lyr])

In [286]:
# Run function on 2018 master lyrics dataframe
process_master_lyr(master_18_lyr)

In [287]:
master_18_lyr.head()

Unnamed: 0,lyrics,hit,tokenized_lyrics,stemmed_lyrics
0,Yeah they wishin and wishin and wishin and wis...,1,yeah they wishin and wishin and wishin and wis...,yeah they wishin and wishin and wishin and wis...
1,I found a love for me Oh darling just dive rig...,1,i found a love for me oh darling just dive rig...,i found a love for me oh darl just dive right ...
2,Baby lay on back and relax Kick your pretty fe...,1,baby lay on back and relax kick your pretty fe...,babi lay on back and relax kick your pretti fe...
3,Hey Havana ooh na na (ayy) Half of my heart i...,1,hey havana ooh na na ayy half of my heart is i...,hey havana ooh na na ayi half of my heart is i...
4,(Award to the Artist and to the Producer(s) Re...,1,award to the artist and to the producer s reco...,award to the artist and to the produc s record...


In [288]:
master_18_lyr.shape

(1471, 4)

In [289]:
# Create master csv file for EDA and modeling
master_18_lyr.to_csv('./data/MASTER_2018_lyrics')

### 2013 track lyrics

In [49]:
bb_13_lyr = pd.read_csv('./data/2013_billboard_lyrics')

In [291]:
'''
Creating hit column which will be y variable in modeling
hit = 1 = hit song
hit = 0 = non-hit song
'''
bb_13_lyr['hit'] = [1 for i in range(bb_13_lyr.shape[0])]

In [50]:
sf_13_lyr = pd.read_csv('./data/2013_songfacts_lyrics')

In [292]:
'''
Creating hit column which will be y variable in modeling
hit = 1 = hit song
hit = 0 = non-hit song
'''
sf_13_lyr['hit'] = [0 for i in range(sf_13_lyr.shape[0])]

In [293]:
# Concatenate dataframes
master_13_lyr = pd.concat([bb_13_lyr, sf_13_lyr])

In [294]:
# Run function on 2013 master lyrics dataframe
process_master_lyr(master_13_lyr)

In [295]:
master_13_lyr.head()

Unnamed: 0,lyrics,hit,tokenized_lyrics,stemmed_lyrics
0,Hey Macklemore can we go thrift shopping What ...,1,hey macklemore can we go thrift shopping what ...,hey macklemor can we go thrift shop what what ...
1,Everybody get up WOO! Hey hey hey Hey hey hey ...,1,everybody get up woo hey hey hey hey hey hey h...,everybodi get up woo hey hey hey hey hey hey h...
2,Whoah oh Whoah oh Whoah oh Whoah Im waking up...,1,whoah oh whoah oh whoah oh whoah im waking up ...,whoah oh whoah oh whoah oh whoah im wake up to...
3,Con los terroristas tas tas tas tas tas ...,1,con los terroristas tas tas tas tas tas tas ta...,con lo terrorista ta ta ta ta ta ta ta ta ta t...
4,Arent you somethin to admire Cause your shine ...,1,arent you somethin to admire cause your shine ...,arent you somethin to admir caus your shine is...


In [296]:
master_13_lyr.shape

(2791, 4)

In [297]:
# Create master csv file for EDA and modeling
master_13_lyr.to_csv('./data/MASTER_2013_lyrics')

### 2008 track lyrics

In [56]:
bb_08_lyr = pd.read_csv('./data/2008_billboard_lyrics')

In [298]:
'''
Creating hit column which will be y variable in modeling
hit = 1 = hit song
hit = 0 = non-hit song
'''
bb_08_lyr['hit'] = [1 for i in range(bb_08_lyr.shape[0])]

In [57]:
sf_08_lyr = pd.read_csv('./data/2008_songfacts_lyrics')

In [299]:
'''
Creating hit column which will be y variable in modeling
hit = 1 = hit song
hit = 0 = non-hit song
'''
sf_08_lyr['hit'] = [0 for i in range(sf_08_lyr.shape[0])]

In [300]:
# Concatenate dataframes
master_08_lyr = pd.concat([bb_08_lyr, sf_08_lyr])

In [301]:
# Run function on 2008 master lyrics list
process_master_lyr(master_08_lyr)

In [302]:
master_08_lyr.head()

Unnamed: 0,lyrics,hit,tokenized_lyrics,stemmed_lyrics
0,Hmm mmm mmm mmm mmm mmm Let me talk to em let ...,1,hmm mmm mmm mmm mmm mmm let me talk to em let ...,hmm mmm mmm mmm mmm mmm let me talk to em let ...
1,Closed off from love I didnt need the pain Onc...,1,closed off from love i didnt need the pain onc...,close off from love i didnt need the pain onc ...
2,I just want you close Where you can stay forev...,1,i just want you close where you can stay forev...,i just want you close where you can stay forev...
3,Oww! Uh huh No homo Young Mula baby I said hes...,1,oww uh huh no homo young mula baby i said hes ...,oww uh huh no homo young mula babi i said he s...
4,Im holding on your rope got me ten feet off th...,1,im holding on your rope got me ten feet off th...,im hold on your rope got me ten feet off the g...


In [303]:
master_08_lyr.shape

(2033, 4)

In [304]:
# Create master csv file for EDA and modeling
master_08_lyr.to_csv('./data/MASTER_2008_lyrics')