### *Can we predict whether a song is going to be in the top 10 of the Billboard Hot 100?*
- https://data.world/kcmillersean/billboard-hot-100-1958-2017
- Based on song metrics gathered from Spotify
    - duration
    - key
    - mode (major vs minor)
    - time_signature
    - acousticness
        - confidence measure from 0-1
    - danceability
        - confidence measure from 0-1
        - based on tempo, rhythm stability, beat strength, regularity
    - energy
        - confidence measure from 0-1
        - measure of intensity and activity (fast, loud, noisy)
        - dynamic range, loudness, timbre, onset rate, entropy
    - instrumentalness
        - confidence measure from 0-1
        - measure of whether song has vocals or not
    - liveness
    - loudness
    - speechiness
    - valence
    - tempo
    - acceleration to top 10
    - **genre**
    - **most common word**
    - **label**
    - **date**
        

In [167]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import re
import spotipy

In [168]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.precision', 3)

In [169]:
# chart_df = pd.read_csv('/Users/andrewgraves/Data_Science/Metis/Bootcamp/project_3/Music Data/Hot_Stuff.csv')
# chart_df

# del chart_df['url']
# del chart_df['Unnamed: 1']

# with open('chart_df.pkl', 'wb') as to_write:
#     pickle.dump(chart_df, to_write)

In [208]:
with open('chart_df.pkl','rb') as read_file:
    chart_df = pickle.load(read_file)

In [209]:
# pd.to_datetime will convert all dates to 21st century (e.g. 2058-2099),
# so date needs to be manually edited into four-year format.

old_date_list = list(chart_df["WeekID"])
new_date_list = []

for date in old_date_list:
    week_date = date.split("/")
    
    if int(week_date[2]) < 50:
        week_date[2] = "20" + week_date[2]
    else:
        week_date[2] = "19" + week_date[2]
    
    new_date_list.append( "/".join(week_date) )

new_dates = pd.to_datetime(new_date_list, format='%m/%d/%Y')
chart_df['WeekID'] = new_dates

In [172]:
new_dates_series = pd.Series(new_dates)
chart_df['Year'] = new_dates_series.dt.year
chart_df['Month'] = new_dates_series.dt.strftime("%B")

In [173]:
# Create season column:

def season_helper(month):
    if month in ['December', 'January', 'February']:
        return 'Winter'
    if month in ['March', 'April', 'May']:
        return 'Spring'
    if month in ['June', 'July', 'August']:
        return 'Summer'
    else:
        return 'Fall'

In [174]:
chart_df['Season'] = chart_df['Month'].apply(season_helper)

In [175]:
# Create decade column:

def decade_helper(year):
    if year in [1958, 1959]:
        return "50's"
    if year in [1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969]:
        return "60's"
    if year in [1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979]:
        return "70's"
    if year in [1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989]:
        return "80's"
    if year in [1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999]:
        return "90's"
    if year in [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009]:
        return "00's"
    else:
        return "10's"

In [176]:
chart_df['Decade'] = chart_df['Year'].apply(decade_helper)

In [210]:
chart_df

Unnamed: 0,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,1,,1,1
1,1958-08-02,2,Patricia,Perez Prado And His Orchestra,PatriciaPerez Prado And His Orchestra,1,,2,1
2,1958-08-02,3,Splish Splash,Bobby Darin,Splish SplashBobby Darin,1,,3,1
3,1958-08-02,4,Hard Headed Woman,Elvis Presley With The Jordanaires,Hard Headed WomanElvis Presley With The Jordan...,1,,4,1
4,1958-08-02,5,When,Kalin Twins,WhenKalin Twins,1,,5,1
...,...,...,...,...,...,...,...,...,...
317790,2019-06-22,96,Robbery,Juice WRLD,RobberyJuice WRLD,1,87.0,27,17
317791,2019-06-22,97,Baila Baila Baila,Ozuna x Daddy Yankee x J Balvin x Farruko x An...,Baila Baila BailaOzuna x Daddy Yankee x J Balv...,1,89.0,69,7
317792,2019-06-22,98,XanaX Damage,Future,XanaX DamageFuture,1,,98,1
317793,2019-06-22,99,Government Official,Future,Government OfficialFuture,1,,99,1


In [178]:
# Only NaN values are in "Previous Week Position".  These values are 
# intentional for songs that are entering the Hot 100 chart for the 
# first time.  This number is (317795 - 287367 =) 30428.

chart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317795 entries, 0 to 317794
Data columns (total 13 columns):
WeekID                    317795 non-null datetime64[ns]
Week Position             317795 non-null int64
Song                      317795 non-null object
Performer                 317795 non-null object
SongID                    317795 non-null object
Instance                  317795 non-null int64
Previous Week Position    287367 non-null float64
Peak Position             317795 non-null int64
Weeks on Chart            317795 non-null int64
Year                      317795 non-null int64
Month                     317795 non-null object
Season                    317795 non-null object
Decade                    317795 non-null object
dtypes: datetime64[ns](1), float64(1), int64(5), object(6)
memory usage: 31.5+ MB


In [36]:
# For a dataset with 317795 entries, there are only 9553 performers and
# 23410 songs.

print('Number of Performers: ', chart_df['Performer'].nunique())
print('Number of Songs: ', chart_df['Song'].nunique())

Number of Performers:  9553
Number of Songs:  23410


In [37]:
# Given that there are 5000 more song ID's than songs, 
# there must be a number of songs with the same name. 

# This number of song ID's is also 2000 smaller than the number of 
# NaN values in the 'Previous Week Position' value above. This must 
# mean that some songs dropped from the chart initially, but then 
# returned later on.

chart_df['SongID'].nunique()

28193

## Access Spotify Audio Features

- In order to access audio features for each song, the Spotify track ID needs to be obtained.
- Get set of unique song ID's (so least number of rows are looped through), and obtain each track ID from Spotify by providing song and performer for each song ID.

In [270]:
unique_songIDs = chart_df.drop_duplicates(subset=['SongID'])

In [271]:
cleaned_performers2 = []

# for index, value in unique_songIDs['Performer'][:17187].iteritems():
#     if 'Featuring' in value:
#         cleaned_performers3.append(value.split(' Featuring')[0])
#     elif 'featuring' in value:
#         cleaned_performers3.append(value.split(' featuring')[0])
#     else:
#         cleaned_performers3.append(value)

for index, value in unique_songIDs['Performer'].iteritems():
    if 'Featuring' in value:
        cleaned_performers2.append(value.split(' Featuring')[0])
    elif 'featuring' in value:
        cleaned_performers2.append(value.split(' featuring')[0])
    elif '&' in value:
        cleaned_performers2.append(value.split(' &')[0])
    else:
        cleaned_performers2.append(value)
        
len(cleaned_performers2)

28193

In [272]:
unique_songIDs['Performer'] = pd.Series(cleaned_performers2, index = unique_songIDs.index)
unique_songIDs[0:100]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart,Month,Year,Season,Decade
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,1,,1,1,August,1958,Summer,50's
1,1958-08-02,2,Patricia,Perez Prado And His Orchestra,PatriciaPerez Prado And His Orchestra,1,,2,1,August,1958,Summer,50's
2,1958-08-02,3,Splish Splash,Bobby Darin,Splish SplashBobby Darin,1,,3,1,August,1958,Summer,50's
3,1958-08-02,4,Hard Headed Woman,Elvis Presley With The Jordanaires,Hard Headed WomanElvis Presley With The Jordan...,1,,4,1,August,1958,Summer,50's
4,1958-08-02,5,When,Kalin Twins,WhenKalin Twins,1,,5,1,August,1958,Summer,50's
5,1958-08-02,6,Rebel-'rouser,Duane Eddy His Twangy Guitar And The Rebels,Rebel-'rouserDuane Eddy His Twangy Guitar And ...,1,,6,1,August,1958,Summer,50's
6,1958-08-02,7,Yakety Yak,The Coasters,Yakety YakThe Coasters,1,,7,1,August,1958,Summer,50's
7,1958-08-02,8,My True Love,Jack Scott,My True LoveJack Scott,1,,8,1,August,1958,Summer,50's
8,1958-08-02,9,Willie And The Hand Jive,The Johnny Otis Show,Willie And The Hand JiveThe Johnny Otis Show,1,,9,1,August,1958,Summer,50's
9,1958-08-02,10,Fever,Peggy Lee,FeverPeggy Lee,1,,10,1,August,1958,Summer,50's


In [25]:
from spotipy.oauth2 import SpotifyClientCredentials

client_credentials_manager = SpotifyClientCredentials(client_id = 'b8c2b532f4ad4c488e6546e73d302cb5', client_secret = '1088a3274c8a4859955bb8a604c38bbc')
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [26]:
track_id_list2 = []
artist_track_id_list2 = []

for index, rows in unique_songIDs.iterrows():
    artist = rows['Performer']
    track = rows['Song']
    print(artist)
#     print(track)
    track_info = sp.search(q='artist:' + artist + ' track:' + track, type='track')
    for key1, value1 in track_info.items():
        artist_track_id_list2 = []
        for key2, value2 in value1.items():
            if key2 == 'items':
                for i in value2:
                    for key3, value3 in i.items():
                        if key3 == 'id':
                            artist_track_id_list2.append(value3)
                if len(artist_track_id_list2) > 0:  
                    track_id_list2.append(artist_track_id_list2[0])
                else:
                    track_id_list2.append(np.nan)

Ricky Nelson
Perez Prado And His Orchestra
Bobby Darin
Elvis Presley With The Jordanaires
Kalin Twins
Duane Eddy His Twangy Guitar And The Rebels
The Coasters
Jack Scott
The Johnny Otis Show
Peggy Lee
Frankie Avalon
Jimmy Clanton And His Rockets
Patti Page
Pat Boone
Jerry Butler and The Impressions
The Danleers
Jody Reynolds
The Elegants
Doris Day
Bobby Freeman
Johnny Cash And The Tennessee Two
Johnny Mathis
The Olympics
Sheb Wooley
Chuck Willis
Poni-Tails
The Crickets
Jimmie Rodgers
The Four Lads
Dean Martin
Big Bopper
Don Gibson
Tony And Joe
Elvis Presley With The Jordanaires
Bobby Day
Gerry Granahan
Perry Como
Buddy Knox with the Rhythm Orchids
Pat Boone
Bobby Hamilton
Buddy Holly
The Slades
Clyde McPhatter
Jan
The Diamonds
Gino
Dion
Dean Martin
The Everly Brothers
Jimmy Bowen with the Rhythm Orchids
The Kirby Stone Four
The Rinky-Dinks
Nat King Cole
Domenico Modugno
The Three G's
Jack Scott
Eydie Gorme
The Crickets
Tony Bennett
Bobby Day
Bobby Hendricks
Ed Townsend
Jerry Lee Lewis 

In [470]:
track_id_list.count(np.nan)

6739

In [473]:
len(track_id_list)

28193

In [27]:
track_id_list2.count(np.nan)

6385

In [28]:
len(track_id_list2)

28193

In [526]:
track_id_list3.count(np.nan)

6548

In [273]:
unique_songIDs.insert(5, column = "TrackID", value = track_id_list2)

In [274]:
# with open('unique_songIDs2.pkl', 'wb') as to_write:
#     pickle.dump(unique_songIDs, to_write)

In [197]:
with open('unique_songIDs2.pkl', 'rb') as read_file:
    unique_songIDs = pickle.load(read_file)

In [33]:
test_features = sp.audio_features(unique_songIDs['TrackID'][2])

In [39]:
# for index, value in unique_songIDs['Track_ID'][:10].iteritems():
#     try:
#         track_features = sp.audio_features(value)
#         track_features_dict = track_features[0]
#         for k, v in track_features_dict.items():
#             print(k)
#             track_features_dict[k] = audio_feature_dict.get(k,v)
#     except:
#         audio_feature_dict.update(dict.fromkeys['danceability',     \
#                                         'energy', 'key',     \
#                                         'loudness', 'mode',     \
#                                         'speechiness',     \
#                                         'acousticness',     \
#                                         'instrumentalness',     \
#                                         'liveness', 'valence',     \
#                                         'tempo', 'type',     \
#                                         'duration_ms',     \
#                                         'time_signature'], np.nan)

In [41]:
danceability = []
energy = [] 
key = [] 
loudness = [] 
mode = [] 
speechiness = [] 
acousticness = [] 
instrumentalness = [] 
liveness = [] 
valence = [] 
tempo = [] 
type_ = [] 
duration_ms = [] 
time_signature = []

for index, value in unique_songIDs['TrackID'].iteritems():
    print(index)
    try:
        track_features = sp.audio_features(value)
        track_features_dict = track_features[0]
        for k, v in track_features_dict.items():
            if k == 'danceability':
                danceability.append(v)
            elif k == 'energy':
                energy.append(v)
            elif k == 'key':
                key.append(v)
            elif k == 'loudness':
                loudness.append(v)
            elif k == 'mode':
                mode.append(v)
            elif k == 'spechiness':
                speechiness.append(v)
            elif k == 'acousticness':
                acousticness.append(v)
            elif k == 'instrumentalness':
                instrumentalness.append(v)
            elif k == 'liveness':
                liveness.append(v)
            elif k == 'valence':
                valence.append(v)
            elif k == 'tempo':
                tempo.append(v)
            elif k == 'type':
                type_.append(v)
            elif k == 'duration_ms':
                duration_ms.append(v)
            elif k == 'time_signature':
                time_signature.append(v)
    except:
            danceability.append(np.nan)
            energy.append(np.nan)
            key.append(np.nan)
            loudness.append(np.nan)
            mode.append(np.nan)
            speechiness.append(np.nan)
            acousticness.append(np.nan)
            instrumentalness.append(np.nan)
            liveness.append(np.nan)
            valence.append(np.nan)
            tempo.append(np.nan)
            type_.append(np.nan)
            duration_ms.append(np.nan)
            time_signature.append(np.nan)            

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
141
142
150
157
166
172
175
177
180
183
184
185
186
187
191
193
194
195
196
198
251
259
266
270
273
275
280
287
288
289
290
293
294
295
296
297
298
299
356
361
364
371
373
377
379
380
381
390
393
394
396
399
473
478
479
484
492
493
494
495
496
497
498
499
571
584
587
591
592
593
594
595
658
660
664
669
671
672
675
683
686
691
697
753
755
759
768
773
775
776
780
783
784
787
791
792
795
796
797
799
858
859
865
871
873
878
882
886
888
890
891
895
899
968
972
986
987
989
991
992
994
995
996
997
998
999
1069
1081
1084
1085
1086
1088
1093
1095
1096
1099
1150
1173
1175
1185
1189
1194
1195
1197
1266
1271
1281
1287
1288
1293
1294
1296
1353
1364
1376
1385
1387
1391
1397
1399
1429
1450
1466
1478
1479
1480
1491
1

In [320]:
# speechiness was printed only 25% of the time, as it mostly has to 
# do with spoken word/podcasts.  It will deleted for now.

audio_feature_dict = {'Danceability': danceability, 'Energy': energy, 'Key': key,    \
                      'Loudness': loudness, 'Mode': mode,    \
                      'Acousticness': acousticness, 'Instrumentalness': instrumentalness,     \
                      'Liveness': liveness, 'Valence': valence, 'Tempo': tempo,    \
                      'Duration_ms': duration_ms, 'Time_Signature': time_signature}


In [321]:
len(time_signature)

28193

In [322]:
audio_features_df = pd.DataFrame(audio_feature_dict, index = unique_songIDs.index)
audio_features_df

Unnamed: 0,Danceability,Energy,Key,Loudness,Mode,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_ms,Time_Signature
0,0.543,0.332,0.0,-11.573,1.0,0.669,0.000e+00,0.134,0.795,154.999,153933.0,4.0
1,,,,,,,,,,,,
2,0.645,0.943,0.0,-1.526,1.0,0.385,0.000e+00,0.370,0.965,147.768,131720.0,4.0
3,,,,,,,,,,,,
4,0.646,0.582,6.0,-12.679,1.0,0.168,4.550e-06,0.976,0.963,96.490,144573.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
317786,0.442,0.445,10.0,-4.071,0.0,0.653,0.000e+00,0.118,0.423,85.744,179937.0,4.0
317787,0.795,0.496,0.0,-5.883,1.0,0.108,0.000e+00,0.065,0.874,94.010,183000.0,4.0
317792,0.551,0.609,7.0,-8.975,1.0,0.368,1.540e-04,0.192,0.324,139.084,104013.0,4.0
317793,0.801,0.774,1.0,-5.125,1.0,0.036,3.030e-06,0.114,0.321,144.988,150760.0,4.0


In [196]:
audio_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28193 entries, 0 to 317794
Data columns (total 12 columns):
Danceability        21786 non-null float64
Energy              21786 non-null float64
Key                 21786 non-null object
Loudness            21786 non-null float64
Mode                21786 non-null object
Acousticness        21786 non-null float64
Instrumentalness    21786 non-null float64
Liveness            21786 non-null float64
Valence             21786 non-null float64
Tempo               21786 non-null float64
Duration_min        21786 non-null float64
Time_Signature      21786 non-null object
dtypes: float64(9), object(3)
memory usage: 2.8+ MB


**Update audio features so they can be analyzed (e.g. convert key, mode, and time signature to categories, and convert duration to minutes)**

In [190]:
audio_features_df['Mode'].replace(1.0, 'Major', inplace=True)
audio_features_df['Mode'].replace(0.0, 'Minor', inplace=True)
audio_features_df['Key'].replace(0.0, 'C', inplace=True)
audio_features_df['Key'].replace(1.0, 'C#', inplace=True)
audio_features_df['Key'].replace(2.0, 'D', inplace=True)
audio_features_df['Key'].replace(3.0, 'D#', inplace=True)
audio_features_df['Key'].replace(4.0, 'E', inplace=True)
audio_features_df['Key'].replace(5.0, 'F', inplace=True)
audio_features_df['Key'].replace(6.0, 'F#', inplace=True)
audio_features_df['Key'].replace(7.0, 'G', inplace=True)
audio_features_df['Key'].replace(8.0, 'G#', inplace=True)
audio_features_df['Key'].replace(9.0, 'A', inplace=True)
audio_features_df['Key'].replace(10.0, 'Bb', inplace=True)
audio_features_df['Key'].replace(11.0, 'B', inplace=True)
audio_features_df['Time_Signature'].replace(1.0, 4.0, inplace=True)
audio_features_df['Time_Signature'].replace(0.0, 4.0, inplace=True)
audio_features_df['Time_Signature'].replace(3.0, '3/4', inplace=True)
audio_features_df['Time_Signature'].replace(4.0, '4/4', inplace=True)
audio_features_df['Time_Signature'].replace(5.0, '5/4', inplace=True)
audio_features_df['Duration_ms'] = audio_features_df['Duration_ms']/60000
audio_features_df.rename(columns={'Duration_ms':'Duration_min'}, inplace=True)

In [193]:
audio_features_df

Unnamed: 0,Danceability,Energy,Key,Loudness,Mode,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Time_Signature
0,0.543,0.332,C,-11.573,Major,0.669,0.000e+00,0.134,0.795,154.999,2.566,4/4
1,,,,,,,,,,,,
2,0.645,0.943,C,-1.526,Major,0.385,0.000e+00,0.370,0.965,147.768,2.195,4/4
3,,,,,,,,,,,,
4,0.646,0.582,F#,-12.679,Major,0.168,4.550e-06,0.976,0.963,96.490,2.410,4/4
...,...,...,...,...,...,...,...,...,...,...,...,...
317786,0.442,0.445,Bb,-4.071,Minor,0.653,0.000e+00,0.118,0.423,85.744,2.999,4/4
317787,0.795,0.496,C,-5.883,Major,0.108,0.000e+00,0.065,0.874,94.010,3.050,4/4
317792,0.551,0.609,G,-8.975,Major,0.368,1.540e-04,0.192,0.324,139.084,1.734,4/4
317793,0.801,0.774,C#,-5.125,Major,0.036,3.030e-06,0.114,0.321,144.988,2.513,4/4


In [194]:
# with open('audio_features2.pkl', 'wb') as to_write:
#     pickle.dump(audio_features_df, to_write)

In [195]:
with open('audio_features2.pkl','rb') as read_file:
    audio_features_df = pickle.load(read_file)

In [324]:
unique_songIDs

Unnamed: 0,WeekID,Week Position,Song,Performer,SongID,TrackID,Instance,Previous Week Position,Peak Position,Weeks on Chart,Month,Year,Season,Decade
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,5ayybTSXNwcarDtxQKqvWX,1,,1,1,August,1958,Summer,50's
1,1958-08-02,2,Patricia,Perez Prado And His Orchestra,PatriciaPerez Prado And His Orchestra,,1,,2,1,August,1958,Summer,50's
2,1958-08-02,3,Splish Splash,Bobby Darin,Splish SplashBobby Darin,40fD7ct05FvQHLdQTgJelG,1,,3,1,August,1958,Summer,50's
3,1958-08-02,4,Hard Headed Woman,Elvis Presley With The Jordanaires,Hard Headed WomanElvis Presley With The Jordan...,,1,,4,1,August,1958,Summer,50's
4,1958-08-02,5,When,Kalin Twins,WhenKalin Twins,1BQOMZeYwzesvXANmRgrEp,1,,5,1,August,1958,Summer,50's
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317786,2019-06-22,92,Amor Genuino,Ozuna,Amor GenuinoOzuna,6pK5edDoyrn4mWwRSVc2a5,1,,92,1,June,2019,Summer,10's
317787,2019-06-22,93,Only Human,Jonas Brothers,Only HumanJonas Brothers,0DiDStADDVh3SvAsoJAFMk,1,,93,1,June,2019,Summer,10's
317792,2019-06-22,98,XanaX Damage,Future,XanaX DamageFuture,3tZ4zv4UuenjJGSNUMfBNG,1,,98,1,June,2019,Summer,10's
317793,2019-06-22,99,Government Official,Future,Government OfficialFuture,2QEnZJ8FwpBoyLKcFcgceT,1,,99,1,June,2019,Summer,10's


In [198]:
unique_songIDs_full = unique_songIDs.join(audio_features_df)
unique_songIDs_full

Unnamed: 0,WeekID,Week Position,Song,Performer,SongID,TrackID,Instance,Previous Week Position,Peak Position,Weeks on Chart,Month,Year,Season,Decade,Danceability,Energy,Key,Loudness,Mode,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Time_Signature
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,5ayybTSXNwcarDtxQKqvWX,1,,1,1,August,1958,Summer,50's,0.543,0.332,C,-11.573,Major,0.669,0.000e+00,0.134,0.795,154.999,2.566,4/4
1,1958-08-02,2,Patricia,Perez Prado And His Orchestra,PatriciaPerez Prado And His Orchestra,,1,,2,1,August,1958,Summer,50's,,,,,,,,,,,,
2,1958-08-02,3,Splish Splash,Bobby Darin,Splish SplashBobby Darin,40fD7ct05FvQHLdQTgJelG,1,,3,1,August,1958,Summer,50's,0.645,0.943,C,-1.526,Major,0.385,0.000e+00,0.370,0.965,147.768,2.195,4/4
3,1958-08-02,4,Hard Headed Woman,Elvis Presley With The Jordanaires,Hard Headed WomanElvis Presley With The Jordan...,,1,,4,1,August,1958,Summer,50's,,,,,,,,,,,,
4,1958-08-02,5,When,Kalin Twins,WhenKalin Twins,1BQOMZeYwzesvXANmRgrEp,1,,5,1,August,1958,Summer,50's,0.646,0.582,F#,-12.679,Major,0.168,4.550e-06,0.976,0.963,96.490,2.410,4/4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317786,2019-06-22,92,Amor Genuino,Ozuna,Amor GenuinoOzuna,6pK5edDoyrn4mWwRSVc2a5,1,,92,1,June,2019,Summer,10's,0.442,0.445,Bb,-4.071,Minor,0.653,0.000e+00,0.118,0.423,85.744,2.999,4/4
317787,2019-06-22,93,Only Human,Jonas Brothers,Only HumanJonas Brothers,0DiDStADDVh3SvAsoJAFMk,1,,93,1,June,2019,Summer,10's,0.795,0.496,C,-5.883,Major,0.108,0.000e+00,0.065,0.874,94.010,3.050,4/4
317792,2019-06-22,98,XanaX Damage,Future,XanaX DamageFuture,3tZ4zv4UuenjJGSNUMfBNG,1,,98,1,June,2019,Summer,10's,0.551,0.609,G,-8.975,Major,0.368,1.540e-04,0.192,0.324,139.084,1.734,4/4
317793,2019-06-22,99,Government Official,Future,Government OfficialFuture,2QEnZJ8FwpBoyLKcFcgceT,1,,99,1,June,2019,Summer,10's,0.801,0.774,C#,-5.125,Major,0.036,3.030e-06,0.114,0.321,144.988,2.513,4/4


In [199]:
# with open('unique_songIDs_full2.pkl', 'wb') as to_write:
#     pickle.dump(unique_songIDs_full, to_write)

In [200]:
with open('unique_songIDs_full2.pkl', 'rb') as read_file:
    unique_songIDs_full = pickle.load(read_file)

In [201]:
unique_songIDs_full

Unnamed: 0,WeekID,Week Position,Song,Performer,SongID,TrackID,Instance,Previous Week Position,Peak Position,Weeks on Chart,Month,Year,Season,Decade,Danceability,Energy,Key,Loudness,Mode,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Time_Signature
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,5ayybTSXNwcarDtxQKqvWX,1,,1,1,August,1958,Summer,50's,0.543,0.332,C,-11.573,Major,0.669,0.000e+00,0.134,0.795,154.999,2.566,4/4
1,1958-08-02,2,Patricia,Perez Prado And His Orchestra,PatriciaPerez Prado And His Orchestra,,1,,2,1,August,1958,Summer,50's,,,,,,,,,,,,
2,1958-08-02,3,Splish Splash,Bobby Darin,Splish SplashBobby Darin,40fD7ct05FvQHLdQTgJelG,1,,3,1,August,1958,Summer,50's,0.645,0.943,C,-1.526,Major,0.385,0.000e+00,0.370,0.965,147.768,2.195,4/4
3,1958-08-02,4,Hard Headed Woman,Elvis Presley With The Jordanaires,Hard Headed WomanElvis Presley With The Jordan...,,1,,4,1,August,1958,Summer,50's,,,,,,,,,,,,
4,1958-08-02,5,When,Kalin Twins,WhenKalin Twins,1BQOMZeYwzesvXANmRgrEp,1,,5,1,August,1958,Summer,50's,0.646,0.582,F#,-12.679,Major,0.168,4.550e-06,0.976,0.963,96.490,2.410,4/4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317786,2019-06-22,92,Amor Genuino,Ozuna,Amor GenuinoOzuna,6pK5edDoyrn4mWwRSVc2a5,1,,92,1,June,2019,Summer,10's,0.442,0.445,Bb,-4.071,Minor,0.653,0.000e+00,0.118,0.423,85.744,2.999,4/4
317787,2019-06-22,93,Only Human,Jonas Brothers,Only HumanJonas Brothers,0DiDStADDVh3SvAsoJAFMk,1,,93,1,June,2019,Summer,10's,0.795,0.496,C,-5.883,Major,0.108,0.000e+00,0.065,0.874,94.010,3.050,4/4
317792,2019-06-22,98,XanaX Damage,Future,XanaX DamageFuture,3tZ4zv4UuenjJGSNUMfBNG,1,,98,1,June,2019,Summer,10's,0.551,0.609,G,-8.975,Major,0.368,1.540e-04,0.192,0.324,139.084,1.734,4/4
317793,2019-06-22,99,Government Official,Future,Government OfficialFuture,2QEnZJ8FwpBoyLKcFcgceT,1,,99,1,June,2019,Summer,10's,0.801,0.774,C#,-5.125,Major,0.036,3.030e-06,0.114,0.321,144.988,2.513,4/4


In [202]:
unique_songIDs_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28193 entries, 0 to 317794
Data columns (total 26 columns):
WeekID                    28193 non-null datetime64[ns]
Week Position             28193 non-null int64
Song                      28193 non-null object
Performer                 28193 non-null object
SongID                    28193 non-null object
TrackID                   21808 non-null object
Instance                  28193 non-null int64
Previous Week Position    13 non-null float64
Peak Position             28193 non-null int64
Weeks on Chart            28193 non-null int64
Month                     28193 non-null object
Year                      28193 non-null int64
Season                    28193 non-null object
Decade                    28193 non-null object
Danceability              21786 non-null float64
Energy                    21786 non-null float64
Key                       21786 non-null object
Loudness                  21786 non-null float64
Mode                  

In [203]:
unique_songIDs_full.dropna(thresh=15, inplace=True);

In [204]:
unique_songIDs_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21786 entries, 0 to 317794
Data columns (total 26 columns):
WeekID                    21786 non-null datetime64[ns]
Week Position             21786 non-null int64
Song                      21786 non-null object
Performer                 21786 non-null object
SongID                    21786 non-null object
TrackID                   21786 non-null object
Instance                  21786 non-null int64
Previous Week Position    9 non-null float64
Peak Position             21786 non-null int64
Weeks on Chart            21786 non-null int64
Month                     21786 non-null object
Year                      21786 non-null int64
Season                    21786 non-null object
Decade                    21786 non-null object
Danceability              21786 non-null float64
Energy                    21786 non-null float64
Key                       21786 non-null object
Loudness                  21786 non-null float64
Mode                   

In [205]:
# Delete these columns to allow for merge with main chart_df

del unique_songIDs_full['WeekID']
del unique_songIDs_full['Week Position']
del unique_songIDs_full['Song']
del unique_songIDs_full['Performer']
del unique_songIDs_full['Instance']
del unique_songIDs_full['Previous Week Position']
del unique_songIDs_full['Peak Position']
del unique_songIDs_full['Weeks on Chart']
del unique_songIDs_full['Month']
del unique_songIDs_full['Year']
del unique_songIDs_full['Season']
del unique_songIDs_full['Decade']


In [206]:
unique_songIDs_full

Unnamed: 0,SongID,TrackID,Danceability,Energy,Key,Loudness,Mode,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Time_Signature
0,Poor Little FoolRicky Nelson,5ayybTSXNwcarDtxQKqvWX,0.543,0.332,C,-11.573,Major,0.669,0.000e+00,0.134,0.795,154.999,2.566,4/4
2,Splish SplashBobby Darin,40fD7ct05FvQHLdQTgJelG,0.645,0.943,C,-1.526,Major,0.385,0.000e+00,0.370,0.965,147.768,2.195,4/4
4,WhenKalin Twins,1BQOMZeYwzesvXANmRgrEp,0.646,0.582,F#,-12.679,Major,0.168,4.550e-06,0.976,0.963,96.490,2.410,4/4
6,Yakety YakThe Coasters,7demHL0GXA6YmTNqw3Btz8,0.715,0.669,G,-9.491,Major,0.705,7.320e-04,0.044,0.976,120.789,1.884,4/4
7,My True LoveJack Scott,2Un1O9CChp4wyTmt422XwW,0.548,0.253,E,-11.387,Major,0.871,9.880e-05,0.138,0.238,68.184,2.750,3/4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317786,Amor GenuinoOzuna,6pK5edDoyrn4mWwRSVc2a5,0.442,0.445,Bb,-4.071,Minor,0.653,0.000e+00,0.118,0.423,85.744,2.999,4/4
317787,Only HumanJonas Brothers,0DiDStADDVh3SvAsoJAFMk,0.795,0.496,C,-5.883,Major,0.108,0.000e+00,0.065,0.874,94.010,3.050,4/4
317792,XanaX DamageFuture,3tZ4zv4UuenjJGSNUMfBNG,0.551,0.609,G,-8.975,Major,0.368,1.540e-04,0.192,0.324,139.084,1.734,4/4
317793,Government OfficialFuture,2QEnZJ8FwpBoyLKcFcgceT,0.801,0.774,C#,-5.125,Major,0.036,3.030e-06,0.114,0.321,144.988,2.513,4/4


In [211]:
chart_df

Unnamed: 0,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,1,,1,1
1,1958-08-02,2,Patricia,Perez Prado And His Orchestra,PatriciaPerez Prado And His Orchestra,1,,2,1
2,1958-08-02,3,Splish Splash,Bobby Darin,Splish SplashBobby Darin,1,,3,1
3,1958-08-02,4,Hard Headed Woman,Elvis Presley With The Jordanaires,Hard Headed WomanElvis Presley With The Jordan...,1,,4,1
4,1958-08-02,5,When,Kalin Twins,WhenKalin Twins,1,,5,1
...,...,...,...,...,...,...,...,...,...
317790,2019-06-22,96,Robbery,Juice WRLD,RobberyJuice WRLD,1,87.0,27,17
317791,2019-06-22,97,Baila Baila Baila,Ozuna x Daddy Yankee x J Balvin x Farruko x An...,Baila Baila BailaOzuna x Daddy Yankee x J Balv...,1,89.0,69,7
317792,2019-06-22,98,XanaX Damage,Future,XanaX DamageFuture,1,,98,1
317793,2019-06-22,99,Government Official,Future,Government OfficialFuture,1,,99,1


In [212]:
# Merge audio features with complete chart_df

chart_total_df = chart_df.merge(unique_songIDs_full, how='left', on='SongID')

In [213]:
chart_total_df

Unnamed: 0,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart,TrackID,Danceability,Energy,Key,Loudness,Mode,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Time_Signature
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,1,,1,1,5ayybTSXNwcarDtxQKqvWX,0.543,0.332,C,-11.573,Major,0.669,0.000e+00,0.134,0.795,154.999,2.566,4/4
1,1958-08-02,2,Patricia,Perez Prado And His Orchestra,PatriciaPerez Prado And His Orchestra,1,,2,1,,,,,,,,,,,,,
2,1958-08-02,3,Splish Splash,Bobby Darin,Splish SplashBobby Darin,1,,3,1,40fD7ct05FvQHLdQTgJelG,0.645,0.943,C,-1.526,Major,0.385,0.000e+00,0.370,0.965,147.768,2.195,4/4
3,1958-08-02,4,Hard Headed Woman,Elvis Presley With The Jordanaires,Hard Headed WomanElvis Presley With The Jordan...,1,,4,1,,,,,,,,,,,,,
4,1958-08-02,5,When,Kalin Twins,WhenKalin Twins,1,,5,1,1BQOMZeYwzesvXANmRgrEp,0.646,0.582,F#,-12.679,Major,0.168,4.550e-06,0.976,0.963,96.490,2.410,4/4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317790,2019-06-22,96,Robbery,Juice WRLD,RobberyJuice WRLD,1,87.0,27,17,6Hj9jySrnFppAI0sEMCZpJ,0.685,0.692,D,-5.122,Major,0.328,0.000e+00,0.153,0.578,159.966,4.001,5/4
317791,2019-06-22,97,Baila Baila Baila,Ozuna x Daddy Yankee x J Balvin x Farruko x An...,Baila Baila BailaOzuna x Daddy Yankee x J Balv...,1,89.0,69,7,,,,,,,,,,,,,
317792,2019-06-22,98,XanaX Damage,Future,XanaX DamageFuture,1,,98,1,3tZ4zv4UuenjJGSNUMfBNG,0.551,0.609,G,-8.975,Major,0.368,1.540e-04,0.192,0.324,139.084,1.734,4/4
317793,2019-06-22,99,Government Official,Future,Government OfficialFuture,1,,99,1,2QEnZJ8FwpBoyLKcFcgceT,0.801,0.774,C#,-5.125,Major,0.036,3.030e-06,0.114,0.321,144.988,2.513,4/4


In [214]:
chart_total_df.dropna(thresh=15, inplace=True);

In [215]:
chart_total_df

Unnamed: 0,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart,TrackID,Danceability,Energy,Key,Loudness,Mode,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Time_Signature
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,1,,1,1,5ayybTSXNwcarDtxQKqvWX,0.543,0.332,C,-11.573,Major,0.669,0.000e+00,0.134,0.795,154.999,2.566,4/4
2,1958-08-02,3,Splish Splash,Bobby Darin,Splish SplashBobby Darin,1,,3,1,40fD7ct05FvQHLdQTgJelG,0.645,0.943,C,-1.526,Major,0.385,0.000e+00,0.370,0.965,147.768,2.195,4/4
4,1958-08-02,5,When,Kalin Twins,WhenKalin Twins,1,,5,1,1BQOMZeYwzesvXANmRgrEp,0.646,0.582,F#,-12.679,Major,0.168,4.550e-06,0.976,0.963,96.490,2.410,4/4
6,1958-08-02,7,Yakety Yak,The Coasters,Yakety YakThe Coasters,1,,7,1,7demHL0GXA6YmTNqw3Btz8,0.715,0.669,G,-9.491,Major,0.705,7.320e-04,0.044,0.976,120.789,1.884,4/4
7,1958-08-02,8,My True Love,Jack Scott,My True LoveJack Scott,1,,8,1,2Un1O9CChp4wyTmt422XwW,0.548,0.253,E,-11.387,Major,0.871,9.880e-05,0.138,0.238,68.184,2.750,3/4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317789,2019-06-22,95,La La Land,Bryce Vine Featuring YG,La La LandBryce Vine Featuring YG,1,92.0,92,2,4oM9DGUTrRaXJDypxxHztZ,0.774,0.473,C#,-5.122,Minor,0.221,0.000e+00,0.222,0.416,97.953,2.954,4/4
317790,2019-06-22,96,Robbery,Juice WRLD,RobberyJuice WRLD,1,87.0,27,17,6Hj9jySrnFppAI0sEMCZpJ,0.685,0.692,D,-5.122,Major,0.328,0.000e+00,0.153,0.578,159.966,4.001,5/4
317792,2019-06-22,98,XanaX Damage,Future,XanaX DamageFuture,1,,98,1,3tZ4zv4UuenjJGSNUMfBNG,0.551,0.609,G,-8.975,Major,0.368,1.540e-04,0.192,0.324,139.084,1.734,4/4
317793,2019-06-22,99,Government Official,Future,Government OfficialFuture,1,,99,1,2QEnZJ8FwpBoyLKcFcgceT,0.801,0.774,C#,-5.125,Major,0.036,3.030e-06,0.114,0.321,144.988,2.513,4/4


In [216]:
chart_total_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 258797 entries, 0 to 317794
Data columns (total 22 columns):
WeekID                    258797 non-null datetime64[ns]
Week Position             258797 non-null int64
Song                      258797 non-null object
Performer                 258797 non-null object
SongID                    258797 non-null object
Instance                  258797 non-null int64
Previous Week Position    235155 non-null float64
Peak Position             258797 non-null int64
Weeks on Chart            258797 non-null int64
TrackID                   258797 non-null object
Danceability              258797 non-null float64
Energy                    258797 non-null float64
Key                       258797 non-null object
Loudness                  258797 non-null float64
Mode                      258797 non-null object
Acousticness              258797 non-null float64
Instrumentalness          258797 non-null float64
Liveness                  258797 non-null floa

In [217]:
# with open('chart_total_df2.pkl', 'wb') as to_write:
#     pickle.dump(chart_total_df, to_write)

In [218]:
with open('chart_total_df2.pkl', 'rb') as read_file:
    chart_total_df = pickle.load(read_file)

In [219]:
columns_list = []

for name in chart_total_df.columns:
    columns_list.append(name.replace(" ", "_"))
    
chart_total_df.columns = columns_list  

In [220]:
chart_total_df

Unnamed: 0,WeekID,Week_Position,Song,Performer,SongID,Instance,Previous_Week_Position,Peak_Position,Weeks_on_Chart,TrackID,Danceability,Energy,Key,Loudness,Mode,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Time_Signature
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,1,,1,1,5ayybTSXNwcarDtxQKqvWX,0.543,0.332,C,-11.573,Major,0.669,0.000e+00,0.134,0.795,154.999,2.566,4/4
2,1958-08-02,3,Splish Splash,Bobby Darin,Splish SplashBobby Darin,1,,3,1,40fD7ct05FvQHLdQTgJelG,0.645,0.943,C,-1.526,Major,0.385,0.000e+00,0.370,0.965,147.768,2.195,4/4
4,1958-08-02,5,When,Kalin Twins,WhenKalin Twins,1,,5,1,1BQOMZeYwzesvXANmRgrEp,0.646,0.582,F#,-12.679,Major,0.168,4.550e-06,0.976,0.963,96.490,2.410,4/4
6,1958-08-02,7,Yakety Yak,The Coasters,Yakety YakThe Coasters,1,,7,1,7demHL0GXA6YmTNqw3Btz8,0.715,0.669,G,-9.491,Major,0.705,7.320e-04,0.044,0.976,120.789,1.884,4/4
7,1958-08-02,8,My True Love,Jack Scott,My True LoveJack Scott,1,,8,1,2Un1O9CChp4wyTmt422XwW,0.548,0.253,E,-11.387,Major,0.871,9.880e-05,0.138,0.238,68.184,2.750,3/4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317789,2019-06-22,95,La La Land,Bryce Vine Featuring YG,La La LandBryce Vine Featuring YG,1,92.0,92,2,4oM9DGUTrRaXJDypxxHztZ,0.774,0.473,C#,-5.122,Minor,0.221,0.000e+00,0.222,0.416,97.953,2.954,4/4
317790,2019-06-22,96,Robbery,Juice WRLD,RobberyJuice WRLD,1,87.0,27,17,6Hj9jySrnFppAI0sEMCZpJ,0.685,0.692,D,-5.122,Major,0.328,0.000e+00,0.153,0.578,159.966,4.001,5/4
317792,2019-06-22,98,XanaX Damage,Future,XanaX DamageFuture,1,,98,1,3tZ4zv4UuenjJGSNUMfBNG,0.551,0.609,G,-8.975,Major,0.368,1.540e-04,0.192,0.324,139.084,1.734,4/4
317793,2019-06-22,99,Government Official,Future,Government OfficialFuture,1,,99,1,2QEnZJ8FwpBoyLKcFcgceT,0.801,0.774,C#,-5.125,Major,0.036,3.030e-06,0.114,0.321,144.988,2.513,4/4


In [47]:
chart_df[chart_df['Song'] == "Deacon Blues"]

Unnamed: 0,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart,Year,Month,Season,Decade
102685,1978-04-01,86,Deacon Blues,0,Deacon Blues0,1,,86,1,1978,April,Spring,70's
102773,1978-04-08,74,Deacon Blues,0,Deacon Blues0,1,86.0,74,2,1978,April,Spring,70's
102862,1978-04-15,63,Deacon Blues,0,Deacon Blues0,1,74.0,63,3,1978,April,Spring,70's
102952,1978-04-22,53,Deacon Blues,0,Deacon Blues0,1,63.0,53,4,1978,April,Spring,70's
103040,1978-04-29,41,Deacon Blues,Steely Dan,Deacon BluesSteely Dan,1,53.0,41,5,1978,April,Spring,70's
103137,1978-05-06,38,Deacon Blues,Steely Dan,Deacon BluesSteely Dan,1,41.0,38,6,1978,May,Spring,70's
103232,1978-05-13,33,Deacon Blues,Steely Dan,Deacon BluesSteely Dan,1,38.0,33,7,1978,May,Spring,70's
103327,1978-05-20,28,Deacon Blues,Steely Dan,Deacon BluesSteely Dan,1,33.0,28,8,1978,May,Spring,70's
103422,1978-05-27,23,Deacon Blues,Steely Dan,Deacon BluesSteely Dan,1,28.0,23,9,1978,May,Spring,70's
103520,1978-06-03,21,Deacon Blues,Steely Dan,Deacon BluesSteely Dan,1,23.0,21,10,1978,June,Summer,70's


In [221]:
# Some typos caused these songs to have debut weeks other than 1.  Fix
# the typos so all debut weeks are 1.

chart_total_df.replace('Eastsidebenny blanco, Halsey & Khalid', 'EastsideBenny Blanco, Halsey & Khalid', inplace=True)
chart_total_df['Performer'].replace('0', 'Steely Dan', inplace=True)
chart_total_df.replace('Deacon Blues0', 'Deacon BluesSteely Dan', inplace=True)
chart_total_df.replace('XXXTentacion', 'XXXTENTACION', inplace=True)
chart_total_df.replace('Everybody Dies In Their NightmaresXXXTentacion', 'Everybody Dies In Their NightmaresXXXTENTACION', inplace=True)
chart_total_df.replace('Jocelyn FloresXXXTentacion', 'Jocelyn FloresXXXTENTACION', inplace=True)
chart_total_df.replace('blackbear', 'Blackbear', inplace=True)
chart_total_df.replace('Do Re Miblackbear', 'Do Re MiBlackbear', inplace=True)
chart_total_df.replace('Plain JaneA$AP Ferg Featuring Nicki Minaj', 'Plain JaneA$AP Ferg', inplace=True)
chart_total_df.replace('The Way Life GoesLil Uzi Vert Featuring Nicki Minaj', 'The Way Life GoesLil Uzi Vert', inplace=True)
chart_total_df.replace('Here Comes Santa Claus (Right Down Santa Claus Lane)Gene Autry', 'Here Comes Santa Claus (Down Santa Claus Lane)Gene Autry', inplace=True)



### Find the following features:
- Debut week and position for each song
- Peak week and position for each song
- Corresponding decades and seasons for each of these points in time
- Time it takes for top 10 songs to reach top 10
- Number of weeks that a top 10 song remains in the top 10

In [222]:
chart_debut_df = chart_total_df.groupby(['SongID']).agg({'Weeks_on_Chart': 'first', 'WeekID': 'first', 'Peak_Position': 'first'}).reset_index()
del chart_debut_df['Weeks_on_Chart'] # Unnecessary since it's 1 for all rows
chart_debut_df.columns = ['SongID', 'Debut_Week', 'Debut_Position']
chart_debut_df

Unnamed: 0,SongID,Debut_Week,Debut_Position
0,"""B"" GirlsYoung And Restless",1990-05-05,84
1,"""Cherry Cherry"" from Hot August NightNeil Diamond",1973-03-17,84
2,#1 Dee JayGoody Goody,1978-11-18,88
3,#1Nelly,2001-10-20,75
4,#9 DreamJohn Lennon,1974-12-21,68
...,...,...,...
21774,teachmeMusiq Soulchild,2007-05-26,92
21775,whoa (mind in awe)XXXTENTACION,2018-12-22,37
21776,whoknowsMusiq,2004-04-24,75
21777,www.memoryAlan Jackson,2000-11-04,75


In [223]:
chart_peak_df = chart_total_df.groupby(['SongID']).apply(lambda x: x.loc[x.Peak_Position.idxmin(),['Peak_Position', 'WeekID',]])

In [224]:
chart_peak_df.columns = ['Peak_Position', 'Week_at_Peak']
chart_peak_df

Unnamed: 0_level_0,Peak_Position,Week_at_Peak
SongID,Unnamed: 1_level_1,Unnamed: 2_level_1
"""B"" GirlsYoung And Restless",54,1990-06-16
"""Cherry Cherry"" from Hot August NightNeil Diamond",31,1973-05-05
#1 Dee JayGoody Goody,82,1978-12-02
#1Nelly,22,2001-12-15
#9 DreamJohn Lennon,9,1975-02-22
...,...,...
teachmeMusiq Soulchild,42,2007-08-18
whoa (mind in awe)XXXTENTACION,37,2018-12-22
whoknowsMusiq,65,2004-05-08
www.memoryAlan Jackson,45,2000-12-23


In [225]:
chart_streak_df = chart_total_df.groupby(['SongID']).agg({'Weeks_on_Chart': 'max'}).reset_index()
chart_streak_df

Unnamed: 0,SongID,Weeks_on_Chart
0,"""B"" GirlsYoung And Restless",15
1,"""Cherry Cherry"" from Hot August NightNeil Diamond",10
2,#1 Dee JayGoody Goody,5
3,#1Nelly,20
4,#9 DreamJohn Lennon,12
...,...,...
21774,teachmeMusiq Soulchild,20
21775,whoa (mind in awe)XXXTENTACION,3
21776,whoknowsMusiq,13
21777,www.memoryAlan Jackson,15


In [226]:
chart_new_cols_df = chart_debut_df.merge(chart_peak_df, on='SongID')
chart_new_cols_df = chart_new_cols_df.merge(chart_streak_df, on='SongID')

In [227]:
chart_new_cols_df

Unnamed: 0,SongID,Debut_Week,Debut_Position,Peak_Position,Week_at_Peak,Weeks_on_Chart
0,"""B"" GirlsYoung And Restless",1990-05-05,84,54,1990-06-16,15
1,"""Cherry Cherry"" from Hot August NightNeil Diamond",1973-03-17,84,31,1973-05-05,10
2,#1 Dee JayGoody Goody,1978-11-18,88,82,1978-12-02,5
3,#1Nelly,2001-10-20,75,22,2001-12-15,20
4,#9 DreamJohn Lennon,1974-12-21,68,9,1975-02-22,12
...,...,...,...,...,...,...
21774,teachmeMusiq Soulchild,2007-05-26,92,42,2007-08-18,20
21775,whoa (mind in awe)XXXTENTACION,2018-12-22,37,37,2018-12-22,3
21776,whoknowsMusiq,2004-04-24,75,65,2004-05-08,13
21777,www.memoryAlan Jackson,2000-11-04,75,45,2000-12-23,15


In [228]:
debut_dates = chart_new_cols_df['Debut_Week']
peak_dates = chart_new_cols_df['Week_at_Peak']
chart_new_cols_df['Debut_Year'] = debut_dates.dt.year
chart_new_cols_df['Debut_Month'] = debut_dates.dt.strftime("%B")
chart_new_cols_df['Peak_Year'] = peak_dates.dt.year
chart_new_cols_df['Peak_Month'] = peak_dates.dt.strftime("%B")

In [230]:
# Create season column:

def season_helper(month):
    if month in ['December', 'January', 'February']:
        return 'Winter'
    if month in ['March', 'April', 'May']:
        return 'Spring'
    if month in ['June', 'July', 'August']:
        return 'Summer'
    else:
        return 'Fall'

In [231]:
chart_new_cols_df['Debut_Season'] = chart_new_cols_df['Debut_Month'].apply(season_helper)
chart_new_cols_df['Peak_Season'] = chart_new_cols_df['Peak_Month'].apply(season_helper)


In [232]:
# Create decade column:

def decade_helper(year):
    if year in [1958, 1959]:
        return "50's"
    if year in [1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969]:
        return "60's"
    if year in [1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979]:
        return "70's"
    if year in [1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989]:
        return "80's"
    if year in [1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999]:
        return "90's"
    if year in [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009]:
        return "00's"
    else:
        return "10's"

In [234]:
chart_new_cols_df['Debut_Decade'] = chart_new_cols_df['Debut_Year'].apply(decade_helper)
chart_new_cols_df['Peak_Decade'] = chart_new_cols_df['Peak_Year'].apply(decade_helper)


In [235]:
chart_new_cols_df

Unnamed: 0,SongID,Debut_Week,Debut_Position,Peak_Position,Week_at_Peak,Weeks_on_Chart,Debut_Year,Debut_Month,Peak_Year,Peak_Month,Debut_Season,Peak_Season,Debut_Decade,Peak_Decade
0,"""B"" GirlsYoung And Restless",1990-05-05,84,54,1990-06-16,15,1990,May,1990,June,Spring,Summer,90's,90's
1,"""Cherry Cherry"" from Hot August NightNeil Diamond",1973-03-17,84,31,1973-05-05,10,1973,March,1973,May,Spring,Spring,70's,70's
2,#1 Dee JayGoody Goody,1978-11-18,88,82,1978-12-02,5,1978,November,1978,December,Fall,Winter,70's,70's
3,#1Nelly,2001-10-20,75,22,2001-12-15,20,2001,October,2001,December,Fall,Winter,00's,00's
4,#9 DreamJohn Lennon,1974-12-21,68,9,1975-02-22,12,1974,December,1975,February,Winter,Winter,70's,70's
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21774,teachmeMusiq Soulchild,2007-05-26,92,42,2007-08-18,20,2007,May,2007,August,Spring,Summer,00's,00's
21775,whoa (mind in awe)XXXTENTACION,2018-12-22,37,37,2018-12-22,3,2018,December,2018,December,Winter,Winter,10's,10's
21776,whoknowsMusiq,2004-04-24,75,65,2004-05-08,13,2004,April,2004,May,Spring,Spring,00's,00's
21777,www.memoryAlan Jackson,2000-11-04,75,45,2000-12-23,15,2000,November,2000,December,Fall,Winter,00's,00's


In [248]:
chart_top10_df = chart_new_cols_df.loc[chart_new_cols_df.loc[:, 'Peak_Position'] < 11]
chart_top10_df['Time_to_Top10'] = chart_top10_df['Week_at_Peak'] - chart_top10_df['Debut_Week']
chart_top10_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,SongID,Debut_Week,Debut_Position,Peak_Position,Week_at_Peak,Weeks_on_Chart,Debut_Year,Debut_Month,Peak_Year,Peak_Month,Debut_Season,Peak_Season,Debut_Decade,Peak_Decade,Time_to_Top10
4,#9 DreamJohn Lennon,1974-12-21,68,9,1975-02-22,12,1974,December,1975,February,Winter,Winter,70's,70's,63 days
9,'65 Love AffairPaul Davis,1982-02-27,82,6,1982-05-22,20,1982,February,1982,May,Winter,Spring,80's,80's,84 days
15,('til) I Kissed YouThe Everly Brothers,1959-08-15,56,4,1959-09-19,16,1959,August,1959,September,Summer,Fall,50's,50's,35 days
30,(Everything I Do) I Do It For YouBryan Adams,1991-06-29,53,1,1991-07-27,22,1991,June,1991,July,Summer,Summer,90's,90's,28 days
37,(Hey Won't You Play) Another Somebody Done Som...,1975-02-01,99,1,1975-04-26,18,1975,February,1975,April,Winter,Spring,70's,70's,84 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21740,Your Wildest DreamsThe Moody Blues,1986-04-19,95,9,1986-07-12,21,1986,April,1986,July,Spring,Summer,80's,80's,84 days
21753,Yummy Yummy YummyOhio Express,1968-05-04,88,4,1968-06-15,14,1968,May,1968,June,Spring,Summer,60's,60's,42 days
21754,ZEZEKodak Black Featuring Travis Scott & Offset,2018-10-27,2,2,2018-10-27,25,2018,October,2018,October,Fall,Fall,10's,10's,0 days
21761,Zip-A-Dee Doo-DahBob B. Soxx And The Blue Jeans,1962-11-17,75,8,1963-01-12,13,1962,November,1963,January,Fall,Winter,60's,60's,56 days


In [249]:
chart_top10_streak_df = chart_total_df.loc[chart_total_df.loc[:, 'Peak_Position'] < 11]
chart_top10_count_df = chart_top10_streak_df.groupby(['SongID']).agg({'SongID': 'count'})
chart_top10_count_df.columns = ['Weeks_in_Top10']
chart_top10_count_df

Unnamed: 0_level_0,Weeks_in_Top10
SongID,Unnamed: 1_level_1
#9 DreamJohn Lennon,4
'65 Love AffairPaul Davis,12
('til) I Kissed YouThe Everly Brothers,13
(Everything I Do) I Do It For YouBryan Adams,19
(Hey Won't You Play) Another Somebody Done Somebody Wrong SongB.J. Thomas,9
...,...
Your Wildest DreamsThe Moody Blues,9
Yummy Yummy YummyOhio Express,10
ZEZEKodak Black Featuring Travis Scott & Offset,25
Zip-A-Dee Doo-DahBob B. Soxx And The Blue Jeans,7


In [250]:
chart_top10_df = chart_top10_df.merge(chart_top10_count_df, how='left', on='SongID')


In [251]:
chart_top10_df_merge = chart_top10_df.loc[:, ['SongID', 'Time_to_Top10', 'Weeks_in_Top10']]


In [252]:
chart_top10_df_merge

Unnamed: 0,SongID,Time_to_Top10,Weeks_in_Top10
0,#9 DreamJohn Lennon,63 days,4
1,'65 Love AffairPaul Davis,84 days,12
2,('til) I Kissed YouThe Everly Brothers,35 days,13
3,(Everything I Do) I Do It For YouBryan Adams,28 days,19
4,(Hey Won't You Play) Another Somebody Done Som...,84 days,9
...,...,...,...
4115,Your Wildest DreamsThe Moody Blues,84 days,9
4116,Yummy Yummy YummyOhio Express,42 days,10
4117,ZEZEKodak Black Featuring Travis Scott & Offset,0 days,25
4118,Zip-A-Dee Doo-DahBob B. Soxx And The Blue Jeans,56 days,7


In [253]:
chart_new_cols_df = chart_new_cols_df.merge(chart_top10_df_merge, how='left', on='SongID')



In [256]:
chart_new_cols_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21779 entries, 0 to 21778
Data columns (total 16 columns):
SongID            21779 non-null object
Debut_Week        21779 non-null datetime64[ns]
Debut_Position    21779 non-null int64
Peak_Position     21779 non-null int64
Week_at_Peak      21779 non-null datetime64[ns]
Weeks_on_Chart    21779 non-null int64
Debut_Year        21779 non-null int64
Debut_Month       21779 non-null object
Peak_Year         21779 non-null int64
Peak_Month        21779 non-null object
Debut_Season      21779 non-null object
Peak_Season       21779 non-null object
Debut_Decade      21779 non-null object
Peak_Decade       21779 non-null object
Time_to_Top10     4120 non-null timedelta64[ns]
Weeks_in_Top10    4120 non-null float64
dtypes: datetime64[ns](2), float64(1), int64(5), object(7), timedelta64[ns](1)
memory usage: 2.8+ MB


In [261]:
# Delete these columns for more seamless merge with new columns

del chart_total_df['Peak_Position']
del chart_total_df['Weeks_on_Chart']

In [262]:
chart_total_features_df = chart_total_df.merge(chart_new_cols_df, how='left', on='SongID')

In [264]:
chart_total_features_df

Unnamed: 0,WeekID,Week_Position,Song,Performer,SongID,Instance,Previous_Week_Position,TrackID,Danceability,Energy,Key,Loudness,Mode,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Time_Signature,Debut_Week,Debut_Position,Peak_Position,Week_at_Peak,Weeks_on_Chart,Debut_Year,Debut_Month,Peak_Year,Peak_Month,Debut_Season,Peak_Season,Debut_Decade,Peak_Decade,Time_to_Top10,Weeks_in_Top10
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,1,,5ayybTSXNwcarDtxQKqvWX,0.543,0.332,C,-11.573,Major,0.669,0.000e+00,0.134,0.795,154.999,2.566,4/4,1958-08-02,1,1,1958-08-02,11,1958,August,1958,August,Summer,Summer,50's,50's,0 days,11.0
1,1958-08-02,3,Splish Splash,Bobby Darin,Splish SplashBobby Darin,1,,40fD7ct05FvQHLdQTgJelG,0.645,0.943,C,-1.526,Major,0.385,0.000e+00,0.370,0.965,147.768,2.195,4/4,1958-08-02,3,3,1958-08-02,8,1958,August,1958,August,Summer,Summer,50's,50's,0 days,8.0
2,1958-08-02,5,When,Kalin Twins,WhenKalin Twins,1,,1BQOMZeYwzesvXANmRgrEp,0.646,0.582,F#,-12.679,Major,0.168,4.550e-06,0.976,0.963,96.490,2.410,4/4,1958-08-02,5,5,1958-08-02,9,1958,August,1958,August,Summer,Summer,50's,50's,0 days,9.0
3,1958-08-02,7,Yakety Yak,The Coasters,Yakety YakThe Coasters,1,,7demHL0GXA6YmTNqw3Btz8,0.715,0.669,G,-9.491,Major,0.705,7.320e-04,0.044,0.976,120.789,1.884,4/4,1958-08-02,7,7,1958-08-02,7,1958,August,1958,August,Summer,Summer,50's,50's,0 days,7.0
4,1958-08-02,8,My True Love,Jack Scott,My True LoveJack Scott,1,,2Un1O9CChp4wyTmt422XwW,0.548,0.253,E,-11.387,Major,0.871,9.880e-05,0.138,0.238,68.184,2.750,3/4,1958-08-02,8,3,1958-08-16,14,1958,August,1958,August,Summer,Summer,50's,50's,14 days,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258792,2019-06-22,95,La La Land,Bryce Vine Featuring YG,La La LandBryce Vine Featuring YG,1,92.0,4oM9DGUTrRaXJDypxxHztZ,0.774,0.473,C#,-5.122,Minor,0.221,0.000e+00,0.222,0.416,97.953,2.954,4/4,2019-06-15,92,92,2019-06-15,2,2019,June,2019,June,Summer,Summer,10's,10's,NaT,
258793,2019-06-22,96,Robbery,Juice WRLD,RobberyJuice WRLD,1,87.0,6Hj9jySrnFppAI0sEMCZpJ,0.685,0.692,D,-5.122,Major,0.328,0.000e+00,0.153,0.578,159.966,4.001,5/4,2019-03-02,27,27,2019-03-02,17,2019,March,2019,March,Spring,Spring,10's,10's,NaT,
258794,2019-06-22,98,XanaX Damage,Future,XanaX DamageFuture,1,,3tZ4zv4UuenjJGSNUMfBNG,0.551,0.609,G,-8.975,Major,0.368,1.540e-04,0.192,0.324,139.084,1.734,4/4,2019-06-22,98,98,2019-06-22,1,2019,June,2019,June,Summer,Summer,10's,10's,NaT,
258795,2019-06-22,99,Government Official,Future,Government OfficialFuture,1,,2QEnZJ8FwpBoyLKcFcgceT,0.801,0.774,C#,-5.125,Major,0.036,3.030e-06,0.114,0.321,144.988,2.513,4/4,2019-06-22,99,99,2019-06-22,1,2019,June,2019,June,Summer,Summer,10's,10's,NaT,


In [265]:
chart_total_features_df['Weeks_in_Top10'].fillna(0, inplace=True)

In [280]:
chart_total_features_df['Top_10_Song'] = chart_total_features_df.loc[:, 'Peak_Position'] < 11


In [281]:
chart_total_features_df['Top_10_Song'].replace(True, 'Yes', inplace=True)
chart_total_features_df['Top_10_Song'].replace(False, 'No', inplace=True)

In [282]:
chart_total_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 258797 entries, 0 to 258796
Data columns (total 36 columns):
WeekID                    258797 non-null datetime64[ns]
Week_Position             258797 non-null int64
Song                      258797 non-null object
Performer                 258797 non-null object
SongID                    258797 non-null object
Instance                  258797 non-null int64
Previous_Week_Position    235155 non-null float64
TrackID                   258797 non-null object
Danceability              258797 non-null float64
Energy                    258797 non-null float64
Key                       258797 non-null object
Loudness                  258797 non-null float64
Mode                      258797 non-null object
Acousticness              258797 non-null float64
Instrumentalness          258797 non-null float64
Liveness                  258797 non-null float64
Valence                   258797 non-null float64
Tempo                     258797 non-null 

In [283]:
# with open('chart_total_features_df.pkl', 'wb') as to_write:
#     pickle.dump(chart_total_features_df, to_write)

In [284]:
with open('chart_total_features_df.pkl','rb') as read_file:
    chart_total_features_df = pickle.load(read_file)

In [285]:
songs_total_features_df = chart_total_features_df.drop_duplicates(subset=['SongID'])
songs_total_features_df

Unnamed: 0,WeekID,Week_Position,Song,Performer,SongID,Instance,Previous_Week_Position,TrackID,Danceability,Energy,Key,Loudness,Mode,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Time_Signature,Debut_Week,Debut_Position,Peak_Position,Week_at_Peak,Weeks_on_Chart,Debut_Year,Debut_Month,Peak_Year,Peak_Month,Debut_Season,Peak_Season,Debut_Decade,Peak_Decade,Time_to_Top10,Weeks_in_Top10,Top_10_Song
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,1,,5ayybTSXNwcarDtxQKqvWX,0.543,0.332,C,-11.573,Major,0.669,0.000e+00,0.134,0.795,154.999,2.566,4/4,1958-08-02,1,1,1958-08-02,11,1958,August,1958,August,Summer,Summer,50's,50's,0 days,11.0,Yes
1,1958-08-02,3,Splish Splash,Bobby Darin,Splish SplashBobby Darin,1,,40fD7ct05FvQHLdQTgJelG,0.645,0.943,C,-1.526,Major,0.385,0.000e+00,0.370,0.965,147.768,2.195,4/4,1958-08-02,3,3,1958-08-02,8,1958,August,1958,August,Summer,Summer,50's,50's,0 days,8.0,Yes
2,1958-08-02,5,When,Kalin Twins,WhenKalin Twins,1,,1BQOMZeYwzesvXANmRgrEp,0.646,0.582,F#,-12.679,Major,0.168,4.550e-06,0.976,0.963,96.490,2.410,4/4,1958-08-02,5,5,1958-08-02,9,1958,August,1958,August,Summer,Summer,50's,50's,0 days,9.0,Yes
3,1958-08-02,7,Yakety Yak,The Coasters,Yakety YakThe Coasters,1,,7demHL0GXA6YmTNqw3Btz8,0.715,0.669,G,-9.491,Major,0.705,7.320e-04,0.044,0.976,120.789,1.884,4/4,1958-08-02,7,7,1958-08-02,7,1958,August,1958,August,Summer,Summer,50's,50's,0 days,7.0,Yes
4,1958-08-02,8,My True Love,Jack Scott,My True LoveJack Scott,1,,2Un1O9CChp4wyTmt422XwW,0.548,0.253,E,-11.387,Major,0.871,9.880e-05,0.138,0.238,68.184,2.750,3/4,1958-08-02,8,3,1958-08-16,14,1958,August,1958,August,Summer,Summer,50's,50's,14 days,14.0,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258789,2019-06-22,92,Amor Genuino,Ozuna,Amor GenuinoOzuna,1,,6pK5edDoyrn4mWwRSVc2a5,0.442,0.445,Bb,-4.071,Minor,0.653,0.000e+00,0.118,0.423,85.744,2.999,4/4,2019-06-22,92,92,2019-06-22,1,2019,June,2019,June,Summer,Summer,10's,10's,NaT,0.0,No
258790,2019-06-22,93,Only Human,Jonas Brothers,Only HumanJonas Brothers,1,,0DiDStADDVh3SvAsoJAFMk,0.795,0.496,C,-5.883,Major,0.108,0.000e+00,0.065,0.874,94.010,3.050,4/4,2019-06-22,93,93,2019-06-22,1,2019,June,2019,June,Summer,Summer,10's,10's,NaT,0.0,No
258794,2019-06-22,98,XanaX Damage,Future,XanaX DamageFuture,1,,3tZ4zv4UuenjJGSNUMfBNG,0.551,0.609,G,-8.975,Major,0.368,1.540e-04,0.192,0.324,139.084,1.734,4/4,2019-06-22,98,98,2019-06-22,1,2019,June,2019,June,Summer,Summer,10's,10's,NaT,0.0,No
258795,2019-06-22,99,Government Official,Future,Government OfficialFuture,1,,2QEnZJ8FwpBoyLKcFcgceT,0.801,0.774,C#,-5.125,Major,0.036,3.030e-06,0.114,0.321,144.988,2.513,4/4,2019-06-22,99,99,2019-06-22,1,2019,June,2019,June,Summer,Summer,10's,10's,NaT,0.0,No


In [286]:
import billboard

date_list = ['2019-06-23']*100
rank_list = list(range(1,101))
peak_list = []
artist_list = []
song_list = []
songID_list = []

chart = billboard.ChartData('hot-100', date = '2019-06-23')

for rank in range(0,100):
    artist_list.append(chart.entries[rank].artist)
    song_list.append(chart.entries[rank].title)
    peak_list.append(chart.entries[rank].peakPos)
    songID_list.append(chart.entries[rank].title +    \
                       chart.entries[rank].artist)


song_dict = {'Date': date_list, 'Rank': rank_list,    \
             'Peak_Pos': peak_list, 'Artist': artist_list,    \
             'Song': song_list, 'SongID': songID_list}

song0623_df = pd.DataFrame(song_dict)

In [287]:
song0623_df

Unnamed: 0,Date,Rank,Peak_Pos,Artist,Song,SongID
0,2019-06-23,1,1,Lil Nas X Featuring Billy Ray Cyrus,Old Town Road,Old Town RoadLil Nas X Featuring Billy Ray Cyrus
1,2019-06-23,2,2,Taylor Swift,You Need To Calm Down,You Need To Calm DownTaylor Swift
2,2019-06-23,3,2,Billie Eilish,Bad Guy,Bad GuyBillie Eilish
3,2019-06-23,4,3,Khalid,Talk,TalkKhalid
4,2019-06-23,5,2,Ed Sheeran & Justin Bieber,I Don't Care,I Don't CareEd Sheeran & Justin Bieber
5,2019-06-23,6,1,Jonas Brothers,Sucker,SuckerJonas Brothers
6,2019-06-23,7,7,Drake Featuring Rick Ross,Money In The Grave,Money In The GraveDrake Featuring Rick Ross
7,2019-06-23,8,2,Post Malone,Wow.,Wow.Post Malone
8,2019-06-23,9,8,DaBaby,Suge,SugeDaBaby
9,2019-06-23,10,9,Chris Brown Featuring Drake,No Guidance,No GuidanceChris Brown Featuring Drake


In [288]:
chart0622_total_df = chart_total_df[-100:]

In [289]:
chart0622_set = set(chart0622_total_df['Song'])
song0623_set = set(song0623_df['Song'])

In [290]:
chart0622_set2 = set(chart0622_total_df['SongID'])
song0623_set2 = set(song0623_df['SongID'])

In [291]:
songs_to_delete = chart0622_set.intersection(song0623_set)
songs_to_delete2 = chart0622_set2.intersection(song0623_set2)

In [292]:
songs_features_clean_df = songs_total_features_df[~songs_total_features_df.SongID.isin(songs_to_delete2)]
songs_features_clean_df.drop(index=258796, inplace=True)
songs_features_clean_df.drop(index=254442, inplace=True)

In [293]:
songs_features_clean_df

Unnamed: 0,WeekID,Week_Position,Song,Performer,SongID,Instance,Previous_Week_Position,TrackID,Danceability,Energy,Key,Loudness,Mode,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Time_Signature,Debut_Week,Debut_Position,Peak_Position,Week_at_Peak,Weeks_on_Chart,Debut_Year,Debut_Month,Peak_Year,Peak_Month,Debut_Season,Peak_Season,Debut_Decade,Peak_Decade,Time_to_Top10,Weeks_in_Top10,Top_10_Song
0,1958-08-02,1,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,1,,5ayybTSXNwcarDtxQKqvWX,0.543,0.332,C,-11.573,Major,0.669,0.000e+00,0.134,0.795,154.999,2.566,4/4,1958-08-02,1,1,1958-08-02,11,1958,August,1958,August,Summer,Summer,50's,50's,0 days,11.0,Yes
1,1958-08-02,3,Splish Splash,Bobby Darin,Splish SplashBobby Darin,1,,40fD7ct05FvQHLdQTgJelG,0.645,0.943,C,-1.526,Major,0.385,0.000e+00,0.370,0.965,147.768,2.195,4/4,1958-08-02,3,3,1958-08-02,8,1958,August,1958,August,Summer,Summer,50's,50's,0 days,8.0,Yes
2,1958-08-02,5,When,Kalin Twins,WhenKalin Twins,1,,1BQOMZeYwzesvXANmRgrEp,0.646,0.582,F#,-12.679,Major,0.168,4.550e-06,0.976,0.963,96.490,2.410,4/4,1958-08-02,5,5,1958-08-02,9,1958,August,1958,August,Summer,Summer,50's,50's,0 days,9.0,Yes
3,1958-08-02,7,Yakety Yak,The Coasters,Yakety YakThe Coasters,1,,7demHL0GXA6YmTNqw3Btz8,0.715,0.669,G,-9.491,Major,0.705,7.320e-04,0.044,0.976,120.789,1.884,4/4,1958-08-02,7,7,1958-08-02,7,1958,August,1958,August,Summer,Summer,50's,50's,0 days,7.0,Yes
4,1958-08-02,8,My True Love,Jack Scott,My True LoveJack Scott,1,,2Un1O9CChp4wyTmt422XwW,0.548,0.253,E,-11.387,Major,0.871,9.880e-05,0.138,0.238,68.184,2.750,3/4,1958-08-02,8,3,1958-08-16,14,1958,August,1958,August,Summer,Summer,50's,50's,14 days,14.0,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258781,2019-06-22,83,Heaven,Avicii,HeavenAvicii,1,,0vrmHPfoBadXVr2n0m1aqZ,0.592,0.873,A,-5.650,Major,0.071,9.820e-03,0.142,0.516,122.011,4.621,4/4,2019-06-22,83,83,2019-06-22,1,2019,June,2019,June,Summer,Summer,10's,10's,NaT,0.0,No
258785,2019-06-22,87,Please Tell Me,Future,Please Tell MeFuture,1,,4ANFPo7dzs2DdPuONXC9bp,0.881,0.549,Bb,-5.589,Major,0.009,0.000e+00,0.137,0.202,144.019,3.409,4/4,2019-06-22,87,87,2019-06-22,1,2019,June,2019,June,Summer,Summer,10's,10's,NaT,0.0,No
258789,2019-06-22,92,Amor Genuino,Ozuna,Amor GenuinoOzuna,1,,6pK5edDoyrn4mWwRSVc2a5,0.442,0.445,Bb,-4.071,Minor,0.653,0.000e+00,0.118,0.423,85.744,2.999,4/4,2019-06-22,92,92,2019-06-22,1,2019,June,2019,June,Summer,Summer,10's,10's,NaT,0.0,No
258794,2019-06-22,98,XanaX Damage,Future,XanaX DamageFuture,1,,3tZ4zv4UuenjJGSNUMfBNG,0.551,0.609,G,-8.975,Major,0.368,1.540e-04,0.192,0.324,139.084,1.734,4/4,2019-06-22,98,98,2019-06-22,1,2019,June,2019,June,Summer,Summer,10's,10's,NaT,0.0,No


In [294]:
songs_features_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21696 entries, 0 to 258795
Data columns (total 36 columns):
WeekID                    21696 non-null datetime64[ns]
Week_Position             21696 non-null int64
Song                      21696 non-null object
Performer                 21696 non-null object
SongID                    21696 non-null object
Instance                  21696 non-null int64
Previous_Week_Position    3 non-null float64
TrackID                   21696 non-null object
Danceability              21696 non-null float64
Energy                    21696 non-null float64
Key                       21696 non-null object
Loudness                  21696 non-null float64
Mode                      21696 non-null object
Acousticness              21696 non-null float64
Instrumentalness          21696 non-null float64
Liveness                  21696 non-null float64
Valence                   21696 non-null float64
Tempo                     21696 non-null float64
Duration_min   

In [297]:
songs_features_clean_df.columns

Index(['WeekID', 'Week_Position', 'Song', 'Performer', 'SongID', 'Instance',
       'Previous_Week_Position', 'TrackID', 'Danceability', 'Energy', 'Key',
       'Loudness', 'Mode', 'Acousticness', 'Instrumentalness', 'Liveness',
       'Valence', 'Tempo', 'Duration_min', 'Time_Signature', 'Debut_Week',
       'Debut_Position', 'Peak_Position', 'Week_at_Peak', 'Weeks_on_Chart',
       'Debut_Year', 'Debut_Month', 'Peak_Year', 'Peak_Month', 'Debut_Season',
       'Peak_Season', 'Debut_Decade', 'Peak_Decade', 'Time_to_Top10',
       'Weeks_in_Top10', 'Top_10_Song'],
      dtype='object')

In [298]:
songs_features_final_df = songs_features_clean_df[['Top_10_Song', 'Song', 'Performer', 'SongID', 'Debut_Position', 'Debut_Week', 'Debut_Month', 'Debut_Season', 'Debut_Year', 'Debut_Decade',    \
                                                   'Peak_Position', 'Week_at_Peak', 'Peak_Month', 'Peak_Season', 'Peak_Year', 'Peak_Decade', 'Weeks_on_Chart', 'Weeks_in_Top10', 'Time_to_Top10',    \
                                                   'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Acousticness', 'Instrumentalness', 'Liveness',    \
                                                   'Valence', 'Tempo', 'Duration_min', 'Time_Signature']]



In [301]:
songs_features_final_df

Unnamed: 0,Top_10_Song,Song,Performer,SongID,Debut_Position,Debut_Week,Debut_Month,Debut_Season,Debut_Year,Debut_Decade,Peak_Position,Week_at_Peak,Peak_Month,Peak_Season,Peak_Year,Peak_Decade,Weeks_on_Chart,Weeks_in_Top10,Time_to_Top10,Danceability,Energy,Key,Loudness,Mode,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Time_Signature
0,Yes,Poor Little Fool,Ricky Nelson,Poor Little FoolRicky Nelson,1,1958-08-02,August,Summer,1958,50's,1,1958-08-02,August,Summer,1958,50's,11,11.0,0 days,0.543,0.332,C,-11.573,Major,0.669,0.000e+00,0.134,0.795,154.999,2.566,4/4
1,Yes,Splish Splash,Bobby Darin,Splish SplashBobby Darin,3,1958-08-02,August,Summer,1958,50's,3,1958-08-02,August,Summer,1958,50's,8,8.0,0 days,0.645,0.943,C,-1.526,Major,0.385,0.000e+00,0.370,0.965,147.768,2.195,4/4
2,Yes,When,Kalin Twins,WhenKalin Twins,5,1958-08-02,August,Summer,1958,50's,5,1958-08-02,August,Summer,1958,50's,9,9.0,0 days,0.646,0.582,F#,-12.679,Major,0.168,4.550e-06,0.976,0.963,96.490,2.410,4/4
3,Yes,Yakety Yak,The Coasters,Yakety YakThe Coasters,7,1958-08-02,August,Summer,1958,50's,7,1958-08-02,August,Summer,1958,50's,7,7.0,0 days,0.715,0.669,G,-9.491,Major,0.705,7.320e-04,0.044,0.976,120.789,1.884,4/4
4,Yes,My True Love,Jack Scott,My True LoveJack Scott,8,1958-08-02,August,Summer,1958,50's,3,1958-08-16,August,Summer,1958,50's,14,14.0,14 days,0.548,0.253,E,-11.387,Major,0.871,9.880e-05,0.138,0.238,68.184,2.750,3/4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258781,No,Heaven,Avicii,HeavenAvicii,83,2019-06-22,June,Summer,2019,10's,83,2019-06-22,June,Summer,2019,10's,1,0.0,NaT,0.592,0.873,A,-5.650,Major,0.071,9.820e-03,0.142,0.516,122.011,4.621,4/4
258785,No,Please Tell Me,Future,Please Tell MeFuture,87,2019-06-22,June,Summer,2019,10's,87,2019-06-22,June,Summer,2019,10's,1,0.0,NaT,0.881,0.549,Bb,-5.589,Major,0.009,0.000e+00,0.137,0.202,144.019,3.409,4/4
258789,No,Amor Genuino,Ozuna,Amor GenuinoOzuna,92,2019-06-22,June,Summer,2019,10's,92,2019-06-22,June,Summer,2019,10's,1,0.0,NaT,0.442,0.445,Bb,-4.071,Minor,0.653,0.000e+00,0.118,0.423,85.744,2.999,4/4
258794,No,XanaX Damage,Future,XanaX DamageFuture,98,2019-06-22,June,Summer,2019,10's,98,2019-06-22,June,Summer,2019,10's,1,0.0,NaT,0.551,0.609,G,-8.975,Major,0.368,1.540e-04,0.192,0.324,139.084,1.734,4/4


In [299]:
# with open('songs_features_final_df.pkl', 'wb') as to_write:
#     pickle.dump(songs_features_final_df, to_write)

In [300]:
# with open('songs_features_clean_df.pkl', 'wb') as to_write:
#     pickle.dump(songs_features_clean_df, to_write)