## Getting the Spotify Data Set

In [257]:
import pandas as pd

In [258]:
# Load the Spotify track data set (collected with spotifyr).
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
spotify_dat = pd.read_pickle('final_artist.pkl')

In [259]:
# Keep only tracks whose album was released in 2009 or later, so the Spotify
# data covers the same years as the Billboard data, then renumber rows from 0.
spotify_dat = (
    spotify_dat[spotify_dat["album_release_year"] >= 2009]
    .reset_index(drop=True)
)

## Combining Spotify with Billboard to get hit songs

In [260]:
# Load the Billboard data set (acquired by Cyrus).
bboard = pd.read_pickle("R&B10yrDF.pkl")

In [261]:
# Flag every Spotify track whose title appears among the Billboard titles.
# NOTE(review): the match is on the exact title string only (artist is not
# compared), so same-named songs by other artists are flagged too -- confirm
# this is acceptable.
spotify_dat = spotify_dat.assign(
    is_hit=spotify_dat["track_name"].isin(bboard["Title"])
)

# isin usage adapted from this Stack Overflow post: https://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas/17071908

In [262]:
# Recode the is_hit flag as integers: True -> 1, False -> 0.
# A single astype gives a well-defined int64 column; writing ints into the
# boolean column through masks depends on version-specific dtype upcasting
# and makes the second mask run on a half-converted column.
# The cell stays safe to re-run: astype on an already-int64 column is a no-op.
spotify_dat["is_hit"] = spotify_dat["is_hit"].astype("int64")

In [263]:
# Class balance of the target: number of tracks per is_hit value.
spotify_dat["is_hit"].value_counts()

0    44061
1     5595
Name: is_hit, dtype: int64

**Reasons for removing songs released before 2009**
- The proportion of hits to non-hits is higher in the remaining data.
- Removing older songs makes the data set more representative of the new songs of today's generation.

In [264]:
# Save the Spotify data, now including the is_hit column, to disk.
spotify_dat.to_pickle("merged_data.pkl")

## Cleaning the data set and using only necessary variables

In [265]:
# Work on a copy so that cleaning does not modify spotify_dat.
cleaned_df = spotify_dat.copy()

In [266]:
# Drop the columns that are not needed for the cleaned data set.
# The result is derived from spotify_dat (drop returns a new frame) rather than
# from cleaned_df itself, so re-running this cell does not raise a KeyError for
# columns that were already removed.
cleaned_df = spotify_dat.drop(
    columns=[
        "artist_name",
        "artist_id",
        "album_type",
        "album_release_date",
        "album_release_year",
        "album_release_date_precision",
        "track_id",
        "analysis_url",
        "disc_number",
        "available_markets",
        "external_urls.spotify",
        "track_uri",
        "duration_ms",
        "time_signature",
        "track_number",
        "track_preview_url",
        "is_local",
        "album_id",
        "track_href",
        "album_name",
    ]
)

In [267]:
# Inspect the remaining columns and their dtypes.
cleaned_df.dtypes

danceability        float64
energy              float64
key                   int32
loudness            float64
mode                  int32
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
explicit               bool
track_name           object
type                 object
key_name             object
mode_name            object
key_mode             object
is_hit                int64
dtype: object

In [268]:
# Recode the explicit flag as integers: True -> 1, False -> 0.
# The conversion uses cleaned_df's own column instead of masks built from
# spotify_dat (which only worked through index alignment), and astype yields a
# well-defined int64 dtype rather than relying on upcasting of a bool column.
cleaned_df["explicit"] = cleaned_df["explicit"].astype("int64")


In [269]:
# Move track_name to the first column and renumber the rows from 0.
cleaned_df = cleaned_df[
    ["track_name"] + [col for col in cleaned_df.columns if col != "track_name"]
].reset_index(drop=True)

In [270]:
# Save the cleaned data set to disk.
cleaned_df.to_pickle("Cleaned_data.pkl")