# Imports

In [2]:
!pip install spotipy



In [3]:
import pandas as pd
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

## Set ENV variables

In [4]:
%env CLIENT_ID 2753defd10e2496f80b0d2348c2b5481
%env CLIENT_SECRET 34a98fb5da1b4c7698f93c077db6b127

env: CLIENT_ID=2753defd10e2496f80b0d2348c2b5481
env: CLIENT_SECRET=34a98fb5da1b4c7698f93c077db6b127


In [5]:
client_id = %env CLIENT_ID
client_secret = %env CLIENT_SECRET

## Setting Credentials

In [6]:
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id, client_secret))

## Getting the dataset with the newest songs

In [7]:
data = pd.read_csv('/home/mdbruchard/code/dmassonfr/Spotify_hit_predictor/raw_data/new_songs.csv')

In [8]:
data['release_date'] = pd.to_datetime(data['release_date'])

# Get features

In [9]:
# Function that take playlist id and turn into dataframe

def playlist_to_df(pl_id: str):
    
    
    response = sp.playlist(pl_id)
    if not response:
        print("error with response")
        return None
    
    # Saving the basic features
    name = [response['tracks']['items'][i]['track']['name'] for i in range(len(response['tracks']['items']))]
    artist = [response['tracks']['items'][i]['track']['artists'][0]['name'] for i in range(len(response['tracks']['items']))]
    artist_id = [response['tracks']['items'][i]['track']['album']['artists'][0]['id'] for i in range(len(response['tracks']['items']))]
    popularity = [response['tracks']['items'][i]['track']['popularity'] for i in range(len(response['tracks']['items']))]
    Id = [response['tracks']['items'][i]['track']['id'] for i in range(len(response['tracks']['items']))]
    release_date = [response['tracks']['items'][i]['track']['album']['release_date'] for i in range(len(response['tracks']['items']))]
    preview_url = [response['tracks']['items'][i]['track']['preview_url'] for i in range(len(response['tracks']['items']))]
    cover_album = [response['tracks']['items'][i]['track']['album']['images'][1]['url'] for i in range(len(response['tracks']['items']))]
    explicit = [response['tracks']['items'][i]['track']['explicit'] for i in range(len(response['tracks']['items']))]
    
    
    data = {}

    # Setting the features
    data['id'] = Id
    data['name'] = name
    data['popularity'] = popularity
    data['artists'] = artist
    data['id_artists'] = artist_id
    data['explicit'] = explicit

    data['release_date'] = release_date
    data['preview_url'] = preview_url
    data['cover_album'] = cover_album

    # turning into dataframe
    df = pd.DataFrame(data)

    #turninge the realise date into datetime
    df['release_date'] = pd.to_datetime(df['release_date'])

    # Turning explicit column in 0 or 1
    df['explicit'] = df['explicit'].map(lambda x: 1 if x == True else 0)
    
    # List of features set
    tracks = []

    # For each track in the 1st dataframe 
    for track_id in df.loc[:,'id']:
        track = f'spotify:track:{track_id}' # Get the track features
        audio_features = sp.audio_features(track) # Save in audio features variable
        tracks.append(audio_features[0]) # Append to the tracks list
        
    # New dataframe with all the tracks in the first dataframe
    df_features =  pd.DataFrame(tracks)
    
    data = df.merge(df_features, on='id')

    # Dropping columns not needed
    data.drop(['type', 'uri', 'track_href', 'analysis_url', 'time_signature'], axis=1, inplace=True)
    
    return data    

In [10]:
# Fucntion that take the follower, genre and the popularity of a determinate artist in the dataframe

def get_followers(df: pd.DataFrame):
    
    # Creating the dataframe base with a dict
    artista = {'id_artists': [], 'genre': [], 'followers': [], 'artist_popularity': []}

    # For each artist id get the id, genre and followers and append to the dict
    for _id in df['id_artists']:
        artist = sp.artist(_id)
        artista['id_artists'].append(artist['id'])
        artista['genre'].append(artist['genres'])
        artista['followers'].append(artist['followers']['total'])
        artista['artist_popularity'].append(artist['popularity'])
    
    new_df = pd.DataFrame(artista)
    new_df['genre'] = new_df['genre'].apply(lambda x: ', '.join(map(str, x)))
    
    data = df.merge(new_df, on='id_artists')
    return data.drop_duplicates()
        

### Getting new songs

Use the `playlist_to_df` function to get playlist into dataframe, then filtering by the date after the end of 2021:
* Get the key on the spotify
* Apply the function to get the playlist as dataframe
* Apply the get `get_followers` function to get the details of the artist
* Do it the first two steps several time and merge in one dataset

In [11]:
df = playlist_to_df('37i9dQZF1DWY4xHQp97fN6')

In [12]:
df = df[df['release_date'] >= '2022-01-01']

In [13]:
df.shape

(98, 21)

In [14]:
df

Unnamed: 0,id,name,popularity,artists,id_artists,explicit,release_date,preview_url,cover_album,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,3BGcnN2i4Lm98vNrUcM37j,Essence Fest,53,Curren$y,6X8WdFjrNhXATMDSs26aCc,1,2023-03-17,https://p.scdn.co/mp3-preview/1e9f29d4d2e7a5d8...,https://i.scdn.co/image/ab67616d00001e0291c44f...,0.844,...,7,-4.222,1,0.226,0.043800,0.000000,0.103,0.609,101.009,181594
1,5wG3HvLhF6Y5KTGlK0IW3J,Trance (with Travis Scott & Young Thug),85,Metro Boomin,0iEtIxbK0KxaSlF7G42ZOp,1,2022-12-02,,https://i.scdn.co/image/ab67616d00001e0213e54d...,0.571,...,1,-7.380,0,0.404,0.180000,0.000000,0.168,0.447,119.497,194787
2,0yUaLqhsVsguBpoOPL4cO7,In Ha Mood,85,Ice Spice,3LZZPxNDGDFVSIPqf4JuEf,1,2023-01-06,,https://i.scdn.co/image/ab67616d00001e02cf9b9a...,0.768,...,0,-6.595,1,0.336,0.696000,0.000007,0.230,0.532,141.059,129362
4,0vjeOZ3Ft5jvAi9SBFJm1j,Superhero (Heroes & Villains) [with Future & C...,89,Metro Boomin,0iEtIxbK0KxaSlF7G42ZOp,1,2022-12-02,,https://i.scdn.co/image/ab67616d00001e0213e54d...,0.526,...,5,-5.300,0,0.259,0.152000,0.000002,0.194,0.492,116.622,182667
5,1bDbXMyjaUIooNwFE9wn0N,Rich Flex,90,Drake,3TVXtAsR1Inumwj472S9r4,1,2022-11-04,,https://i.scdn.co/image/ab67616d00001e0202854a...,0.561,...,11,-9.342,0,0.244,0.050300,0.000002,0.355,0.424,153.150,239360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3GdWfmQBiiJrDUvSZS1bGv,Never Sleep (with Lil Baby feat. Travis Scott),70,NAV,7rkW85dBwwrJtlHRDkJDAC,1,2022-07-29,,https://i.scdn.co/image/ab67616d00001e02d3c60e...,0.819,...,1,-5.829,1,0.169,0.006390,0.000000,0.122,0.501,139.957,185786
96,2TSfStvlAMLw89u3tali79,Dah Dah DahDah,73,Nardo Wick,0Njy6yR9LykNKYg9yE23QN,1,2022-07-22,https://p.scdn.co/mp3-preview/c5e728c3d11987cf...,https://i.scdn.co/image/ab67616d00001e02b61d76...,0.660,...,11,-10.234,1,0.595,0.149000,0.000000,0.106,0.350,167.912,151776
97,3F5CgOj3wFlRv51JsHbxhe,Jimmy Cooks (feat. 21 Savage),86,Drake,3TVXtAsR1Inumwj472S9r4,1,2022-06-17,,https://i.scdn.co/image/ab67616d00001e028dc0d8...,0.529,...,0,-4.711,1,0.175,0.000307,0.000002,0.093,0.366,165.921,218365
98,4L1zI1GJxi0Qq38aSYmsS0,Can't Stop Won't Stop (feat. Kodak Black),64,King Combs,41I5xI04kixwmonDBl0Sda,1,2022-07-22,https://p.scdn.co/mp3-preview/c499375e0c2ad1e3...,https://i.scdn.co/image/ab67616d00001e023697dc...,0.906,...,2,-4.401,1,0.294,0.177000,0.000000,0.110,0.813,96.961,158350


In [15]:
df = get_followers(df)

KeyboardInterrupt: 

In [None]:
df.shape

In [None]:
data = data.merge(df, on=[col for col in df.columns], how='outer').drop_duplicates()

In [None]:
data['popularity'].plot.hist(bins=30)

# RUN THE CELL BELOW TO SAVE THE DATASET

In [None]:
data.to_csv('/home/mdbruchard/code/dmassonfr/Spotify_hit_predictor/raw_data/new_songs.csv')