# Spotify: Popularity Prediction

In [1]:
import pandas as pd

# Load the dataset

df = pd.read_csv('data/Spotify_Songs.csv')

## Preprocessing

### General Info

In [2]:
# General Info
print('Dataset Info:')
df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   index             114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  livene

### Missing Values 

In [4]:
# Remove rows with Missing Values
df = df.dropna()

# # Verify that missing values are removed
# print('Dataset Info without Missing Values:')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   index             113999 non-null  int64  
 1   track_id          113999 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        113999 non-null  int64  
 6   duration_ms       113999 non-null  int64  
 7   explicit          113999 non-null  bool   
 8   danceability      113999 non-null  float64
 9   energy            113999 non-null  float64
 10  key               113999 non-null  int64  
 11  loudness          113999 non-null  float64
 12  mode              113999 non-null  int64  
 13  speechiness       113999 non-null  float64
 14  acousticness      113999 non-null  float64
 15  instrumentalness  113999 non-null  float64
 16  liveness          113999 

### Genre Summarization

In [None]:
# keep original_track_genre

# add new superior_track_genre

### Duplicate Values

In [None]:
# find duplicates in df[track_id]

# find all superior genres for one track_id (a df with columns track_id and superior_track_genre)

# create new df with identical structure as old df (insert all superior_track_genre in this column, separate values with ",")

# replace songs with all superior_track_genres for single track_genre values songs in original df

# expand all superior_track_genre to binary system 1/0 for each genre

# fill in "1" and "0"

# make sure there aren't any additional duplicate values:
# Remove Dupliate Values WE HAVE TO make sure which songs are identical but have the same genre
#df = df.drop_duplicates()

# # Verify that duplicates are removed
# print('Dataset Info without Duplicates:')
#df.info()

### Irrelevant Features

In [6]:
# How do the textual attributes look like?
print('Textual Info:')
df.describe(include='object')

# Remove "artists", "album_name" and "track_name"  features (KEEP index, track_id)
df = df.drop(columns=['artists', 'album_name', 'track_name'])

# # Verify that irrelevant features are removed
# print('Dataset Info without Irrelevant Features:')
df.info()

Textual Info:
<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   index             113999 non-null  int64  
 1   track_id          113999 non-null  object 
 2   popularity        113999 non-null  int64  
 3   duration_ms       113999 non-null  int64  
 4   explicit          113999 non-null  bool   
 5   danceability      113999 non-null  float64
 6   energy            113999 non-null  float64
 7   key               113999 non-null  int64  
 8   loudness          113999 non-null  float64
 9   mode              113999 non-null  int64  
 10  speechiness       113999 non-null  float64
 11  acousticness      113999 non-null  float64
 12  instrumentalness  113999 non-null  float64
 13  liveness          113999 non-null  float64
 14  valence           113999 non-null  float64
 15  tempo             113999 non-null  float64
 16  time_signat

### Outliers

In [7]:
# Choose rows with a duration of <1min & >10min, a time signature and tempo of 0
drop_clause = (df['duration_ms'] < 60000) | (df['duration_ms'] > 600000) | (df['time_signature'] == 0) | (df['tempo'] == 0)

# Find the index of that condition in our dataset
drop_index = df[drop_clause].index

# Drop those rows
df = df.drop(drop_index)


# Verify that outliers are removed
print('Dataset without Outliers:')
df.info()

Dataset without Outliers:
<class 'pandas.core.frame.DataFrame'>
Index: 112392 entries, 0 to 113999
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   index             112392 non-null  int64  
 1   track_id          112392 non-null  object 
 2   popularity        112392 non-null  int64  
 3   duration_ms       112392 non-null  int64  
 4   explicit          112392 non-null  bool   
 5   danceability      112392 non-null  float64
 6   energy            112392 non-null  float64
 7   key               112392 non-null  int64  
 8   loudness          112392 non-null  float64
 9   mode              112392 non-null  int64  
 10  speechiness       112392 non-null  float64
 11  acousticness      112392 non-null  float64
 12  instrumentalness  112392 non-null  float64
 13  liveness          112392 non-null  float64
 14  valence           112392 non-null  float64
 15  tempo             112392 non-null  float64
 16 