In [3]:
import pandas as pd

# Load the track table and encode the musical mode as a number in one chained
# expression: 'Major' -> 0, 'Minor' -> 1 (labels outside the mapping become NaN).
Spotify_Data = pd.read_csv('Popular_Spotify_Songs.csv').assign(
    mode=lambda frame: frame['mode'].map({'Major': 0, 'Minor': 1})
)

# Preview the first rows to confirm the load and the encoding.
Spotify_Data.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703.0,43,...,125,B,0,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286.0,48,...,92,C#,0,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974.0,94,...,138,F,0,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817.0,116,...,170,A,0,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322.0,84,...,144,A,1,65,23,80,14,63,11,6


In [6]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Columns that go through the imputation pipeline.
# 'mode' was encoded in the loading cell as 0 (Major) / 1 (Minor), so it is a
# binary category stored as a number rather than a continuous quantity.
continuous_columns = ['streams', 'bpm']
binary_columns = ['mode']
numerical_columns = continuous_columns + binary_columns  # ['streams', 'bpm', 'mode']
categorical_columns = ['key']

# Continuous features: replace every missing value with the column mean.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Binary 0/1 feature: replace missing values with the most frequent class.
# A mean would give a fraction between 0 and 1 that the integer cast below
# truncates to 0, silently turning every missing mode into "Major".
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Categorical (string) features: replace missing values with the most
# frequent label.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Route each column group to its transformer. The transformer order defines
# the output column order, so it must stay streams, bpm, mode, key to match
# the column labels assigned below.
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_transformer, continuous_columns),
    ('binary', binary_transformer, binary_columns),
    ('categorical', categorical_transformer, categorical_columns)
])

# Wrap the preprocessing in a pipeline so further steps can be appended later.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit the imputers on Spotify_Data and fill in the missing values.
imputed_values = pipeline.fit_transform(Spotify_Data)

# The transformer returns a plain array; put the column labels back on.
Spotify_Pipeline_Data = pd.DataFrame(
    imputed_values,
    columns=numerical_columns + categorical_columns
)

# Mixing numeric and string columns yields an object-typed array, so restore
# integer dtypes explicitly. Mean-imputed values are rounded to the nearest
# integer first because a bare integer cast truncates toward zero.
Spotify_Pipeline_Data['mode'] = Spotify_Pipeline_Data['mode'].astype(int)
Spotify_Pipeline_Data['bpm'] = Spotify_Pipeline_Data['bpm'].astype(float).round().astype(int)
Spotify_Pipeline_Data['streams'] = Spotify_Pipeline_Data['streams'].astype(float).round().astype(np.int64)

# Preview the imputed frame.
Spotify_Pipeline_Data.head()

Unnamed: 0,streams,bpm,mode,key
0,141381703,125,0,B
1,133716286,92,0,C#
2,140003974,138,0,F
3,800840817,170,0,A
4,303236322,144,1,A
