In [1]:
import matplotlib.pyplot as plt
import pandas as pd 
import spotipy 
import librosa
import librosa.display
import numpy as np
import matplotlib as mpl
import urllib.request
import seaborn as sns
import tqdm
import scipy
import joblib
import os.path
from pathlib import Path
from IPython.display import Audio, Markdown, Image
from spotipy.oauth2 import SpotifyClientCredentials

# Números y Datos
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.interpolate import interp1d

# Análisis de sonido
import librosa
import librosa.display
import spotipy 
from spotipy.oauth2 import SpotifyClientCredentials 

# Machine learning
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

from sklearn.preprocessing import quantile_transform

# Styles
sns.set_context('poster')
sns.set_style('darkgrid')

plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 100 # 200 e.g. is really fine, but slower

In [2]:
# Extraemos los dataframes de nuestros files
df_tracks = pd.read_pickle('../sources/tracks.pickle')
df_af = pd.read_pickle('../sources/audio_features.pickle')
df_aa = pd.read_pickle('../sources/audio_analysis.pickle') \
    .set_index(['id','start']) \
    .sort_values(by=['id', 'start'], ascending=True, na_position='first')

In [3]:
# Generamos un dataframe combinando la data de tracks y de audio features
df_merged = df_tracks.merge(
    df_af, 
    on='id', 
    how='left'
)
df_merged = df_merged.drop(
    [
     'type_x',
     'type_y',
     'uri_x',
     'uri_y',
     'track_href',
     'analysis_url',
     'href',
     'preview_url',
     'external_ids',
     'duration_ms_y'
    ], 
    1
)

df_merged.rename(columns = {"duration_ms_x": "duration_ms"}, inplace = True)

In [4]:
# Extraemos para cada registro la variable release_date y la asignamos a una nueva columna en nuestro dataframe.

# Función que generamos para formatear la fecha
def date_formator(date):
    if '-' in date:
        year = pd.to_datetime(date, format = '%Y-%m-%d').year
        return int(year)
    elif int(date)>0:
        year = pd.to_datetime(date, format = '%Y').year
        return int(year)
    return None


df_merged['release_date'] = df_merged['album'].map(lambda x: x['release_date'])
df_merged['release_date'] = df_merged['release_date'].apply(date_formator)
df_merged['release_date'].head()

id
4d3XHYFFuYYzxWr2cJ6yQl    1991.0
3X3p3u03P8eFL8WTH0oaaU    1958.0
4KlXeaheot0OI9PoOWspvZ    2014.0
3j76McoUI18gDoqxYE14Bu    2014.0
4zxd4tiXPlWMqoJltbVTbE    1959.0
Name: release_date, dtype: float64

In [5]:
# Obtengo ids de album
df_merged['album_id'] = df_merged['album'].map(lambda x: x['id'])

# Agrego artistas y género
df_merged[["artists","genre"]] =  df_merged.loc[:,["artists","genre"]] 

In [6]:
# Vemos cuales son nuestras columnas
df_merged.columns.values

array(['album', 'artists', 'available_markets', 'disc_number',
       'duration_ms', 'explicit', 'external_urls', 'is_local', 'name',
       'popularity', 'track_number', 'genre', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'release_date', 'album_id'], dtype=object)

In [7]:
df_merged.sample(3).T

id,0PbkNsBkzlItqOC85tXCjH,4ASq8Q9pF05YGHxqL0PveM,6Sy9BUbgFse0n0LPA5lwy5
album,"{'album_type': 'ALBUM', 'artists': [{'external...","{'album_type': 'ALBUM', 'artists': [{'external...","{'album_type': 'ALBUM', 'artists': [{'external..."
artists,Sied Van Riel,Samuel Barber,Darude
available_markets,[],"[AD, AE, AL, AR, AT, AU, BA, BE, BG, BH, BO, B...","[AD, AE, AL, AR, AT, AU, BA, BE, BG, BH, BO, B..."
disc_number,1,1,1
duration_ms,469975,278426,225493
explicit,False,False,False
external_urls,{'spotify': 'https://open.spotify.com/track/0P...,{'spotify': 'https://open.spotify.com/track/4A...,{'spotify': 'https://open.spotify.com/track/6S...
is_local,False,False,False
name,Rush,"Piano Sonata, Op. 26: IV. Fuga: Allegro con sp...",Sandstorm
popularity,0,5,72


Desdoblamos los features de pitches y timbre en distintas columnas

In [8]:
df_m2 = df_aa

pitches_nombre = ['p00_C','p01_C#','p02_D','p03_D#','p04_E','p05_F',
                 'p06_F#','p07_G','p08_G#','p09_A','p10_A#','p11_B']
df1 = pd.DataFrame(df_m2['pitches'].tolist(), columns=pitches_nombre, index=df_m2.index)
df_m2 = pd.concat([df_m2,df1],axis=1)

timbre_nombre = ['t00','t01','t02','t03','t04','t05',
                 't06','t07','t08','t09','t10','t11']
df1 = pd.DataFrame(df_m2['timbre'].tolist(), columns=timbre_nombre, index=df_m2.index)
df_m2 = pd.concat([df_m2,df1],axis=1).drop(['pitches','timbre'],1)

df1 = None
df_m2.head(3).T

id,00At7PWydsvg7g5xgaYan9,00At7PWydsvg7g5xgaYan9,00At7PWydsvg7g5xgaYan9
start,0.00000,0.24381,0.41873
duration,0.24381,0.17492,0.17329
confidence,0.0,1.0,1.0
loudness_start,-60.0,-60.0,-34.279
loudness_max_time,0.0,0.01436,0.01554
loudness_max,-60.0,-12.457,-12.728
loudness_end,0.0,0.0,0.0
p00_C,1.0,0.121,0.032
p01_C#,1.0,0.996,1.0
p02_D,1.0,0.08,0.06
p03_D#,1.0,0.104,0.059


Extraemos los datos de MEDIANA + IQR (RANGO INTERCUARTIL)

In [9]:
grouped = df_m2.drop(['duration'],1).groupby(level='id')
medianas = grouped.quantile(.5)
iqr = grouped.quantile(.75)-grouped.quantile(.25)
medias = grouped.mean()
desvios = grouped.std()

In [10]:
df_pitches_timbres_medias_std = pd.merge(medias, desvios, on='id', suffixes=('_media', '_std'))
df_pitches_timbres_medianas_iqr = pd.merge(medianas, iqr, on='id', suffixes=('_mediana', '_iqr'))

Y ahora hacemos lo mismo para la derivada de las funciones

In [12]:
df_m2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,duration,confidence,loudness_start,loudness_max_time,loudness_max,loudness_end,p00_C,p01_C#,p02_D,p03_D#,...,t02,t03,t04,t05,t06,t07,t08,t09,t10,t11
id,start,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
00At7PWydsvg7g5xgaYan9,0.0,0.24381,0.0,-60.0,0.0,-60.0,0.0,1.0,1.0,1.0,1.0,...,9.469,-28.48,57.491,-50.067,14.833,5.359,-27.228,0.973,-10.64,-7.228
00At7PWydsvg7g5xgaYan9,0.24381,0.17492,1.0,-60.0,0.01436,-12.457,0.0,0.121,0.996,0.08,0.104,...,-30.613,137.462,36.96,-51.024,-18.745,-28.822,-7.17,-1.994,-15.695,9.411
00At7PWydsvg7g5xgaYan9,0.41873,0.17329,1.0,-34.279,0.01554,-12.728,0.0,0.032,1.0,0.06,0.059,...,-22.798,149.223,31.833,-53.446,-36.485,-39.479,-25.795,-1.903,-21.908,15.934
00At7PWydsvg7g5xgaYan9,0.59202,0.17351,1.0,-32.474,0.01059,-10.984,0.0,0.286,1.0,0.237,0.277,...,9.159,164.086,33.475,-83.62,-51.036,-23.505,-19.636,8.674,-18.95,6.718
00At7PWydsvg7g5xgaYan9,0.76553,0.17569,1.0,-32.786,0.01254,-9.48,0.0,0.064,0.878,0.028,0.129,...,-50.706,130.054,-4.597,-82.764,-32.596,-48.432,-21.086,20.443,-27.538,25.327
00At7PWydsvg7g5xgaYan9,0.94122,0.16617,1.0,-29.257,0.01003,-14.634,0.0,0.153,0.89,0.07,0.057,...,-31.294,78.563,17.387,-83.835,-39.964,-32.097,-13.299,-3.069,-10.472,22.175
00At7PWydsvg7g5xgaYan9,1.10739,0.17483,1.0,-28.415,0.01421,-10.546,0.0,0.209,1.0,0.121,0.116,...,4.682,152.144,66.114,-68.732,-40.88,-20.27,4.995,7.442,-24.421,14.912
00At7PWydsvg7g5xgaYan9,1.28222,0.15841,1.0,-31.659,0.013,-7.907,0.0,0.056,1.0,0.061,0.114,...,-36.092,129.86,38.891,-85.234,-36.44,-35.347,-1.687,10.804,-22.895,47.283
00At7PWydsvg7g5xgaYan9,1.44063,0.18553,1.0,-28.176,0.02452,-8.306,0.0,0.059,1.0,0.041,0.096,...,-20.439,127.598,60.108,35.909,9.787,-46.725,-5.205,-14.752,27.495,-0.86
00At7PWydsvg7g5xgaYan9,1.62617,0.17361,0.973,-27.416,0.01273,-13.883,0.0,0.406,1.0,0.292,0.388,...,-19.722,113.905,40.764,-74.562,-29.151,-4.066,9.487,-7.74,-6.329,1.813


In [13]:
dff = df_m2
shifted = dff.groupby(level="id").shift(-1)
dif = shifted.drop(['duration'],1).subtract(dff.drop(['duration'],1), fill_value=0)
diferencias = dif.divide(dff['duration'], axis="index")
diferencias.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,confidence,loudness_start,loudness_max_time,loudness_max,loudness_end,p00_C,p01_C#,p02_D,p03_D#,p04_E,...,t02,t03,t04,t05,t06,t07,t08,t09,t10,t11
id,start,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
00At7PWydsvg7g5xgaYan9,0.0,4.101554,0.0,0.058898,195.000205,0.0,-3.605266,-0.016406,-3.77343,-3.674993,0.0,...,-164.398507,680.620155,-84.209015,-3.925188,-137.721997,-140.195234,82.26898,-12.169312,-20.733358,68.245765
00At7PWydsvg7g5xgaYan9,0.24381,0.0,147.044363,0.006746,-1.54928,0.0,-0.508804,0.022868,-0.114338,-0.25726,-3.61308,...,44.677567,67.236451,-29.310542,-13.84633,-101.417791,-60.924994,-106.477247,0.520238,-35.519094,37.291333
00At7PWydsvg7g5xgaYan9,0.41873,0.0,10.416066,-0.028565,10.064054,0.0,1.465751,0.0,1.021409,1.258007,1.88124,...,184.413411,85.769519,9.475446,-174.1243,-83.969069,92.180737,35.541578,61.036413,17.069652,-53.182526
00At7PWydsvg7g5xgaYan9,0.59202,0.0,-1.798167,0.011239,8.668088,0.0,-1.279465,-0.70313,-1.204542,-0.852977,1.763587,...,-345.023342,-196.138551,-219.422512,4.933433,106.276295,-143.663189,-8.356867,67.828944,-49.495706,107.250303
00At7PWydsvg7g5xgaYan9,0.76553,0.0,20.086516,-0.014287,-29.335762,0.0,0.506574,0.068302,0.239057,-0.409813,-2.942683,...,110.490068,-293.078718,125.129489,-6.095964,-41.937504,92.976265,44.322386,-133.826626,97.137003,-17.940691
00At7PWydsvg7g5xgaYan9,0.94122,0.0,5.0671,0.025155,24.601312,0.0,0.337004,0.661973,0.306915,0.355058,-0.096287,...,216.501173,442.805561,293.235843,90.888849,-5.512427,71.174099,110.092074,63.254498,-83.944154,-43.708251
00At7PWydsvg7g5xgaYan9,1.10739,0.0,-18.555168,-0.006921,15.094663,0.0,-0.875136,0.0,-0.343191,-0.01144,0.228794,...,-233.220843,-127.460962,-155.711262,-94.388835,25.396099,-86.23806,-38.219985,19.230109,8.728479,185.15701
00At7PWydsvg7g5xgaYan9,1.28222,0.0,21.987248,0.072723,-2.51878,0.0,0.018938,0.0,-0.126255,-0.113629,0.782779,...,98.813206,-14.279402,133.937251,764.743387,291.818698,-71.826274,-22.208194,-161.328199,318.098605,-303.913894
00At7PWydsvg7g5xgaYan9,1.44063,-0.145529,4.096373,-0.063548,-30.059829,0.0,1.870317,0.0,1.352881,1.573869,1.040263,...,3.864604,-73.804776,-104.263461,-595.434701,-209.874414,229.930469,79.189349,37.794427,-182.310139,14.407373
00At7PWydsvg7g5xgaYan9,1.62617,0.155521,-20.782213,-0.006336,19.382524,0.0,-1.774091,0.0,-1.123207,-1.866252,-3.19682,...,324.9352,381.850124,240.176257,-16.560106,-184.862623,-127.383215,-55.832037,128.14354,-140.372098,124.601117


In [14]:
grouped_dif = diferencias.groupby(level='id')
medianas = grouped_dif.quantile(.5)
iqr = grouped_dif.quantile(.75)-grouped.quantile(.25)
medias = grouped_dif.mean()
desvios = grouped_dif.std()

In [15]:
dif_pitches_timbres_medias_std = pd.merge(medias, desvios, on='id', suffixes=('_dif_media', '_dif_std'))
dif_pitches_timbres_medianas_iqr = pd.merge(medianas, iqr, on='id', suffixes=('_dif_mediana', '_dif_iqr'))
len(dif_pitches_timbres_medias_std)

4177

Y lo unimos todo al otro dataset

In [46]:
df_merged2 = df_merged.select_dtypes(['number']).drop(['disc_number','track_number','release_date'],1)

df_merge_af = df_pitches_timbres_medianas_iqr.merge(
    dif_pitches_timbres_medianas_iqr,
    on='id', 
    how='inner'
)

df_todo = df_merged2.add_prefix('af_').merge(
    df_merge_af.add_prefix('aa_'),
    on='id', 
    how='inner'
)
df_todo['genre']=df_merged['genre']

list(df_todo.columns)

['af_duration_ms',
 'af_popularity',
 'af_danceability',
 'af_energy',
 'af_key',
 'af_loudness',
 'af_mode',
 'af_speechiness',
 'af_acousticness',
 'af_instrumentalness',
 'af_liveness',
 'af_valence',
 'af_tempo',
 'af_time_signature',
 'aa_confidence_mediana',
 'aa_loudness_start_mediana',
 'aa_loudness_max_time_mediana',
 'aa_loudness_max_mediana',
 'aa_loudness_end_mediana',
 'aa_p00_C_mediana',
 'aa_p01_C#_mediana',
 'aa_p02_D_mediana',
 'aa_p03_D#_mediana',
 'aa_p04_E_mediana',
 'aa_p05_F_mediana',
 'aa_p06_F#_mediana',
 'aa_p07_G_mediana',
 'aa_p08_G#_mediana',
 'aa_p09_A_mediana',
 'aa_p10_A#_mediana',
 'aa_p11_B_mediana',
 'aa_t00_mediana',
 'aa_t01_mediana',
 'aa_t02_mediana',
 'aa_t03_mediana',
 'aa_t04_mediana',
 'aa_t05_mediana',
 'aa_t06_mediana',
 'aa_t07_mediana',
 'aa_t08_mediana',
 'aa_t09_mediana',
 'aa_t10_mediana',
 'aa_t11_mediana',
 'aa_confidence_iqr',
 'aa_loudness_start_iqr',
 'aa_loudness_max_time_iqr',
 'aa_loudness_max_iqr',
 'aa_loudness_end_iqr',
 'aa_p

In [47]:
df_todo.to_pickle("../sources/df_todo_0_sin_escalar.pickle")

df_todo.head(3).describe().T.head(30)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
af_duration_ms,3.0,347470.666667,2979.286044,344066.0,346406.0,348746.0,349173.0,349600.0
af_popularity,3.0,36.333333,20.231988,13.0,30.0,47.0,48.0,49.0
af_danceability,3.0,0.631333,0.153155,0.487,0.551,0.615,0.7035,0.792
af_energy,3.0,0.216667,0.078768,0.145,0.1745,0.204,0.2525,0.301
af_key,3.0,7.0,2.0,5.0,6.0,7.0,8.0,9.0
af_loudness,3.0,-14.89,4.491317,-18.67,-17.3725,-16.075,-13.0,-9.925
af_mode,3.0,0.666667,0.57735,0.0,0.5,1.0,1.0,1.0
af_speechiness,3.0,0.045667,0.005853,0.0409,0.0424,0.0439,0.04805,0.0522
af_acousticness,3.0,0.764667,0.189381,0.546,0.709,0.872,0.874,0.876
af_instrumentalness,3.0,0.675,0.253746,0.382,0.6015,0.821,0.8215,0.822


In [70]:
from sklearn.preprocessing import MinMaxScaler

df_para_escalar = df_todo.select_dtypes(['number'])

col_af = [col for col in df_para_escalar if col.startswith('af')]
col_aa = [col for col in df_para_escalar if col.startswith('aa')]

scaler = MinMaxScaler(feature_range=(0,1))
scaled_af = pd.DataFrame(scaler.fit_transform(df_para_escalar.loc[:, col_af]),columns=col_af,index=df_para_escalar.index)
scaler = MinMaxScaler(feature_range=(0,1))
scaled_aa = pd.DataFrame(scaler.fit_transform(df_para_escalar.loc[:, col_aa]),columns=col_aa,index=df_para_escalar.index)

scaled_aa.head()

df_escalado = pd.concat([scaled_af, scaled_aa], axis=1)
df_escalado['genre']=df_todo['genre']

df_escalado.to_pickle("../sources/df_todo_1_MinMaxScaled.pickle")

df_escalado.describe().T.head(30)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
af_duration_ms,4082.0,0.071738,0.044974,0.0,0.045713,0.062317,0.085867,1.0
af_popularity,4082.0,0.220605,0.240983,0.0,0.0,0.162791,0.383721,1.0
af_danceability,4082.0,0.467091,0.196402,0.0,0.312896,0.476216,0.613901,1.0
af_energy,4082.0,0.552175,0.340881,0.0,0.217982,0.578913,0.897736,1.0
af_key,4082.0,0.471182,0.321789,0.0,0.181818,0.454545,0.727273,1.0
af_loudness,4082.0,0.736707,0.164114,0.0,0.641785,0.784772,0.864777,1.0
af_mode,4082.0,0.644047,0.47886,0.0,0.0,1.0,1.0,1.0
af_speechiness,4082.0,0.077912,0.066009,0.0,0.044224,0.054492,0.081884,1.0
af_acousticness,4082.0,0.416802,0.420862,0.0,0.001923,0.218876,0.908635,1.0
af_instrumentalness,4082.0,0.387383,0.394439,0.0,0.000741,0.211616,0.841414,1.0


# ESCALAMIENTO DE FEATURES

Y LO NORMALIZAMOS (al menos hasta donde puede hacerse de una manera sencilla)

Primero con MinMaxScaler

Y despues con Normal Quantiles

In [69]:
from sklearn.preprocessing import QuantileTransformer

df_para_escalar = df_todo.select_dtypes(['number'])

col_af = [col for col in df_para_escalar if col.startswith('af')]
col_aa = [col for col in df_para_escalar if col.startswith('aa')]

scaler = QuantileTransformer(n_quantiles=100, output_distribution = 'normal', random_state = 0)
scaled_af = pd.DataFrame(scaler.fit_transform(df_para_escalar.loc[:, col_af]),columns=col_af,index=df_para_escalar.index)
scaler = QuantileTransformer(n_quantiles=100, output_distribution = 'normal', random_state = 0)
scaled_aa = pd.DataFrame(scaler.fit_transform(df_para_escalar.loc[:, col_aa]),columns=col_aa,index=df_para_escalar.index)

scaled_aa.head()

df_escalado = pd.concat([scaled_af, scaled_aa], axis=1)
df_escalado['genre']=df_todo['genre']

df_escalado.to_pickle("../sources/df_todo_1_Normalized.pickle")

df_escalado.describe().T.head(30)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
af_duration_ms,4082.0,-0.001851,0.9926864,-5.199338,-0.678479,0.000789,0.67282,5.199338
af_popularity,4082.0,-1.769321,2.929961,-5.199338,-5.199338,0.01266,0.666564,5.199338
af_danceability,4082.0,0.000725,0.9962546,-5.199338,-0.673879,-0.000275,0.671317,5.199338
af_energy,4082.0,-0.000224,1.003355,-5.199338,-0.67449,-0.001582,0.675625,5.199338
af_key,4082.0,-0.173515,2.295738,-5.199338,-0.619855,-0.050661,0.63527,5.199338
af_loudness,4082.0,-0.001218,0.9945427,-5.199338,-0.674745,0.000878,0.676413,5.199338
af_mode,4082.0,1.497898,4.979507,-5.199338,-5.199338,5.199338,5.199338,5.199338
af_speechiness,4082.0,3.7e-05,0.9914065,-5.199338,-0.677141,0.000342,0.675842,5.199338
af_acousticness,4082.0,0.004234,1.022113,-5.199338,-0.673852,-0.00203,0.678468,5.199338
af_instrumentalness,4082.0,-0.273228,1.68235,-5.199338,-0.673508,-0.000762,0.675625,5.199338


In [71]:
from sklearn.preprocessing import QuantileTransformer

df_para_escalar = df_todo.select_dtypes(['number'])

col_af = [col for col in df_para_escalar if col.startswith('af')]
col_aa = [col for col in df_para_escalar if col.startswith('aa')]

scaler = QuantileTransformer(n_quantiles=100, output_distribution = 'uniform', random_state = 0)
scaled_af = pd.DataFrame(scaler.fit_transform(df_para_escalar.loc[:, col_af]),columns=col_af,index=df_para_escalar.index)
scaler = QuantileTransformer(n_quantiles=100, output_distribution = 'uniform', random_state = 0)
scaled_aa = pd.DataFrame(scaler.fit_transform(df_para_escalar.loc[:, col_aa]),columns=col_aa,index=df_para_escalar.index)

scaled_aa.head()

df_escalado = pd.concat([scaled_af, scaled_aa], axis=1)
df_escalado['genre']=df_todo['genre']

df_escalado.to_pickle("../sources/df_todo_1_Uniform.pickle")

df_escalado.describe().T.head(30)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
af_duration_ms,4082.0,0.499949,0.288585,0.0,0.248734,0.500315,0.749469,1.0
af_popularity,4082.0,0.413886,0.371783,0.0,0.0,0.505051,0.747475,1.0
af_danceability,4082.0,0.499944,0.288671,0.0,0.250194,0.49989,0.74899,1.0
af_energy,4082.0,0.500007,0.288791,0.0,0.25,0.499369,0.750361,1.0
af_key,4082.0,0.495759,0.301745,0.0,0.267677,0.479798,0.737374,1.0
af_loudness,4082.0,0.499943,0.288647,0.0,0.249919,0.50035,0.750611,1.0
af_mode,4082.0,0.644047,0.47886,0.0,0.0,1.0,1.0,1.0
af_speechiness,4082.0,0.500019,0.288623,0.0,0.249158,0.500137,0.75043,1.0
af_acousticness,4082.0,0.500171,0.289,0.0,0.250203,0.49919,0.751263,1.0
af_instrumentalness,4082.0,0.496636,0.294146,0.0,0.250312,0.499696,0.750361,1.0


In [21]:
df_para_escalar = df_todo.select_dtypes(['number'])

quantil=100

df_todo_quantiles_uniform = pd.DataFrame(
    quantile_transform(
        df_para_escalar,
        n_quantiles = quantil,
        output_distribution = 'uniform', 
        random_state = 0, 
        copy = True
    ), columns=df_para_escalar.columns,index=df_para_escalar.index
)
df_todo_quantiles_uniform['genre']=df_todo['genre']

df_todo_quantiles_uniform.to_pickle("../sources/df_todo_3_UniformScaled.pickle")

df_todo_quantiles_uniform.describe().T.head(30)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration_ms,4082.0,0.499949,0.288585,0.0,0.248734,0.500315,0.749469,1.0
popularity,4082.0,0.413886,0.371783,0.0,0.0,0.505051,0.747475,1.0
danceability,4082.0,0.499944,0.288671,0.0,0.250194,0.49989,0.74899,1.0
energy,4082.0,0.500007,0.288791,0.0,0.25,0.499369,0.750361,1.0
key,4082.0,0.495759,0.301745,0.0,0.267677,0.479798,0.737374,1.0
loudness,4082.0,0.499943,0.288647,0.0,0.249919,0.50035,0.750611,1.0
mode,4082.0,0.644047,0.47886,0.0,0.0,1.0,1.0,1.0
speechiness,4082.0,0.500019,0.288623,0.0,0.249158,0.500137,0.75043,1.0
acousticness,4082.0,0.500171,0.289,0.0,0.250203,0.49919,0.751263,1.0
instrumentalness,4082.0,0.496636,0.294146,0.0,0.250312,0.499696,0.750361,1.0


In [27]:
df_para_escalar = df_todo.select_dtypes(['number'])

quantil=100

from sklearn.preprocessing import PowerTransformer

transformer = PowerTransformer()

df_todo_quantiles_uniform = pd.DataFrame(
    transformer.fit_transform(
        df_para_escalar
    ), columns=df_para_escalar.columns,index=df_para_escalar.index
)
df_todo_quantiles_uniform['genre']=df_todo['genre']

df_todo_quantiles_uniform.to_pickle("../sources/df_todo_4_RobustScaled.pickle")

df_todo_quantiles_uniform.describe().T.head(30)

  loglike = -n_samples / 2 * np.log(x_trans.var())


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration_ms,4082.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
popularity,4082.0,-3.329037e-17,1.000123,-1.120897,-1.120897,0.387648,0.912713,1.560214
danceability,4082.0,2.5484540000000002e-17,1.000123,-2.419822,-0.781548,0.054466,0.75065,2.666903
energy,4082.0,-3.613528e-16,1.000123,-1.598981,-0.983773,0.064734,1.016628,1.326398
key,4082.0,-4.1150600000000004e-17,1.000123,-1.649717,-0.834185,0.067935,0.814126,1.474227
loudness,4082.0,-6.032928e-16,1.000123,-2.571027,-0.777504,0.052242,0.766778,4.327777
mode,4082.0,-1.034123e-15,1.000123,-1.345125,-1.345125,0.743426,0.743426,0.743426
speechiness,4082.0,-4.211341e-16,1.000123,-4.64462,-0.759342,-0.254805,0.683323,2.078511
acousticness,4082.0,-4.966086e-16,1.000123,-1.053993,-1.045835,-0.270137,1.140869,1.263315
instrumentalness,4082.0,-1.195842e-15,1.000123,-1.045863,-1.042498,-0.239238,1.134078,1.353365
