# Supervised Machine Learning: K-Nearest Neighbors (KNN)

## Importar librerías

In [22]:
# ===== Librerías ==========================================
import warnings

import numpy as np
import pandas as pd   

# Gráficos
# Matplotlib
import matplotlib.pyplot as plt
# Seaborn
import seaborn as sns
# Plotly
import plotly.express as px # Importing Plotly Express for high-level, easy-to-use interactive visualizations
import plotly.graph_objects as go # Importing Plotly Graph Objects for more fine-grained control over visualizations
from plotly.subplots import make_subplots # Multiple graphs in a figure

# Importando itertools para generar combinaciones de columnas
import itertools

# Importando la función seasonal_decompose para la descomposición de series temporales
from statsmodels.tsa.seasonal import seasonal_decompose

# ===== Ajustes de visualización =============================
#Configuramos para que las gráficas se vean dentro del cuaderno
%matplotlib inline
# Establecemos el tamaño de la fuente en los gráficos
plt.rc('font', size=12)
# Ajustamos el tamaño de las figuras
plt.rc('figure', figsize=(12, 5))
#plt.rcParams['figure.figsize'] = [4, 4]

# Configuration to set so that all the Seaborn figures come out with this size
%config Inlinebackend.figure_format= 'retina'

# Set the Seaborn context to "poster" for larger text and figures
sns.set_context("poster")

# Set the default figure size for Seaborn plots
sns.set(rc={"figure.figsize": (12., 6.)})

# Set the Seaborn style to "whitegrid" for a white background with gridlines
sns.set_style("whitegrid")

## Cargar datos

In [23]:
# Load the dataset
df = pd.read_csv('./datasets/kaggle/spotify_cleaned_dataset.csv')
df.shape

(20594, 24)

## Transformaciones y Extracción de características (_Feature extraction_)

In [24]:
df['Licensed'] = df['Licensed'].replace(df['Licensed'].value_counts().index[2:], 'False')
df.Licensed.value_counts()

Licensed
True     14060
False     6534
Name: count, dtype: int64

In [25]:
df['official_video'] = df['official_video'].replace(df['official_video'].value_counts().index[2:], 'False')
df.official_video.value_counts()

official_video
True     15635
False     4959
Name: count, dtype: int64

In [26]:
df['Licensed'] = df['Licensed'].astype(bool)
df['official_video'] = df['official_video'].astype(bool)
df.dtypes

Artist               object
Track                object
Album                object
Album_type           object
Danceability        float64
Energy              float64
Loudness            float64
Speechiness         float64
Acousticness        float64
Instrumentalness    float64
Liveness            float64
Valence             float64
Tempo               float64
Duration_min        float64
Title                object
Channel              object
Views               float64
Likes               float64
Comments            float64
Licensed               bool
official_video         bool
Stream              float64
EnergyLiveness      float64
most_playedon        object
dtype: object

## Prueba de KNN

In [27]:
# mask = df['Album_type']=='single'
# df_knn = df[mask]
# print(df_knn.shape)

# df = df.sample(100).reset_index()
# print(df.shape)
# print(df.index)

df = df.sample(100)
print(df.shape)
print(df.index)

(100, 24)
Index([ 8563, 13350,   504, 15233,  4851, 18228,  1881, 18156, 12937,  1245,
       11670, 10845,  5522, 19854, 18538, 13234,  4677, 12012, 18831, 14446,
       10387, 20167, 11806,  3679, 14972, 15053, 13257,   653, 13694, 17051,
       11600, 13080,  3127,  7118, 15628,  2348, 15904, 17539,  5683,  4230,
       15607, 18932, 15671,  9945, 17794,  6087,  3253,  5012, 10959, 11965,
        1434,  4449, 19177,  3465, 15331,  8552,  3481, 14584, 19032, 16223,
        2221,  3262,  4902, 10355,  6704,  9014,  5133,  4903, 18249, 13968,
       18755, 15515,  3731, 10542,  4850, 12073,  3148,  8486, 19829, 10090,
       11850, 17040,  6011,  9322, 16672,  4129, 19166, 12984,  8293,  2884,
        9967,   815, 16169, 17527,  3141,  7507,   420, 19399, 14319, 16515],
      dtype='int64')


In [31]:
df.loc[:,'escuchada'] = pd.Series(np.zeros(len(df.index)), index=df.index)

# df['escuchada'] = np.zeros(len(df.index))

# df['escuchada'] = pd.Series(np.zeros(len(df.index)), index=df.index)

In [32]:
df.head()

Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,Channel,Views,Likes,Comments,Licensed,official_video,Stream,EnergyLiveness,most_playedon,escuchada
8563,Beastie Boys,So What'Cha Want,Anthology: The Sounds Of Science,compilation,0.695,0.978,-3.556,0.274,0.0764,3.1e-05,...,BeastieBoysVEVO,36075350.0,235541.0,13561.0,True,True,28842105.0,14.173913,Youtube,0.0
13350,Soundgarden,Fell On Black Days,Superunknown (Deluxe Edition),album,0.574,0.739,-5.745,0.0271,0.00152,0.000328,...,SoundgardenVEVO,60404456.0,356461.0,17537.0,True,True,130040777.0,5.241135,Spotify,0.0
504,Pink Floyd,Breathe (In the Air),The Dark Side of the Moon,album,0.431,0.373,-15.142,0.0346,0.389,0.728,...,Pink Floyd,486448.0,8034.0,238.0,True,True,0.0,2.608392,Youtube,0.0
15233,A$AP Rocky,Potato Salad,Potato Salad,single,0.538,0.612,-6.02,0.424,0.308,0.0,...,ASAPROCKYUPTOWN,65655617.0,883404.0,16705.0,True,True,223928278.0,2.873239,Spotify,0.0
4851,The Chicks,Travelin' Soldier,Home,album,0.569,0.365,-7.047,0.0265,0.691,0.0,...,dixiechicksVEVO,49724135.0,157675.0,9767.0,True,True,84023468.0,2.147059,Spotify,0.0


In [None]:
df_knn = pd.concat([df_knn, pd.Series(valores_normales)], axis=1)
df_knn = df_knn.rename(columns={0: 'num_times_played'})
df_knn

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_knn[df_knn['num_times_played']] = pd.Series(valores_normales)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_knn[df_knn['num_times_played']] = pd.Series(valores_normales)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_knn[df_knn['num_times_played']] = pd.Series(valores_normales)
A value is

In [None]:
df_knn

Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,Channel,Views,Likes,Comments,Licensed,official_video,Stream,EnergyLiveness,most_playedon,num_times_played
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,0.695,0.923,-3.930,0.0522,0.042500,0.046900,...,Gorillaz,8435055.0,282142.0,7399.0,True,True,63063467.0,7.956897,Spotify,10.0
6,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown) ...,New Gold (feat. Tame Impala and Bootie Brown) ...,single,0.716,0.897,-7.185,0.0629,0.012000,0.262000,...,Dom Dolla,451996.0,11686.0,241.0,True,True,10666154.0,2.760000,Spotify,22.0
8,Gorillaz,Cracker Island (feat. Thundercat),Cracker Island (feat. Thundercat),single,0.741,0.913,-3.340,0.0465,0.003430,0.103000,...,Gorillaz,24459820.0,739527.0,20296.0,True,True,42671901.0,2.809231,Spotify,28.0
29,50 Cent,Best Friend - Remix,Best Friend (Remix),single,0.545,0.640,-3.529,0.3080,0.368000,0.000000,...,Lightning,291023.0,5729.0,46.0,True,True,160037992.0,5.765766,Spotify,22.0
37,Metallica,Lux Æterna,Lux Æterna,single,0.386,0.996,-2.960,0.0754,0.000012,0.000836,...,Metallica,14937918.0,473751.0,46230.0,True,True,20061385.0,8.586207,Spotify,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4968,,,,,,,,,,,...,,,,,,,,,,21.0
4969,,,,,,,,,,,...,,,,,,,,,,12.0
4970,,,,,,,,,,,...,,,,,,,,,,28.0
4971,,,,,,,,,,,...,,,,,,,,,,23.0


In [45]:
counts = df_knn[['Artist']].value_counts()
counts

Artist              
DVRST                   10
Prezioso                10
The Living Tombstone    10
Prashant Katheriya      10
TheFatRat               10
                        ..
Ha*Ash                   1
Haftbefehl               1
Rick Astley              1
Hank Williams, Jr.       1
Fabolous                 1
Name: count, Length: 1350, dtype: int64

In [50]:
df_knn[df_knn['Artist']=='Rick Astley']

Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,Channel,Views,Likes,Comments,Licensed,official_video,Stream,EnergyLiveness,most_playedon,num_times_played
9084,Rick Astley,Love this Christmas,Love this Christmas,single,0.483,0.847,-3.57,0.0419,0.116,0.0,...,Rick Astley,1769326.0,120756.0,15474.0,True,True,5723679.0,11.980198,Spotify,


In [52]:
# Dejaremos a cero los artistas que no nos gustan
df_knn['num_times_played'] = df_knn.apply(lambda row: 0 if row['Artist'] in ['Jonas Brothers','Rick Astley','Hank Williams, Jr.','Haftbefehl','The Living Tombstone','Prashant Katheriya','Prezioso','DVRST','TheFatRat'] else row['num_times_played'], axis=1)

In [53]:
df_knn.describe()

Unnamed: 0,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Views,Likes,Comments,Stream,EnergyLiveness,num_times_played
count,4973.0,4973.0,4973.0,4973.0,4973.0,4973.0,4973.0,4973.0,4973.0,4973.0,4973.0,4973.0,4973.0,4973.0,4973.0,5012.0
mean,0.665431,0.668173,-6.521096,0.104177,0.264973,0.043085,0.188658,0.529853,121.229033,3.482984,80678420.0,695142.2,26434.38,98961870.0,5.440672,24.144254
std,0.145695,0.187678,3.35938,0.0964,0.261471,0.162262,0.15861,0.231061,27.866394,1.631099,243630900.0,1831535.0,134835.7,196946700.0,4.11599,7.513796
min,0.0,0.00342,-36.062,0.0,6e-06,0.0,0.0145,0.0,0.0,1.002,0.0,0.0,0.0,0.0,0.030811,0.0
25%,0.58,0.554,-7.645,0.041,0.0483,0.0,0.0951,0.351,99.238,2.778667,1042952.0,19005.0,443.0,8057545.0,2.6875,19.0
50%,0.683,0.692,-5.868,0.0619,0.173,1e-06,0.125,0.535,121.066,3.291233,10028450.0,130993.0,2806.0,30846530.0,4.628319,24.0
75%,0.774,0.811,-4.561,0.129,0.417,0.000354,0.228,0.714,137.506,3.864217,60872430.0,591839.0,14893.0,96301700.0,7.048544,29.0
max,0.975,0.997,0.92,0.885,0.996,1.0,0.984,0.981,236.059,68.670967,5773798000.0,40147670.0,5331537.0,2456205000.0,59.113924,49.0


In [56]:
columnas_numericas=df_knn.describe().columns.tolist()
print(columnas_numericas)

['Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_min', 'Views', 'Likes', 'Comments', 'Stream', 'EnergyLiveness', 'num_times_played']


In [61]:
# df_knn.isna().sum()
df_knn.shape

(9354, 25)

In [57]:
#Check the data format.
subset = ['Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_min', 'Views', 'Likes', 'Comments', 'Stream', 'EnergyLiveness']
X = df_knn[subset]
y = df_knn['num_times_played']

In [58]:
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=10)
knn.fit(X,y)

ValueError: Input X contains NaN.
KNeighborsClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values