# Pre-procesamiento

## Importar librerías

In [1]:
# ===== Librerías ==========================================
import warnings

import numpy as np
import pandas as pd   

# Gráficos
# Matplotlib
import matplotlib.pyplot as plt
# Seaborn
import seaborn as sns
# Plotly
import plotly.express as px # Importing Plotly Express for high-level, easy-to-use interactive visualizations
import plotly.graph_objects as go # Importing Plotly Graph Objects for more fine-grained control over visualizations
from plotly.subplots import make_subplots # Multiple graphs in a figure

# Importando itertools para generar combinaciones de columnas
import itertools

# Importando la función seasonal_decompose para la descomposición de series temporales
from statsmodels.tsa.seasonal import seasonal_decompose

# ===== Ajustes de visualización =============================
#Configuramos para que las gráficas se vean dentro del cuaderno
%matplotlib inline
# Establecemos el tamaño de la fuente en los gráficos
plt.rc('font', size=12)
# Ajustamos el tamaño de las figuras
plt.rc('figure', figsize=(12, 5))
#plt.rcParams['figure.figsize'] = [4, 4]

# Configuration to set so that all the Seaborn figures come out with this size
%config Inlinebackend.figure_format= 'retina'

# Set the Seaborn context to "poster" for larger text and figures
sns.set_context("poster")

# Set the default figure size for Seaborn plots
sns.set(rc={"figure.figsize": (12., 6.)})

# Set the Seaborn style to "whitegrid" for a white background with gridlines
sns.set_style("whitegrid")

## Cargar datos

In [2]:
# Read the cleaned Spotify dataset from disk and show its (rows, columns) shape.
dataset_path = './datasets/kaggle/spotify_cleaned_dataset.csv'
df = pd.read_csv(dataset_path)
df.shape

(20594, 24)

## Transformaciones y Extracción de características (_Feature extraction_)

### Column 'Licensed'

In [3]:
# Keep the two most frequent values of 'Licensed' untouched and map every
# other (rarer) value to the string 'False'.
# NOTE(review): assumes the two most frequent values are the True/False
# labels; verify against the raw value counts.
licensed_counts = df['Licensed'].value_counts()
licensed_rare_values = licensed_counts.index[2:]
df['Licensed'] = df['Licensed'].replace(licensed_rare_values, 'False')
df['Licensed'].value_counts()

Licensed
True     14060
False     6534
Name: count, dtype: int64

### Column 'official_video'

In [4]:
# Keep the two most frequent values of 'official_video' untouched and map
# every other (rarer) value to the string 'False'.
# NOTE(review): assumes the two most frequent values are the True/False
# labels; verify against the raw value counts.
official_video_counts = df['official_video'].value_counts()
official_video_rare_values = official_video_counts.index[2:]
df['official_video'] = df['official_video'].replace(official_video_rare_values, 'False')
df['official_video'].value_counts()

official_video
True     15635
False     4959
Name: count, dtype: int64

### Conversion to Boolean

In [5]:
# Convert the two flag columns to real booleans.
# Do NOT use .astype(bool) here: the columns hold the strings 'True'/'False',
# and every non-empty string (including 'False') is truthy, so astype(bool)
# would turn the whole column into True. Compare the text explicitly instead.
# Going through str first also keeps this cell safe to re-run once the
# columns are already boolean (True -> 'True' -> True).
for flag_column in ('Licensed', 'official_video'):
    flag_text = df[flag_column].astype(str).str.strip().str.lower()
    df[flag_column] = flag_text.eq('true')
df.dtypes

Artist               object
Track                object
Album                object
Album_type           object
Danceability        float64
Energy              float64
Loudness            float64
Speechiness         float64
Acousticness        float64
Instrumentalness    float64
Liveness            float64
Valence             float64
Tempo               float64
Duration_min        float64
Title                object
Channel              object
Views               float64
Likes               float64
Comments            float64
Licensed               bool
official_video         bool
Stream              float64
EnergyLiveness      float64
most_playedon        object
dtype: object

## Tratamiento de valores faltantes

In [6]:
# Count the missing (NaN) entries in each column.
missing_per_column = df.isna().sum()
missing_per_column

Artist              0
Track               0
Album               0
Album_type          0
Danceability        0
Energy              0
Loudness            0
Speechiness         0
Acousticness        0
Instrumentalness    0
Liveness            0
Valence             0
Tempo               0
Duration_min        0
Title               0
Channel             0
Views               0
Likes               0
Comments            0
Licensed            0
official_video      0
Stream              0
EnergyLiveness      2
most_playedon       0
dtype: int64

In [9]:
# Show the rows whose 'EnergyLiveness' value is missing.
energy_liveness_missing = df['EnergyLiveness'].isna()
df[energy_liveness_missing]

Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,Title,Channel,Views,Likes,Comments,Licensed,official_video,Stream,EnergyLiveness,most_playedon
11824,Natasha Bedingfield,These Words,Unwritten,album,0.0,0.0,0.0,0.0,0.0,0.0,...,Natasha Bedingfield - These Words (Official Vi...,NBedingfieldVEVO,21655597.0,165220.0,7780.0,True,True,110442210.0,,Spotify
13773,White Noise for Babies,Rain in the Early Morning,Soothing Rain for Background Sounds and Natura...,album,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0.0,True,True,145339552.0,,Spotify


In [None]:
# Discard every row that contains at least one missing value.
# axis=0 / how='any' are the dropna defaults, spelled out for clarity.
df = df.dropna(axis=0, how='any')

Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,Title,Channel,Views,Likes,Comments,Licensed,official_video,Stream,EnergyLiveness,most_playedon
0,Gorillaz,Feel Good Inc.,Demon Days,album,0.818,0.705,-6.679,0.1770,0.008360,0.002330,...,Gorillaz - Feel Good Inc. (Official Video),Gorillaz,693555221.0,6220896.0,169907.0,True,True,1.040235e+09,1.150082,Spotify
1,Gorillaz,Rhinestone Eyes,Plastic Beach,album,0.676,0.703,-5.815,0.0302,0.086900,0.000687,...,Gorillaz - Rhinestone Eyes [Storyboard Film] (...,Gorillaz,72011645.0,1079128.0,31003.0,True,True,3.100837e+08,15.183585,Spotify
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,0.695,0.923,-3.930,0.0522,0.042500,0.046900,...,Gorillaz - New Gold ft. Tame Impala & Bootie B...,Gorillaz,8435055.0,282142.0,7399.0,True,True,6.306347e+07,7.956897,Spotify
3,Gorillaz,On Melancholy Hill,Plastic Beach,album,0.689,0.739,-5.810,0.0260,0.000015,0.509000,...,Gorillaz - On Melancholy Hill (Official Video),Gorillaz,211754952.0,1788577.0,55229.0,True,True,4.346636e+08,11.546875,Spotify
4,Gorillaz,Clint Eastwood,Gorillaz,album,0.663,0.694,-8.627,0.1710,0.025300,0.000000,...,Gorillaz - Clint Eastwood (Official Video),Gorillaz,618480958.0,6197318.0,155930.0,True,True,6.172597e+08,9.942693,Youtube
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20589,SICK LEGEND,JUST DANCE HARDSTYLE,JUST DANCE HARDSTYLE,single,0.582,0.926,-6.344,0.0328,0.448000,0.000000,...,JUST DANCE HARDSTYLE,SICK LEGEND - Topic,71678.0,1113.0,0.0,True,True,9.227144e+06,11.036949,Spotify
20590,SICK LEGEND,SET FIRE TO THE RAIN HARDSTYLE,SET FIRE TO THE RAIN HARDSTYLE,single,0.531,0.936,-1.786,0.1370,0.028000,0.000000,...,SET FIRE TO THE RAIN HARDSTYLE,SICK LEGEND - Topic,164741.0,2019.0,0.0,True,True,1.089818e+07,10.140845,Spotify
20591,SICK LEGEND,OUTSIDE HARDSTYLE SPED UP,OUTSIDE HARDSTYLE SPED UP,single,0.443,0.830,-4.679,0.0647,0.024300,0.000000,...,OUTSIDE HARDSTYLE SPED UP,SICK LEGEND - Topic,35646.0,329.0,0.0,True,True,6.226110e+06,5.389610,Spotify
20592,SICK LEGEND,ONLY GIRL HARDSTYLE,ONLY GIRL HARDSTYLE,single,0.417,0.767,-4.004,0.4190,0.356000,0.018400,...,ONLY GIRL HARDSTYLE,SICK LEGEND - Topic,6533.0,88.0,0.0,True,True,6.873961e+06,7.101852,Spotify


## Tratamiento de valores atípicos