# Configuração

Instala as libs que serão utilizadas na análise dos dados

In [1]:
!pip install --upgrade feather-format -q
!pip install --upgrade pandas -q
!pip install --upgrade seaborn -q
!pip install --upgrade spotipy -q

[K    100% |████████████████████████████████| 12.5MB 2.9MB/s 
[?25h  Building wheel for feather-format (setup.py) ... [?25ldone
[K    100% |████████████████████████████████| 10.1MB 3.7MB/s 
[31mpymc3 3.6 has requirement joblib<0.13.0, but you'll have joblib 0.13.2 which is incompatible.[0m
[K    100% |████████████████████████████████| 215kB 25.4MB/s 
[?25h  Building wheel for spotipy (setup.py) ... [?25ldone
[?25h

# Extração

### Importações de bibliotecas iniciais

In [0]:
import requests
from requests.exceptions import BaseHTTPError
import io
import pandas as pd
import datetime
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials #To access authorised Spotify data

### Função que recebe o link para recuperar os dados e retorna um data frame

Foi criado o tipo pois para o chart a primeira linha era o cabeçalho, porém para o viral não. Dessa forma a fim de utilizar a mesma função para ambos os casos foi criado o parâmetro que recebe 1 para chart e 2 para viral.

In [0]:
def pandazize(date, tipo='chart'):
  # verifica se é tipo = 1 (chart)
  if tipo == 'chart':
    link = 'https://spotifycharts.com/regional/br/daily/' + str(date)[:10] + '/download'
    header = 1
  elif tipo == 'viral':
    link = 'https://spotifycharts.com/viral/br/daily/' + str(date)[:10] + '/download'
    header = 0
  else:
    raise ValueError(tipo)
  response = requests.get(link)
  if response.ok:
    file_object = io.StringIO(response.content.decode('utf-8'))
    return pd.read_csv(file_object, header=header)
  else:
    return pd.DataFrame()#@RUI NÃO MEXE NESSA PORRA
  #raise BaseHTTPError(response)

### Gera as datas que serão utilizadas para recuperar os dados
As datas que serão utilizadas para recuperar os dados dos charts e dos virais são geradas abaixo e armazenadas em uma lista (listDate).

In [0]:
startDate = datetime.datetime.strptime("01-01-2017", "%d-%m-%Y")
#endDate = datetime.datetime.strptime("01-02-2017", "%d-%m-%Y")
endDate   = datetime.datetime.today()

# foi retirado -1 do resultado da expressão "int ((endDate - startDate).days)" pois só existe chart para download no spotify até o dia anterior
listDate = [startDate + datetime.timedelta(n) for n in range(int ((endDate - startDate).days)-1)]
listDate[:3]

[datetime.datetime(2017, 1, 1, 0, 0),
 datetime.datetime(2017, 1, 2, 0, 0),
 datetime.datetime(2017, 1, 3, 0, 0)]

### Criação de dois data frames finais com os dados dos charts e virais

Os dois data frames finais com os dados do chart (pandao) e do viral (pandaoV) foram criados a partir da união de data frames diários gerados a cada iteração do loop de data. Foram adicionados para cada um dos dois data frames finais três atributos como forma de enriquecimento, sendo eles: dois de com as datas de referência dos dados (um com o tipo string no formato YYYY-MM-DD e o outro do tipo int no formato YYYYMMDD) e um com o track_id (gerado a partir do atributo de URL). A intenção da criação deste último é para realizar a ligação com a base de dados que contém as features de cada música.

In [0]:
# são criadas as listas que serão utilizadas para armazenar todos os data frames gerados com os dados dos charts e dos virais
spotifyCharts = []
spotifyViral = []

# criação dos data frames finais
pandao = pd.DataFrame()
pandaoV = pd.DataFrame()

# realiza um loop com todas as datas criadas a partir do startDate e endDate
for date in listDate:
  # realiza a requisição de download com concatenando a data no formato esperado
  # data frames criados a partir da função pandazize com os dados dos charts e virais diários de acordo com a data de iteração do loop
  pddia = pandazize(date, tipo='chart')
  pdviral = pandazize(date, tipo='viral')
  
  # adição da data do chart do tipo string no formato YYYY-MM-DD e do typo inteiro no formato YYYYMMDD
  pddia['data_chart'] = str(date)[:10]
  pddia['data_chart_int'] = int(date.strftime('%Y%m%d'))
  
  # adição da data do viral do tipo string no formato YYYY-MM-DD e do typo inteiro no formato YYYYMMDD
  pdviral['data_viral'] = str(date)[:10]
  pdviral['data_viral_int'] = int(date.strftime('%Y%m%d'))
  
  # adição dos data frames diários nas listas de chars e virais
  spotifyCharts.append(pddia)
  spotifyViral.append(pdviral)
  print('processado dia {}'.format(str(date)[:10]))

# para criar um único data frame foi utilizada a função concat do pandas que recebe uma lista de data frames e une tudo em único data frame
# criação de um data frame único a partir dos data frames diários de charts e do atriuto track_id que pega a string a partir do último "/" da URL
pandao = pd.concat(spotifyCharts)
pandao['track_id'] = pandao['URL'].str.slice(start=31, stop=53, step=1)

# criação de um data frame único a partir dos data frames diários de virais e do atriuto track_id que pega a string a partir do último "/" da URL
pandaoV = pd.concat(spotifyViral)
pandaoV['track_id'] = pandaoV['URL'].str.slice(start=31, stop=53, step=1)

###Conexão com Spotipy

In [0]:
client_id = "93c31efae31c4c8ea0e5ceacad67de30"
client_secret = "859960b852c04e03a1c18e34b8428b38"
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) #spotify object to access API

###Criação do DataFrame com Features

In [0]:
listIdTrack = pd.concat([pandao['track_id'], pandaoV['track_id']]).unique()

spotifyFeatures = []

for x in listIdTrack:
  pdtrack = sp.audio_features(x)
  spotifyFeatures.extend(pdtrack)
   
features = pd.DataFrame(spotifyFeatures)

#features.head()

### Inclusão dos data frames finais em arquivos feather
Foram utilizados arquivos de extensão .feather com o intuito de armazenar os dados em estruturas que ocupem menos espaços

In [0]:
pandao.reset_index().to_feather('pandao.feather')
pandaoV.reset_index().to_feather('pandaoV.feather')
features.reset_index().to_feather('features.feather')

### Upload dos arquivos com extensão .feather no google drive
Após a criação dos arquivos com extensão .feather que possuem os dados de chart e viral é realizado o upload dos mesmos no google drive para que sejam utilizados posteriormente, sem a necessidade de todo o processo anterior de importação dos dados seja realizado sempre que o trabalho for iniciado

In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once in a notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create & upload pandao.feather file.
uploaded = drive.CreateFile({'title': 'pandao.feather'})
uploaded.SetContentFile('pandao.feather')
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))

# Create & upload pandaoV.feather file.
uploaded2 = drive.CreateFile({'title': 'pandaoV.feather'})
uploaded2.SetContentFile('pandaoV.feather')
uploaded2.Upload()
print('Uploaded file with ID {}'.format(uploaded2.get('id')))

# Create & upload features.feather file.
uploaded3 = drive.CreateFile({'title': 'features.feather'})
uploaded3.SetContentFile('features.feather')
uploaded3.Upload()
print('Uploaded file with ID {}'.format(uploaded3.get('id')))

[?25l[K    1% |▎                               | 10kB 15.4MB/s eta 0:00:01[K    2% |▋                               | 20kB 3.4MB/s eta 0:00:01[K    3% |█                               | 30kB 4.8MB/s eta 0:00:01[K    4% |█▎                              | 40kB 3.1MB/s eta 0:00:01[K    5% |█▋                              | 51kB 3.7MB/s eta 0:00:01[K    6% |██                              | 61kB 4.4MB/s eta 0:00:01[K    7% |██▎                             | 71kB 5.0MB/s eta 0:00:01[K    8% |██▋                             | 81kB 5.6MB/s eta 0:00:01[K    9% |███                             | 92kB 6.3MB/s eta 0:00:01[K    10% |███▎                            | 102kB 4.8MB/s eta 0:00:01[K    11% |███▋                            | 112kB 4.9MB/s eta 0:00:01[K    12% |████                            | 122kB 6.5MB/s eta 0:00:01[K    13% |████▎                           | 133kB 6.5MB/s eta 0:00:01[K    14% |████▋                           | 143kB 11.6MB/s eta 0:00:01

# Data Loading

Este passo carrega os arquivos feather gerados na etapa de extração, para que a mesma não tenha que ser executada novamente

In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz

charts_file_id = '15wds_Fd6lKaajtuX64fL8GRwBvh9eMzW'
viral_file_id = '1nw1Xkkmt4_sgdJyeLV1HV5oWhiB-z0k4'
features_file_id = '1RkvHxSrpANn_JzZoCw3AKnvTcfS2yYpC'

In [0]:
#define a função que salva localmente o arquivo do drive
def download_feather(file_id,filename):
  download = drive.CreateFile({'id': file_id})
  download.GetContentFile(filename)
  return None
  

In [0]:
#Realiza o download dos arquivos
download_feather(charts_file_id,'charts.feather')
download_feather(viral_file_id,'viral.feather')
download_feather(features_file_id,'features.feather')


In [0]:
#cria os dataframes com os arquivos salvos
import feather

charts = feather.read_dataframe('charts.feather')
viral = feather.read_dataframe('viral.feather')
features = feather.read_dataframe('features.feather')

In [0]:
#geracao manual de um dataframe com as datas referentes aos carnavais
#carnaval 2017: 10/02 - 14/02 (sexta à quarta-feira)
#carnaval 2018: 09/02 - 14/02 (sexta à quarta-feira)
#carnaval 2019: 01/03 - 06/03 (sexta à quarta-feira)
carnaval = pd.Series([20170210,20170211,20170212,20170213,20170214,
                      20180209,20180210,20180211,20180212,20180213,20180214,
                      20190301,20190302,20190303,20190304,20190305,20190306])


# Data Analysis

Iremos agora analisar os dados de entrada

In [0]:
import pandas_profiling
from IPython.display import display, HTML

def generate_profiling(dataframe):
  report = pandas_profiling.ProfileReport(dataframe)
  bootstrap_html = """
  <!doctype html>
    <html lang="en">
    <head>
      <meta charset="utf-8">
      <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.1.0/jquery.min.js"></script> 
      <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js"></script>
     <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.bundle.min.js"></script>
     <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" crossorigin="anonymous">
    </head>
  </html>
  """
  display(HTML(bootstrap_html + report.html))

In [49]:
generate_profiling(features)

0,1
Number of variables,19
Number of observations,5198
Total Missing (%),0.0%
Total size in memory,771.7 KiB
Average record size in memory,152.0 B

0,1
Numeric,13
Categorical,0
Boolean,1
Date,0
Text (Unique),4
Rejected,1
Unsupported,0

0,1
Distinct count,1933
Unique (%),37.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.27116
Minimum,3.63e-06
Maximum,0.995
Zeros (%),0.0%

0,1
Minimum,3.63e-06
5-th percentile,0.00263
Q1,0.045525
Median,0.176
Q3,0.45
95-th percentile,0.817
Maximum,0.995
Range,0.995
Interquartile range,0.40448

0,1
Standard deviation,0.26733
Coef of variation,0.98587
Kurtosis,-0.32346
Mean,0.27116
MAD,0.22453
Skewness,0.91312
Sum,1409.5
Variance,0.071467
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
0.104,19,0.4%,
0.172,16,0.3%,
0.113,16,0.3%,
0.118,15,0.3%,
0.111,14,0.3%,
0.117,14,0.3%,
0.12,13,0.3%,
0.125,12,0.2%,
0.177,12,0.2%,
0.493,12,0.2%,

Value,Count,Frequency (%),Unnamed: 3
3.63e-06,1,0.0%,
4.51e-06,1,0.0%,
4.99e-06,1,0.0%,
5.94e-06,1,0.0%,
6.11e-06,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.985,1,0.0%,
0.986,2,0.0%,
0.987,1,0.0%,
0.994,1,0.0%,
0.995,1,0.0%,

First 3 values
https://api.spotify.com/v1/audio-analysis/71P1...
https://api.spotify.com/v1/audio-analysis/4iZC...
https://api.spotify.com/v1/audio-analysis/4mtH...

Last 3 values
https://api.spotify.com/v1/audio-analysis/08bN...
https://api.spotify.com/v1/audio-analysis/1x5s...
https://api.spotify.com/v1/audio-analysis/3K5S...

Value,Count,Frequency (%),Unnamed: 3
https://api.spotify.com/v1/audio-analysis/000xQL6tZNLJzIrtIgxqSl,1,0.0%,
https://api.spotify.com/v1/audio-analysis/000xYdQfIZ4pDmBGzQalKU,1,0.0%,
https://api.spotify.com/v1/audio-analysis/003eoIwxETJujVWmNFMoZy,1,0.0%,
https://api.spotify.com/v1/audio-analysis/006pMMCuRo2TFjh8sNGSov,1,0.0%,
https://api.spotify.com/v1/audio-analysis/007QSAaELpVxtX5Z6dMn0U,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
https://api.spotify.com/v1/audio-analysis/7zmleW3XZx0uUsL2CkFuDe,1,0.0%,
https://api.spotify.com/v1/audio-analysis/7zoLdBiyyafD9ixMHo0XP3,1,0.0%,
https://api.spotify.com/v1/audio-analysis/7zvGNZu33bC4jZEkXHdDZ6,1,0.0%,
https://api.spotify.com/v1/audio-analysis/7zwINh1X8wEhDCJLlSMjLF,1,0.0%,
https://api.spotify.com/v1/audio-analysis/7zxRMhXxJMQCeDDg0rKAVo,1,0.0%,

0,1
Distinct count,728
Unique (%),14.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.64895
Minimum,0.0861
Maximum,0.981
Zeros (%),0.0%

0,1
Minimum,0.0861
5-th percentile,0.36685
Q1,0.557
Median,0.663
Q3,0.754
95-th percentile,0.88
Maximum,0.981
Range,0.8949
Interquartile range,0.197

0,1
Standard deviation,0.15111
Coef of variation,0.23286
Kurtosis,-0.0046753
Mean,0.64895
MAD,0.12043
Skewness,-0.4363
Sum,3373.2
Variance,0.022835
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
0.631,24,0.5%,
0.671,23,0.4%,
0.636,23,0.4%,
0.673,22,0.4%,
0.654,22,0.4%,
0.592,21,0.4%,
0.669,21,0.4%,
0.618,21,0.4%,
0.596,21,0.4%,
0.714,21,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0.0861,1,0.0%,
0.145,1,0.0%,
0.154,1,0.0%,
0.155,2,0.0%,
0.159,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.968,1,0.0%,
0.971,1,0.0%,
0.974,1,0.0%,
0.98,1,0.0%,
0.981,1,0.0%,

0,1
Distinct count,4522
Unique (%),87.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,211990
Minimum,36393
Maximum,1355938
Zeros (%),0.0%

0,1
Minimum,36393
5-th percentile,144990
Q1,180020
Median,205070
Q3,232760
95-th percentile,303780
Maximum,1355938
Range,1319545
Interquartile range,52747

0,1
Standard deviation,57173
Coef of variation,0.2697
Kurtosis,43.568
Mean,211990
MAD,37637
Skewness,3.5372
Sum,1101933914
Variance,3268800000
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
180000,8,0.2%,
187147,6,0.1%,
199849,6,0.1%,
192000,5,0.1%,
224000,5,0.1%,
210387,5,0.1%,
217846,5,0.1%,
204347,5,0.1%,
194840,4,0.1%,
264038,4,0.1%,

Value,Count,Frequency (%),Unnamed: 3
36393,1,0.0%,
37640,1,0.0%,
39530,1,0.0%,
48720,1,0.0%,
49322,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
630508,1,0.0%,
737705,1,0.0%,
738667,1,0.0%,
952203,1,0.0%,
1355938,1,0.0%,

0,1
Distinct count,812
Unique (%),15.6%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.65787
Minimum,0.00611
Maximum,0.999
Zeros (%),0.0%

0,1
Minimum,0.00611
5-th percentile,0.308
Q1,0.536
Median,0.681
Q3,0.805
95-th percentile,0.929
Maximum,0.999
Range,0.99289
Interquartile range,0.269

0,1
Standard deviation,0.18889
Coef of variation,0.28712
Kurtosis,-0.18342
Mean,0.65787
MAD,0.15403
Skewness,-0.53899
Sum,3419.6
Variance,0.035679
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
0.713,21,0.4%,
0.7,19,0.4%,
0.738,19,0.4%,
0.777,18,0.3%,
0.68,17,0.3%,
0.873,17,0.3%,
0.716,17,0.3%,
0.822,16,0.3%,
0.75,16,0.3%,
0.794,16,0.3%,

Value,Count,Frequency (%),Unnamed: 3
0.00611,1,0.0%,
0.0136,1,0.0%,
0.0217,1,0.0%,
0.0227,1,0.0%,
0.0281,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.993,1,0.0%,
0.995,3,0.1%,
0.996,2,0.0%,
0.998,1,0.0%,
0.999,1,0.0%,

First 3 values
2paiJvhtP3ZZLMxbBtmBA3
0uwaiApk6k7k9POyFjTKeR
0HVv5bEOiI9a0QfgcASXpX

Last 3 values
0bJzyiMBy2hpqhgf4RN20J
0OMt4Qyetdbw2eUEvKBdHH
1Spmq0UazGU6ovMIlGldZX

Value,Count,Frequency (%),Unnamed: 3
000xQL6tZNLJzIrtIgxqSl,1,0.0%,
000xYdQfIZ4pDmBGzQalKU,1,0.0%,
003eoIwxETJujVWmNFMoZy,1,0.0%,
006pMMCuRo2TFjh8sNGSov,1,0.0%,
007QSAaELpVxtX5Z6dMn0U,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
7zmleW3XZx0uUsL2CkFuDe,1,0.0%,
7zoLdBiyyafD9ixMHo0XP3,1,0.0%,
7zvGNZu33bC4jZEkXHdDZ6,1,0.0%,
7zwINh1X8wEhDCJLlSMjLF,1,0.0%,
7zxRMhXxJMQCeDDg0rKAVo,1,0.0%,

0,1
Distinct count,5198
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2598.5
Minimum,0
Maximum,5197
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,259.85
Q1,1299.2
Median,2598.5
Q3,3897.8
95-th percentile,4937.1
Maximum,5197.0
Range,5197.0
Interquartile range,2598.5

0,1
Standard deviation,1500.7
Coef of variation,0.57752
Kurtosis,-1.2
Mean,2598.5
MAD,1299.5
Skewness,0
Sum,13507003
Variance,2252000
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
4691,1,0.0%,
4667,1,0.0%,
2620,1,0.0%,
573,1,0.0%,
4671,1,0.0%,
2624,1,0.0%,
577,1,0.0%,
4675,1,0.0%,
2628,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
5193,1,0.0%,
5194,1,0.0%,
5195,1,0.0%,
5196,1,0.0%,
5197,1,0.0%,

0,1
Distinct count,1698
Unique (%),32.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.022032
Minimum,0
Maximum,0.961
Zeros (%),53.4%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.0
Q3,9.0775e-05
95-th percentile,0.074235
Maximum,0.961
Range,0.961
Interquartile range,9.0775e-05

0,1
Standard deviation,0.11099
Coef of variation,5.0379
Kurtosis,41.931
Mean,0.022032
MAD,0.039757
Skewness,6.3311
Sum,114.52
Variance,0.01232
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,2775,53.4%,
1.68e-06,6,0.1%,
1.3e-05,6,0.1%,
0.00025,5,0.1%,
1.96e-06,5,0.1%,
0.00115,5,0.1%,
2.45e-06,5,0.1%,
1.12e-05,5,0.1%,
0.0107,5,0.1%,
2.55e-06,5,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0.0,2775,53.4%,
1e-06,2,0.0%,
1.01e-06,1,0.0%,
1.03e-06,2,0.0%,
1.04e-06,3,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0.943,1,0.0%,
0.945,1,0.0%,
0.959,1,0.0%,
0.96,1,0.0%,
0.961,1,0.0%,

0,1
Distinct count,12
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,5.3503
Minimum,0
Maximum,11
Zeros (%),11.1%

0,1
Minimum,0
5-th percentile,0
Q1,2
Median,6
Q3,9
95-th percentile,11
Maximum,11
Range,11
Interquartile range,7

0,1
Standard deviation,3.6245
Coef of variation,0.67744
Kurtosis,-1.3104
Mean,5.3503
MAD,3.1784
Skewness,-0.010544
Sum,27811
Variance,13.137
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
1,585,11.3%,
0,578,11.1%,
7,519,10.0%,
11,499,9.6%,
9,490,9.4%,
2,484,9.3%,
6,408,7.8%,
8,408,7.8%,
5,386,7.4%,
4,377,7.3%,

Value,Count,Frequency (%),Unnamed: 3
0,578,11.1%,
1,585,11.3%,
2,484,9.3%,
3,152,2.9%,
4,377,7.3%,

Value,Count,Frequency (%),Unnamed: 3
7,519,10.0%,
8,408,7.8%,
9,490,9.4%,
10,312,6.0%,
11,499,9.6%,

0,1
Distinct count,1152
Unique (%),22.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.21253
Minimum,0.0211
Maximum,0.99
Zeros (%),0.0%

0,1
Minimum,0.0211
5-th percentile,0.0626
Q1,0.0976
Median,0.131
Q3,0.264
95-th percentile,0.67815
Maximum,0.99
Range,0.9689
Interquartile range,0.1664

0,1
Standard deviation,0.19281
Coef of variation,0.90721
Kurtosis,4.4917
Mean,0.21253
MAD,0.13726
Skewness,2.155
Sum,1104.7
Variance,0.037175
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
0.111,59,1.1%,
0.109,55,1.1%,
0.106,55,1.1%,
0.101,53,1.0%,
0.104,53,1.0%,
0.11,52,1.0%,
0.108,50,1.0%,
0.112,50,1.0%,
0.102,49,0.9%,
0.115,45,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0.0211,1,0.0%,
0.0215,1,0.0%,
0.0217,1,0.0%,
0.0219,2,0.0%,
0.0232,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.982,2,0.0%,
0.984,3,0.1%,
0.985,2,0.0%,
0.989,2,0.0%,
0.99,1,0.0%,

0,1
Distinct count,3829
Unique (%),73.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,-6.381
Minimum,-29.25
Maximum,1.102
Zeros (%),0.0%

0,1
Minimum,-29.25
5-th percentile,-11.53
Q1,-7.7168
Median,-5.871
Q3,-4.5113
95-th percentile,-2.7604
Maximum,1.102
Range,30.352
Interquartile range,3.2055

0,1
Standard deviation,2.8818
Coef of variation,-0.45163
Kurtosis,4.6065
Mean,-6.381
MAD,2.1254
Skewness,-1.4511
Sum,-33168
Variance,8.305
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
-4.426,6,0.1%,
-6.405,6,0.1%,
-2.729,6,0.1%,
-4.661,5,0.1%,
-6.323,5,0.1%,
-7.094,5,0.1%,
-4.673,5,0.1%,
-4.218,5,0.1%,
-5.874,5,0.1%,
-5.126,5,0.1%,

Value,Count,Frequency (%),Unnamed: 3
-29.25,1,0.0%,
-27.431,1,0.0%,
-25.652,1,0.0%,
-24.79,1,0.0%,
-24.235,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.55,1,0.0%,
0.612,1,0.0%,
0.706,1,0.0%,
1.001,1,0.0%,
1.102,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.58773

0,1
1,3055
0,2143

Value,Count,Frequency (%),Unnamed: 3
1,3055,58.8%,
0,2143,41.2%,

0,1
Distinct count,1103
Unique (%),21.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.10726
Minimum,0.0221
Maximum,0.928
Zeros (%),0.0%

0,1
Minimum,0.0221
5-th percentile,0.0296
Q1,0.0403
Median,0.0626
Q3,0.129
95-th percentile,0.342
Maximum,0.928
Range,0.9059
Interquartile range,0.0887

0,1
Standard deviation,0.10588
Coef of variation,0.98718
Kurtosis,6.5866
Mean,0.10726
MAD,0.075435
Skewness,2.3107
Sum,557.51
Variance,0.011211
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
0.103,21,0.4%,
0.108,20,0.4%,
0.129,19,0.4%,
0.0376,18,0.3%,
0.105,18,0.3%,
0.0349,17,0.3%,
0.0382,17,0.3%,
0.117,16,0.3%,
0.0303,16,0.3%,
0.112,16,0.3%,

Value,Count,Frequency (%),Unnamed: 3
0.0221,1,0.0%,
0.0232,1,0.0%,
0.0235,2,0.0%,
0.0239,1,0.0%,
0.0242,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.75,2,0.0%,
0.752,1,0.0%,
0.765,2,0.0%,
0.869,1,0.0%,
0.928,1,0.0%,

0,1
Distinct count,4455
Unique (%),85.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,122.89
Minimum,46.718
Maximum,212.12
Zeros (%),0.0%

0,1
Minimum,46.718
5-th percentile,80.134
Q1,99.999
Median,122.74
Q3,139.99
95-th percentile,175.7
Maximum,212.12
Range,165.4
Interquartile range,39.995

0,1
Standard deviation,28.137
Coef of variation,0.22897
Kurtosis,-0.30198
Mean,122.89
MAD,22.544
Skewness,0.37153
Sum,638760
Variance,791.68
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
119.993,6,0.1%,
129.961,6,0.1%,
129.957,5,0.1%,
124.008,5,0.1%,
110.001,5,0.1%,
100.048,5,0.1%,
124.982,5,0.1%,
120.019,4,0.1%,
119.958,4,0.1%,
79.938,4,0.1%,

Value,Count,Frequency (%),Unnamed: 3
46.718,1,0.0%,
48.702,1,0.0%,
50.7,1,0.0%,
50.986,1,0.0%,
51.444,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
207.573,1,0.0%,
211.6,1,0.0%,
211.974,1,0.0%,
212.058,1,0.0%,
212.117,1,0.0%,

0,1
Distinct count,4
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,3.9708
Minimum,1
Maximum,5
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,4
Q1,4
Median,4
Q3,4
95-th percentile,4
Maximum,5
Range,4
Interquartile range,0

0,1
Standard deviation,0.29016
Coef of variation,0.073075
Kurtosis,33.033
Mean,3.9708
MAD,0.096659
Skewness,-3.3526
Sum,20640
Variance,0.084194
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
4,4860,93.5%,
3,219,4.2%,
5,106,2.0%,
1,13,0.3%,

Value,Count,Frequency (%),Unnamed: 3
1,13,0.3%,
3,219,4.2%,
4,4860,93.5%,
5,106,2.0%,

Value,Count,Frequency (%),Unnamed: 3
1,13,0.3%,
3,219,4.2%,
4,4860,93.5%,
5,106,2.0%,

First 3 values
https://api.spotify.com/v1/tracks/5HRzqOUfdVsH...
https://api.spotify.com/v1/tracks/6tE4uNiF4gV7...
https://api.spotify.com/v1/tracks/5OOkp4U9P9oL...

Last 3 values
https://api.spotify.com/v1/tracks/0NYI7lXYvyEh...
https://api.spotify.com/v1/tracks/2QCEGIgwUPUw...
https://api.spotify.com/v1/tracks/3KGCF68zDut3...

Value,Count,Frequency (%),Unnamed: 3
https://api.spotify.com/v1/tracks/000xQL6tZNLJzIrtIgxqSl,1,0.0%,
https://api.spotify.com/v1/tracks/000xYdQfIZ4pDmBGzQalKU,1,0.0%,
https://api.spotify.com/v1/tracks/003eoIwxETJujVWmNFMoZy,1,0.0%,
https://api.spotify.com/v1/tracks/006pMMCuRo2TFjh8sNGSov,1,0.0%,
https://api.spotify.com/v1/tracks/007QSAaELpVxtX5Z6dMn0U,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
https://api.spotify.com/v1/tracks/7zmleW3XZx0uUsL2CkFuDe,1,0.0%,
https://api.spotify.com/v1/tracks/7zoLdBiyyafD9ixMHo0XP3,1,0.0%,
https://api.spotify.com/v1/tracks/7zvGNZu33bC4jZEkXHdDZ6,1,0.0%,
https://api.spotify.com/v1/tracks/7zwINh1X8wEhDCJLlSMjLF,1,0.0%,
https://api.spotify.com/v1/tracks/7zxRMhXxJMQCeDDg0rKAVo,1,0.0%,

0,1
Constant value,audio_features

First 3 values
spotify:track:61jrCPFpYqai9BlRaIPGHy
spotify:track:5RjvAk5BHecqAE2l44Btmy
spotify:track:7KtJ0irpRBvCo120daRCNK

Last 3 values
spotify:track:5VBinuoFThsLOallDS1h2V
spotify:track:6SUxrwVN19Kmd9JfIbjJUj
spotify:track:1gmarFWgSwb4SWlmqDjWka

Value,Count,Frequency (%),Unnamed: 3
spotify:track:000xQL6tZNLJzIrtIgxqSl,1,0.0%,
spotify:track:000xYdQfIZ4pDmBGzQalKU,1,0.0%,
spotify:track:003eoIwxETJujVWmNFMoZy,1,0.0%,
spotify:track:006pMMCuRo2TFjh8sNGSov,1,0.0%,
spotify:track:007QSAaELpVxtX5Z6dMn0U,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
spotify:track:7zmleW3XZx0uUsL2CkFuDe,1,0.0%,
spotify:track:7zoLdBiyyafD9ixMHo0XP3,1,0.0%,
spotify:track:7zvGNZu33bC4jZEkXHdDZ6,1,0.0%,
spotify:track:7zwINh1X8wEhDCJLlSMjLF,1,0.0%,
spotify:track:7zxRMhXxJMQCeDDg0rKAVo,1,0.0%,

0,1
Distinct count,991
Unique (%),19.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.50699
Minimum,0.0342
Maximum,0.977
Zeros (%),0.0%

0,1
Minimum,0.0342
5-th percentile,0.137
Q1,0.324
Median,0.505
Q3,0.69
95-th percentile,0.89
Maximum,0.977
Range,0.9428
Interquartile range,0.366

0,1
Standard deviation,0.23247
Coef of variation,0.45852
Kurtosis,-0.93058
Mean,0.50699
MAD,0.19586
Skewness,0.024255
Sum,2635.4
Variance,0.054041
Memory size,40.7 KiB

Value,Count,Frequency (%),Unnamed: 3
0.458,18,0.3%,
0.661,16,0.3%,
0.42,15,0.3%,
0.505,15,0.3%,
0.305,15,0.3%,
0.669,15,0.3%,
0.541,14,0.3%,
0.648,14,0.3%,
0.406,14,0.3%,
0.486,14,0.3%,

Value,Count,Frequency (%),Unnamed: 3
0.0342,1,0.0%,
0.0353,1,0.0%,
0.0368,1,0.0%,
0.037,1,0.0%,
0.0371,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.969,2,0.0%,
0.97,1,0.0%,
0.971,3,0.1%,
0.975,1,0.0%,
0.977,1,0.0%,

Unnamed: 0,index,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0,0.456,https://api.spotify.com/v1/audio-analysis/0EPx...,0.931,144039,0.604,0EPxmvsG1BY5td4aTOkWBF,0.0,0,0.0488,-1.41,0,0.0573,129.926,4,https://api.spotify.com/v1/tracks/0EPxmvsG1BY5...,audio_features,spotify:track:0EPxmvsG1BY5td4aTOkWBF,0.864
1,1,0.586,https://api.spotify.com/v1/audio-analysis/1a5Y...,0.789,192846,0.442,1a5Yu5L18qNxVhXx38njON,0.00366,11,0.0927,-7.844,1,0.0421,121.971,4,https://api.spotify.com/v1/tracks/1a5Yu5L18qNx...,audio_features,spotify:track:1a5Yu5L18qNxVhXx38njON,0.45
2,2,0.482,https://api.spotify.com/v1/audio-analysis/2wFn...,0.632,174653,0.95,2wFnRPjuqwNySFiK0Qfg8L,0.0,7,0.84,-2.85,0,0.111,154.006,4,https://api.spotify.com/v1/tracks/2wFnRPjuqwNy...,audio_features,spotify:track:2wFnRPjuqwNySFiK0Qfg8L,0.596
3,3,0.424,https://api.spotify.com/v1/audio-analysis/45wq...,0.855,174663,0.601,45wqPwIH4A5cr2IyyD4WKf,0.0,6,0.241,-5.902,0,0.0346,124.016,4,https://api.spotify.com/v1/tracks/45wqPwIH4A5c...,audio_features,spotify:track:45wqPwIH4A5cr2IyyD4WKf,0.776
4,4,0.0916,https://api.spotify.com/v1/audio-analysis/6zCq...,0.647,183627,0.93,6zCqeUUqLb2CapBmVwAHll,0.0,8,0.401,-3.147,0,0.0636,133.966,4,https://api.spotify.com/v1/tracks/6zCqeUUqLb2C...,audio_features,spotify:track:6zCqeUUqLb2CapBmVwAHll,0.848


In [50]:
charts.columns

Index(['index', 'Artist', 'Position', 'Streams', 'Track Name', 'URL',
       'data_chart', 'data_chart_int', 'track_id'],
      dtype='object')

In [51]:
features.columns

Index(['index', 'acousticness', 'analysis_url', 'danceability', 'duration_ms',
       'energy', 'id', 'instrumentalness', 'key', 'liveness', 'loudness',
       'mode', 'speechiness', 'tempo', 'time_signature', 'track_href', 'type',
       'uri', 'valence'],
      dtype='object')

In [0]:
#Quais foram as musicas mais tocadas durante todos os carnavais?
hits_carnaval = pd.merge(charts,features,left_on='track_id',right_on='id')
hits_carnaval = hits_carnaval[hits_carnaval.data_chart_int.isin(carnaval)]
hits_carnaval = hits_carnaval.groupby('Track Name')['Streams'].sum()


In [63]:
hits_carnaval.shape

(422,)

In [62]:
hits_carnaval.sort_values(ascending=False)

Track Name
Vai malandra (feat. Tropkillaz & DJ Yuri Martins)                                 3741744.0
Envolvimento                                                                      3165621.0
Ta Tum Tum                                                                        2956198.0
Shape of You                                                                      2507543.0
Agora Vai Sentar                                                                  2490849.0
Vidinha de Balada - Ao Vivo                                                       2242244.0
Hear Me Now                                                                       2198989.0
Apelido Carinhoso                                                                 2173332.0
Amar Amei                                                                         2166756.0
Olha a Explosão                                                                   2151833.0
Rabiola                                                              

# Feature Engineering

# Pipeline ML