## Etapas do projeto:
1. Definição do Problema de Negócio e Objetivo do projeto
2. Coleta e preparação dos dados
3. Análise Exploratória de Dados
4. Pré-processamento de Dados
5. Modelagem e Avaliação do Modelo
6. Interpretação dos Resultados
7. Deploy do Modelo
8. Aplicação do Modelo

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

In [3]:
# Load the CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file exists.
# NOTE(review): the preview output shows 'Ã·' in an album name, which looks like
# UTF-8 text decoded as ISO-8859-1 - confirm the file's real encoding.
caminho_csv = 'C:/Users/guipi/Desktop/tcc codigo/mostpopularsongsalltime.csv'
dataset = pd.read_csv(caminho_csv, encoding="ISO-8859-1")

In [4]:
# Preview the first rows of the loaded data
dataset.head()

Unnamed: 0,Track Name,Artist Name,Album Name,Release Date,Popularity,Duration (ms),Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo
0,Blinding Lights,The Weeknd,After Hours (Deluxe),2020-04-03,49,200046,0.518,0.727,1,-5.947,1,0.0557,0.00153,0.000152,0.0882,0.342,170.962
1,Shape of You,Ed Sheeran,Ã· (Deluxe),2017-03-03,88,233713,0.825,0.652,1,-3.183,0,0.0802,0.581,0.0,0.0931,0.931,95.977
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent (Exten...,2019-11-22,50,182173,0.501,0.405,1,-5.679,1,0.0319,0.751,0.0,0.105,0.446,109.891
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,86,157560,0.755,0.522,2,-4.368,1,0.0575,0.533,0.0,0.0685,0.925,89.96
4,Starboy,The Weeknd,Starboy,2016-11-25,93,230453,0.679,0.587,7,-7.015,1,0.276,0.141,6e-06,0.137,0.486,186.003


In [5]:
# Overview of the column types in the thesis (TCC) dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2001 entries, 0 to 2000
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Track Name        2001 non-null   object 
 1   Artist Name       2001 non-null   object 
 2   Album Name        2001 non-null   object 
 3   Release Date      2001 non-null   object 
 4   Popularity        2001 non-null   int64  
 5   Duration (ms)     2001 non-null   int64  
 6   Danceability      2001 non-null   float64
 7   Energy            2001 non-null   float64
 8   Key               2001 non-null   int64  
 9   Loudness          2001 non-null   float64
 10  Mode              2001 non-null   int64  
 11  Speechiness       2001 non-null   float64
 12  Acousticness      2001 non-null   float64
 13  Instrumentalness  2001 non-null   float64
 14  Liveness          2001 non-null   float64
 15  Valence           2001 non-null   float64
 16  Tempo             2001 non-null   float64


In [6]:
# Parse the release dates (read from the CSV as strings) into datetime values
release_dates = pd.to_datetime(dataset['Release Date'])
dataset['Release Date'] = release_dates

In [7]:
# Re-check the dtypes: 'Release Date' should now be a datetime column
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2001 entries, 0 to 2000
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Track Name        2001 non-null   object        
 1   Artist Name       2001 non-null   object        
 2   Album Name        2001 non-null   object        
 3   Release Date      2001 non-null   datetime64[ns]
 4   Popularity        2001 non-null   int64         
 5   Duration (ms)     2001 non-null   int64         
 6   Danceability      2001 non-null   float64       
 7   Energy            2001 non-null   float64       
 8   Key               2001 non-null   int64         
 9   Loudness          2001 non-null   float64       
 10  Mode              2001 non-null   int64         
 11  Speechiness       2001 non-null   float64       
 12  Acousticness      2001 non-null   float64       
 13  Instrumentalness  2001 non-null   float64       
 14  Liveness          2001 n

In [8]:
# (number of rows, number of columns) of the dataset
dataset.shape

(2001, 17)

In [9]:
# Count the null / missing values in each column
missing = dataset.isna().sum()
missing

Track Name          0
Artist Name         0
Album Name          0
Release Date        0
Popularity          0
Duration (ms)       0
Danceability        0
Energy              0
Key                 0
Loudness            0
Mode                0
Speechiness         0
Acousticness        0
Instrumentalness    0
Liveness            0
Valence             0
Tempo               0
dtype: int64

### 3. Análise Exploratória e Visualização dos Dados

In [None]:
# Descriptive statistics of the dataset
dataset.describe()

In [None]:
# Count, per column, how many entries are exactly zero
zero_counts = dataset.eq(0).sum()
print(zero_counts)

In [None]:
# Distribution of the target variable (popularity), with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(6, 3))
sns.histplot(dataset['Popularity'], kde=True, ax=ax)
ax.set_title('Popularidade', fontsize=14)
plt.show()

In [None]:
# Boxplot of popularity, used to check for outliers
fig, ax = plt.subplots(figsize=(6, 3))
sns.boxplot(x=dataset['Popularity'], ax=ax)
ax.set_title('Popularidade', fontsize=14)
plt.show()

In [None]:
# Correlation matrix
# The frame still contains text columns (track, artist, album) and a datetime
# column at this point. Since pandas 2.0, DataFrame.corr() no longer drops
# non-numeric columns silently and raises on them, so the numeric columns are
# selected explicitly before computing the correlations.
matriz_correlacao = dataset.select_dtypes(include='number').corr()

# Annotated heatmap of the pairwise correlations (two decimal places per cell)
plt.figure(figsize=(10, 8))
sns.heatmap(matriz_correlacao, annot=True, cmap='coolwarm', fmt=".2f")
plt.show()

In [None]:
# Histograms of the dataset columns
dataset.hist(figsize=(12, 9))
plt.show()

In [None]:
# Distribution of the release dates, grouped into 30 bins
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(dataset['Release Date'], bins=30, color='skyblue', edgecolor='black')
ax.set(
    title='Distribuição dos anos de lançamento',
    xlabel='Release Date',
    ylabel='Frequência',
)
plt.show()

### 4. Pré-processamento dos dados

In [None]:
# One-hot encoding of the categorical 'Key' and 'Mode' columns.
# These two lines were commented out, but the column-ordering cell further down
# selects 'Key_0'..'Key_11' and 'Mode_0'/'Mode_1', which only exist after this
# encoding; without it that selection raises KeyError.
# NOTE(review): assumes all 12 keys and both modes occur in the data - confirm,
# since get_dummies only creates columns for values that are present.
dataset = pd.get_dummies(dataset, columns=['Key'], prefix='Key', drop_first=False, dtype='int64')
dataset = pd.get_dummies(dataset, columns=['Mode'], prefix='Mode', drop_first=False, dtype='int64')

In [None]:
# Drop the columns that are not kept as model inputs: the three text columns
# plus the numeric 'Instrumentalness' column.
colunasremovidas = ['Track Name', 'Artist Name', 'Album Name', 'Instrumentalness']
dataset = dataset.drop(columns=colunasremovidas)

In [None]:
# Keep only the release year, stored as int64.
# This needs 'Release Date' to still be a datetime column, so the cell cannot be
# run twice in a row on the same frame.
dataset['Release Date'] = dataset['Release Date'].dt.year.astype('int64')

In [None]:
# List the columns that remain after the preprocessing steps so far
dataset.columns

In [None]:
# Reorder the columns so that every feature comes first and the target
# ('Popularity') is the last column.
# The Key_* / Mode_* names are one-hot columns; they must already exist in
# `dataset`, otherwise the selection below raises KeyError.
colunas_numericas = [
    'Release Date', 'Duration (ms)', 'Danceability', 'Energy', 'Loudness',
    'Speechiness', 'Acousticness', 'Liveness', 'Valence', 'Tempo',
]
colunas_key = [f'Key_{k}' for k in range(12)]
colunas_mode = ['Mode_0', 'Mode_1']
ordemcolunas = colunas_numericas + colunas_key + colunas_mode + ['Popularity']
dataset = dataset.loc[:, ordemcolunas]

In [None]:
# Features are every column except the last one; the target is the last column.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [None]:
# Preview the feature matrix
X.head()

In [None]:
# Preview the target values
y.head()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

### 5. Modelagem e Avaliação do Modelo

In [None]:
# Random Forest with the best hyperparameters found by Grid Search
# (may take 1 to 2 minutes to run).
# 'max_features' is fixed at 1.0, i.e. every feature is considered at each split.
# That is what the former 'auto' option meant for RandomForestRegressor; 'auto'
# was deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
param_grid = {'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0]}

# Fixed seed (same value as the train/test split) so the search result and the
# reported scores are reproducible across runs.
rf_model = RandomForestRegressor(random_state=0)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best hyperparameters found.
# GridSearchCV (refit=True by default) has already refitted the best
# configuration on the whole training set, so that estimator is reused instead
# of training the same model a second time.
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

# Prediction on the held-out test set
y_pred = best_rf_model.predict(X_test)

# Scores
print("R2 Score:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

In [None]:
# Decision Tree
# Shallow tree (max_depth=3) with cost-complexity pruning (ccp_alpha=3).
# random_state is fixed because scikit-learn permutes the features at each split,
# so ties between equally good splits could otherwise be broken differently
# from one run to the next.
decision_tree_model = DecisionTreeRegressor(max_depth=3, ccp_alpha=3, random_state=0)
decision_tree_model.fit(X_train, y_train)

# Prediction on the held-out test set
y_pred = decision_tree_model.predict(X_test)

# Scores
print("R2 Score:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))