In [1]:
# Importação das bibliotecas
import pandas as pd
from sklearn import preprocessing as pp
from sklearn import linear_model as lm
from sklearn import metrics as mt
import numpy as np

In [2]:
def calculate_metrics(y, yhat):
    '''
        Recebe as targets original e prevista, calcula as métricas e retorna um dicionário
    '''
    mse = round( mt.mean_squared_error( y, yhat ), 3)
    metrics = {'R2': round( mt.r2_score( y, yhat ), 3),
           'MSE': mse,
           'RMSE': round( np.sqrt( mse ), 3),
           'MAE': round( mt.mean_absolute_error( y, yhat ), 3),
           'MAPE': round( mt.mean_absolute_percentage_error( y, yhat ), 3)}   

    return metrics

In [3]:
def best_values(df_scores):
    '''
        Recebe um dataframe com as métricas encontradas para cada parâmetro
        Retorna um dataframe o melhor valor de cada métrica, e qual parâmetro utilizado para encontrá-lo
    '''
    df_best_values = pd.DataFrame(columns=['metric', 'performance', 'degree'])

    list_metrics = ['R2', 'MSE', 'RMSE', 'MAE', 'MAPE']
    for metric in list_metrics:
        if metric == 'R2':
            max_index = df_scores[metric].idxmax()
        else:
            max_index = df_scores[metric].idxmin()
        best_value = df_scores.loc[max_index, metric]
        best_degree = df_scores.loc[max_index, 'degree']
        df_best_values = df_best_values._append({'metric': metric, 'performance': best_value, 'degree': best_degree}, ignore_index=True)

    return df_best_values
    

In [4]:
# Features selecionadas
features = ['song_duration_ms', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'audio_mode',
       'speechiness', 'tempo', 'time_signature', 'audio_valence']

In [5]:
# Carregamento dos dados de treino
X_train = pd.read_csv( 'X_training.csv' )
y_train = pd.read_csv( 'y_training.csv' )

# Preparação dos dados de treino
X_train = X_train.loc[:, features]
y_train = y_train.values.ravel()

In [6]:
# Carregamento dos dados de validação:
X_val = pd.read_csv( 'X_validation.csv' )
y_val = pd.read_csv( 'y_val.csv' )

# Preparação dos dados de validação
X_val = X_val.loc[:, features]
y_val = y_val.values.ravel()

In [7]:
# Carregamento dos dados de teste
X_test = pd.read_csv( 'X_test.csv' )
y_test = pd.read_csv( 'y_test.csv' )

# Preparação dos dados de teste
X_test = X_test.loc[:, features]
y_test = y_test.values.ravel()

In [8]:
# Treinamento do modelo com os parâmetros padrão do algoritmo

# Transformação polinomial do dataset
poly_features = pp.PolynomialFeatures()
X_train_poly = poly_features.fit_transform(X_train)
    
# Treinamento do modelo com os parâmetros padrão do algoritmo
model = lm.LinearRegression()
model.fit( X_train_poly, y_train )

In [9]:
# Cálculo das métricas para os dados de treinamento com os parametros padrão, a objetivo de comparação
df_scores_train_ini = pd.DataFrame(columns=['R2', 'MSE', 'RMSE', 'MAE', 'MAPE'])

X_train_poly = poly_features.transform(X_train)
yhat_train = model.predict( X_train_poly)
res = calculate_metrics(y_train, yhat_train)
df_scores_train_ini = df_scores_train_ini._append({'R2': res['R2'], 'MSE': res['MSE'], 'RMSE': res['RMSE'],
                                               'MAE': res['MAE'], 'MAPE': res['MAPE']}, ignore_index=True)

df_scores_train_ini

  df_scores_train_ini = df_scores_train_ini._append({'R2': res['R2'], 'MSE': res['MSE'], 'RMSE': res['RMSE'],


Unnamed: 0,R2,MSE,RMSE,MAE,MAPE
0,0.094,432.986,20.808,16.458,8.351


In [10]:
# Cálculo das métricas para os dados de validação com os parametros padrão, a objetivo de comparação
df_scores_train_val = pd.DataFrame(columns=['R2', 'MSE', 'RMSE', 'MAE', 'MAPE'])

X_val_poly = poly_features.transform(X_val)
yhat_val = model.predict( X_val_poly)
res = calculate_metrics(y_val, yhat_val)
df_scores_train_val = df_scores_train_val._append({'R2': res['R2'], 'MSE': res['MSE'], 'RMSE': res['RMSE'],
                                               'MAE': res['MAE'], 'MAPE': res['MAPE']}, ignore_index=True)

df_scores_train_val

  df_scores_train_val = df_scores_train_val._append({'R2': res['R2'], 'MSE': res['MSE'], 'RMSE': res['RMSE'],


Unnamed: 0,R2,MSE,RMSE,MAE,MAPE
0,0.066,445.768,21.113,16.75,8.548


In [11]:
# Treinar os modelos variandos os parâmetros e armazenar os valores em dataframes

df_scores_params_train = pd.DataFrame(columns=['degree', 'R2', 'MSE', 'RMSE', 'MAE', 'MAPE'])
df_scores_params_val = pd.DataFrame(columns=['degree', 'R2', 'MSE', 'RMSE', 'MAE', 'MAPE'])

best_degree = [i for i in range( 1, 6)]
for i in best_degree:

    # Transformação polinomial do dataset
    poly_features = pp.PolynomialFeatures(degree=i)
    X_train_poly = poly_features.fit_transform(X_train)
    
    # model training
    model = lm.LinearRegression()
    model.fit( X_train_poly, y_train )

    # Coletando as métricas para os dados de treinamento
    X_train_poly = poly_features.transform(X_train)
    yhat_train = model.predict( X_train_poly)
    res = calculate_metrics(y_train, yhat_train)   

    df_scores_params_train = df_scores_params_train._append({'degree': i, 'R2': res['R2'], 'MSE': res['MSE'], 'RMSE': res['RMSE'],
                                                             'MAE': res['MAE'], 'MAPE': res['MAPE']}, ignore_index=True)

  
    # Coletando as métricas para os dados de validação
    X_val_poly = poly_features.transform(X_val) 
    yhat_val = model.predict( X_val_poly)
    res = calculate_metrics(y_val, yhat_val)

    df_scores_params_val = df_scores_params_val._append({'degree': i, 'R2': res['R2'], 'MSE': res['MSE'], 'RMSE': res['RMSE'],
                                                             'MAE': res['MAE'], 'MAPE': res['MAPE']}, ignore_index=True) 

  df_scores_params_train = df_scores_params_train._append({'degree': i, 'R2': res['R2'], 'MSE': res['MSE'], 'RMSE': res['RMSE'],
  df_scores_params_val = df_scores_params_val._append({'degree': i, 'R2': res['R2'], 'MSE': res['MSE'], 'RMSE': res['RMSE'],


In [12]:
# Determinar as melhores performances para os dados de treino e validação
df_best_values_train = best_values(df_scores_params_train)
df_best_values_val = best_values(df_scores_params_val)
df_best_values_val

  df_best_values = df_best_values._append({'metric': metric, 'performance': best_value, 'degree': best_degree}, ignore_index=True)
  df_best_values = df_best_values._append({'metric': metric, 'performance': best_value, 'degree': best_degree}, ignore_index=True)


Unnamed: 0,metric,performance,degree
0,R2,0.066,2.0
1,MSE,445.768,2.0
2,RMSE,21.113,2.0
3,MAE,16.75,2.0
4,MAPE,8.548,2.0


In [13]:
# Concatenar os dados de treino e validação para uso da técnica de validação holdout com os dados de teste
X_trainval = pd.concat([X_train, X_val], ignore_index=True)
y_trainval = np.concatenate((y_train, y_val), axis=None)

In [14]:
# Agora vamos medir a performance de teste individualmente para cada uma das métricas


# R2
best_degree = int(df_best_values_val[df_best_values_val['metric']=='R2']['degree'].item())

# Transformação polinomial do dataset
poly_features = pp.PolynomialFeatures(degree=best_degree)
X_trainval_poly = poly_features.fit_transform(X_trainval)

# Performance de teste:
model = lm.LinearRegression()
model.fit( X_trainval_poly, y_trainval )

X_test_poly = poly_features.transform(X_test) 
yhat_test = model.predict( X_test_poly)
res = calculate_metrics(y_test, yhat_test)
R2_test = res['R2']

In [15]:
# MSE
best_max_depth = int(df_best_values_val[df_best_values_val['metric']=='MSE']['degree'].item())

# Transformação polinomial do dataset
poly_features = pp.PolynomialFeatures(degree=best_degree)
X_trainval_poly = poly_features.fit_transform(X_trainval)

# Performance de teste:
model = lm.LinearRegression()
model.fit( X_trainval_poly, y_trainval )

X_test_poly = poly_features.transform(X_test) 
yhat_test = model.predict( X_test_poly)
res = calculate_metrics(y_test, yhat_test)
MSE_test = res['MSE']

In [16]:
# RMSE
best_max_depth = int(df_best_values_val[df_best_values_val['metric']=='RMSE']['degree'].item())

# Transformação polinomial do dataset
poly_features = pp.PolynomialFeatures(degree=best_degree)
X_trainval_poly = poly_features.fit_transform(X_trainval)

# Performance de teste:
model = lm.LinearRegression()
model.fit( X_trainval_poly, y_trainval )

X_test_poly = poly_features.transform(X_test) 
yhat_test = model.predict( X_test_poly)
res = calculate_metrics(y_test, yhat_test)
RMSE_test = res['RMSE']

In [17]:
# MAE
best_max_depth = int(df_best_values_val[df_best_values_val['metric']=='MAE']['degree'].item())

# Transformação polinomial do dataset
poly_features = pp.PolynomialFeatures(degree=best_degree)
X_trainval_poly = poly_features.fit_transform(X_trainval)

# Performance de teste:
model = lm.LinearRegression()
model.fit( X_trainval_poly, y_trainval )

X_test_poly = poly_features.transform(X_test) 
yhat_test = model.predict( X_test_poly)
res = calculate_metrics(y_test, yhat_test)
MAE_test = res['MAE']

In [18]:
# MAPE
best_max_depth = int(df_best_values_val[df_best_values_val['metric']=='MAPE']['degree'].item())

# Transformação polinomial do dataset
poly_features = pp.PolynomialFeatures(degree=best_degree)
X_trainval_poly = poly_features.fit_transform(X_trainval)

# Performance de teste:
model = lm.LinearRegression()
model.fit( X_trainval_poly, y_trainval )

X_test_poly = poly_features.transform(X_test) 
yhat_test = model.predict( X_test_poly)
res = calculate_metrics(y_test, yhat_test)
MAPE_test = res['MAPE']

In [19]:
# Construindo um dataframe com os valores finais de performance para facilitar a visualização

R2_train = df_best_values_train[df_best_values_train['metric'] == 'R2']['performance'].item()
MSE_train = df_best_values_train[df_best_values_train['metric'] == 'MSE']['performance'].item()
RMSE_train = df_best_values_train[df_best_values_train['metric'] == 'RMSE']['performance'].item()
MAE_train = df_best_values_train[df_best_values_train['metric'] == 'MAE']['performance'].item()
MAPE_train = df_best_values_train[df_best_values_train['metric'] == 'MAPE']['performance'].item()

R2_val = df_best_values_val[df_best_values_val['metric'] == 'R2']['performance'].item()
MSE_val = df_best_values_val[df_best_values_val['metric'] == 'MSE']['performance'].item()
RMSE_val = df_best_values_val[df_best_values_val['metric'] == 'RMSE']['performance'].item()
MAE_val = df_best_values_val[df_best_values_val['metric'] == 'MAE']['performance'].item()
MAPE_val = df_best_values_val[df_best_values_val['metric'] == 'MAPE']['performance'].item()


df_scores_final = pd.DataFrame(columns=['dataset','R2', 'MSE', 'RMSE', 'MAE', 'MAPE'])

df_scores_final = df_scores_final._append({'dataset': 'Treinamento', 'R2': R2_train, 'MSE': MSE_train,
                                          'RMSE': RMSE_train, 'MAE': MAE_train, 'MAPE': MAPE_train }, ignore_index=True) 

df_scores_final = df_scores_final._append({'dataset': 'Validação', 'R2': R2_val, 'MSE': MSE_val,
                                          'RMSE': RMSE_val, 'MAE': MAE_val, 'MAPE': MAPE_val}, ignore_index=True) 

df_scores_final = df_scores_final._append({'dataset': 'Teste', 'R2': R2_test, 'MSE': MSE_test,
                                          'RMSE': RMSE_test, 'MAE': MAE_test, 'MAPE': MAPE_test}, ignore_index=True) 


df_scores_final

  df_scores_final = df_scores_final._append({'dataset': 'Treinamento', 'R2': R2_train, 'MSE': MSE_train,


Unnamed: 0,dataset,R2,MSE,RMSE,MAE,MAPE
0,Treinamento,0.724,132.07,11.492,7.301,2.241
1,Validação,0.066,445.768,21.113,16.75,8.548
2,Teste,0.091,442.641,21.039,16.736,8.277
