<a href="https://colab.research.google.com/github/dudumlc/ML_Regression/blob/main/ML_Previs%C3%A3o_Pre%C3%A7o_Carros.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## IMPORTAÇÃO DAS BIBLIOTECAS A SEREM USADAS

In [108]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, scale
from ipywidgets import widgets, interact
from IPython.display import display, clear_output

## IMPORTAÇÃO DO DATASET E PRÉ-VISUALIZAÇÃO

In [109]:
df_raw = pd.read_csv('/content/C+¦pia de CarPrice_Assignment.csv')

In [110]:
# Copiando outro df para trabalhar para ter o dataset bruto como backup se necessário
df = df_raw.copy()

In [111]:
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


## DATA CLEANING

In [112]:
# Conferência se há dados nulos inclusos nas colunas
df.isnull().sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [113]:
# Conferência se há linhas duplicadas inclusas
df.duplicated().sum()

0

In [114]:
# Análise dos tipos de dados de cada coluna
df.dtypes

car_ID                int64
symboling             int64
CarName              object
fueltype             object
aspiration           object
doornumber           object
carbody              object
drivewheel           object
enginelocation       object
wheelbase           float64
carlength           float64
carwidth            float64
carheight           float64
curbweight            int64
enginetype           object
cylindernumber       object
enginesize            int64
fuelsystem           object
boreratio           float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
price               float64
dtype: object

In [115]:
# Alterando tipo 'object' para 'category' (mais adequado e aida economiza memória)
object_cols = df.select_dtypes(include='object').columns
df[object_cols] = df[object_cols].astype('category')

In [116]:
# Colunas com tipos de dados alterados
df.dtypes

car_ID                 int64
symboling              int64
CarName             category
fueltype            category
aspiration          category
doornumber          category
carbody             category
drivewheel          category
enginelocation      category
wheelbase            float64
carlength            float64
carwidth             float64
carheight            float64
curbweight             int64
enginetype          category
cylindernumber      category
enginesize             int64
fuelsystem          category
boreratio            float64
stroke               float64
compressionratio     float64
horsepower             int64
peakrpm                int64
citympg                int64
highwaympg             int64
price                float64
dtype: object

In [145]:
# Análise de outliers nas colunas numéricas - TABULARMENTE
def is_outlier(array, extreme=False):
  q1,q3 = np.quantile(array, [.25,.75])
  iqr = q3-q1

  factor = 3. if extreme else 1.5
  upper_outlier = q3 + factor*iqr
  lower_outlier = q1 - factor*iqr

  return (array < lower_outlier) | (array > upper_outlier)

num_cols = df.select_dtypes(include='number').columns
df[num_cols].apply(is_outlier).sum()

car_ID               0
symboling            0
wheelbase            3
carlength            1
carwidth             8
carheight            0
curbweight           0
enginesize          10
boreratio            0
stroke              20
compressionratio    28
horsepower           6
peakrpm              2
citympg              2
highwaympg           3
price               15
dtype: int64

In [136]:
# Análise de outliers nas colunas numéricas - BOXPLOT
@interact(coluna=df.select_dtypes(include='number').columns)
def outlier(coluna):
    plt.figure(figsize=(9,5))
    sns.boxplot(data=df,x=df[coluna])
    plt.show()

interactive(children=(Dropdown(description='coluna', options=('car_ID', 'symboling', 'wheelbase', 'carlength',…

## DATA WRANGLING

In [77]:
# Análise da granularidade de cada coluna categórica para definir a estratégia de transformação dessas categorias

for i in df.select_dtypes(include='category'):
  print("Coluna:",i)
  print("Categorias e quantidades:",df[i].unique())
  print()

Coluna: CarName
Categorias e quantidades: ['alfa-romero giulia', 'alfa-romero stelvio', 'alfa-romero Quadrifoglio', 'audi 100 ls', 'audi 100ls', ..., 'volvo 244dl', 'volvo 245', 'volvo 264gl', 'volvo diesel', 'volvo 246']
Length: 147
Categories (147, object): ['Nissan versa', 'alfa-romero Quadrifoglio', 'alfa-romero giulia',
                           'alfa-romero stelvio', ..., 'volvo 264gl', 'volvo diesel', 'vw dasher',
                           'vw rabbit']

Coluna: fueltype
Categorias e quantidades: ['gas', 'diesel']
Categories (2, object): ['diesel', 'gas']

Coluna: aspiration
Categorias e quantidades: ['std', 'turbo']
Categories (2, object): ['std', 'turbo']

Coluna: doornumber
Categorias e quantidades: ['two', 'four']
Categories (2, object): ['four', 'two']

Coluna: carbody
Categorias e quantidades: ['convertible', 'hatchback', 'sedan', 'wagon', 'hardtop']
Categories (5, object): ['convertible', 'hardtop', 'hatchback', 'sedan', 'wagon']

Coluna: drivewheel
Categorias e quantida

In [78]:
# Coluna doornumber e cylindernumber tratam-se de número
df[['doornumber','cylindernumber']] = df[['doornumber','cylindernumber']].replace({'one':1,
                                                                                   'two':2,
                                                                                   'three':3,
                                                                                   'four':4,
                                                                                   'five':5,
                                                                                   'six':6,
                                                                                   'seven':7,
                                                                                   'eight':8,
                                                                                   'nine':9,
                                                                                   'ten':10})

In [80]:
# Colunas categóricas restantes serão transformadas em dummies
df = pd.get_dummies(df, columns=df.select_dtypes(include='category').columns)

In [81]:
# Análise das escalas de cada coluna para ver a necessidade de normalização dos dados
df.describe().loc[['min','max']]

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,...,cylindernumber_twelve,cylindernumber_2,fuelsystem_1bbl,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
min,1.0,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,205.0,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [86]:
# Normalização dos dados para a diferença de escala entre as colunas não atrapalhe a regressão

df_norm = pd.DataFrame(MinMaxScaler().fit_transform(df), index=df.index, columns=df.columns)

## ANÁLISE EXPLORATÓRIA DOS DADOS

In [89]:
df_norm

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,...,cylindernumber_twelve,cylindernumber_2,fuelsystem_1bbl,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,0.000000,1.0,0.058309,0.413433,0.316667,0.083333,0.411171,0.260377,0.664286,0.290476,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.004902,1.0,0.058309,0.413433,0.316667,0.083333,0.411171,0.260377,0.664286,0.290476,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.009804,0.6,0.230321,0.449254,0.433333,0.383333,0.517843,0.343396,0.100000,0.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.014706,0.8,0.384840,0.529851,0.491667,0.541667,0.329325,0.181132,0.464286,0.633333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.019608,0.8,0.373178,0.529851,0.508333,0.541667,0.518231,0.283019,0.464286,0.633333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,0.980392,0.2,0.655977,0.711940,0.716667,0.641667,0.567882,0.301887,0.885714,0.514286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
201,0.985294,0.2,0.655977,0.711940,0.708333,0.641667,0.605508,0.301887,0.885714,0.514286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
202,0.990196,0.2,0.655977,0.711940,0.716667,0.641667,0.591156,0.422642,0.742857,0.380952,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
203,0.995098,0.2,0.655977,0.711940,0.716667,0.641667,0.670675,0.316981,0.335714,0.633333,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
