# Regresión Lineal, más allá del R^2

In [None]:
### Load relevant packages

import pandas                  as pd
import numpy                   as np
import matplotlib.pyplot       as plt
import seaborn                 as sns
import statsmodels.api         as sm
import statsmodels.formula.api as smf
import scipy
import os

# Render matplotlib figures inline in the notebook output, without
# having to call plt.show() explicitly.
%matplotlib inline

# Use the 'ggplot' style sheet for every figure drawn below.
plt.style.use('ggplot')

# Objetivos

El objetivo de este notebook es ilustrar los elementos a tener en cuenta al momento de realizar un análisis de regresión, entender los estadísticos que arrojan los outputs y aplicar ajustes de acuerdo con su interpretación.

## Carga de los datos

Fuente del dataset: https://www.kaggle.com/nehalbirla/vehicle-dataset-from-cardekho?select=CAR+DETAILS+FROM+CAR+DEKHO.csv


This dataset contains information about used cars listed on www.cardekho.com.
It can be used for many purposes, such as price prediction, to exemplify the use of linear regression in Machine Learning.

The columns in the given dataset are as follows:

- name: This column should be filled with the name of the car.
- year: This column should be filled with the year in which the car was bought.
- selling_price: This column should be filled with the price the owner wants to sell the car at.
- km_driven: This is the distance completed by the car in km.
- fuel: Fuel type of the car.
- seller_type: Defines whether the seller is a dealer or an individual.
- transmission: Defines whether the car is manual or automatic.
- owner: Defines the number of owners the car has previously had.

For used motorcycle datasets please go to https://www.kaggle.com/nehalbirla/motorcycle-dataset

In [None]:
# Alternative file from the same Kaggle source (not used here):
# 'data/CAR DETAILS FROM CAR DEKHO.csv'

# Columns that are read with the pandas 'category' dtype instead of the
# dtype pandas would infer.
categorical_columns = [
    'year',
    'car_name',
    'fuel',
    'seller_type',
    'transmission',
    'owner',
]

Data = pd.read_csv(
    'data/car data.csv',
    dtype=dict.fromkeys(categorical_columns, 'category'),
)

# Preview the first rows of the loaded table.
Data.head()

## Exploración de datos

In [None]:
# Predictors to plot against selling_price, in display order.
varstolook = [
    'year', 'present_price', 'km_driven',
    'fuel', 'seller_type', 'transmission', 'owner',
]

# Subset of the predictors that are drawn as boxplots instead of scatterplots.
cvarstolook = ['year', 'fuel', 'seller_type', 'transmission', 'owner']

plt.figure(figsize=(15, 30))

# One panel per predictor on a 5x3 grid; subplot positions are 1-based.
for position, column in enumerate(varstolook, start=1):
    plt.subplot(5, 3, position)
    if column not in cvarstolook:
        sns.scatterplot(x=column, y='selling_price', data=Data, alpha=0.10, color='b')
        continue
    sns.boxplot(x=column, y='selling_price', data=Data)

In [None]:
# km_driven against selling_price, with points coloured by year.
sns.scatterplot(
    data=Data,
    x='km_driven',
    y='selling_price',
    hue='year',
    alpha=0.50,
)

## Ajustando un modelo de Regresión

In [None]:
# Model 1: selling_price regressed on the seven predictors named in the formula.
formula_m1 = 'selling_price ~ year + present_price + km_driven + fuel + seller_type + transmission + owner'

# Build the OLS specification first, then fit it and print the summary table.
ols_m1 = smf.ols(formula=formula_m1, data=Data)
model_1 = ols_m1.fit()
print(model_1.summary())

### Normalidad de los residuos

**Análisis *inicial* del modelo**

- ¿Qué podemos concluir de las variables usadas?: 
- ¿Qué podemos concluir sobre el ajuste del modelo?: 
- ¿Se podría confiar en este modelo para hacer estimaciones?:
- ¿Se cumplen los supuestos de Regresión?


In [None]:
from scipy import stats

residuals_m1 = model_1.resid

plt.figure(figsize=(20, 10))

# First panel: histogram of the model_1 residuals.
plt.subplot(2, 3, 1)
residuals_m1.hist(bins=100)
plt.title("Histogram for Residuals")

# Second panel: probability plot of the residuals against a normal distribution.
plt.subplot(2, 3, 2)
stats.probplot(x=residuals_m1, dist="norm", plot=plt)
plt.title("QQ Plot for Residuals")

plt.show()

**¿Se cumplen los supuestos de Regresión?**

Respuesta: 

In [None]:
# Add the natural logarithm of selling_price as a new column; it is the
# response variable of the next model.
log_price = np.log(Data['selling_price'])
Data['log_selling_price'] = log_price

# Model 2: same predictors as formula_m1, with the log-transformed response.
formula_m2 = 'log_selling_price ~ year + present_price + km_driven + fuel + seller_type + transmission + owner'

ols_m2 = smf.ols(formula=formula_m2, data=Data)
model_2 = ols_m2.fit()
print(model_2.summary())

In [None]:
from scipy import stats

residuals_m2 = model_2.resid

plt.figure(figsize=(20, 10))

# First panel: histogram of the model_2 residuals.
plt.subplot(2, 3, 1)
residuals_m2.hist(bins=100)
plt.title("Histogram for Residuals")

# Second panel: probability plot of the residuals against a normal distribution.
plt.subplot(2, 3, 2)
stats.probplot(x=residuals_m2, dist="norm", plot=plt)
plt.title("QQ Plot for Residuals")

plt.show()

In [None]:
# Model 3: log_selling_price on year, present_price, fuel, seller_type and owner.
formula_m3 = 'log_selling_price ~ year + present_price + fuel + seller_type + owner'

ols_m3 = smf.ols(formula=formula_m3, data=Data)
model_3 = ols_m3.fit()
print(model_3.summary())

In [None]:
from scipy import stats

residuals_m3 = model_3.resid

plt.figure(figsize=(20, 10))

# First panel: histogram of the model_3 residuals.
plt.subplot(2, 3, 1)
residuals_m3.hist(bins=100)
plt.title("Histogram for Residuals")

# Second panel: probability plot of the residuals against a normal distribution.
plt.subplot(2, 3, 2)
stats.probplot(x=residuals_m3, dist="norm", plot=plt)
plt.title("QQ Plot for Residuals")

plt.show()

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not by itself
# guarantee that `scipy.stats` has been loaded.
from scipy import stats

# Residuals of model_3, and the square root of model_3.scale, which is used
# below as the standard deviation of the reference normal curve.
residuals_m3 = model_3.resid
resid_sd = np.sqrt(model_3.scale)

plt.hist(
    residuals_m3,
    density=True,      # the histogram integrates to 1
                       # (so it can be compared to the normal distribution)
    bins=100,          # draw a histogram with 100 bins of equal width
    label="residuals"  # label for legend
)

# Now plot the normal density (mean 0, std resid_sd) for comparison.
xx = np.linspace(residuals_m3.min(), residuals_m3.max(), num=10000)
plt.plot(xx, stats.norm.pdf(xx, loc=0.0, scale=resid_sd),
         label="normal distribution")

# Mark residuals whose absolute value exceeds 4 * resid_sd as outliers.
outlier_mask = np.abs(residuals_m3) > 4 * resid_sd
sns.rugplot(residuals_m3[outlier_mask],
            color="C5",  # otherwise the color was the same as the histogram
            label="outliers")

# The trailing semicolon stops the notebook from echoing the Legend object.
# It has to sit on the same line as a statement: a line holding only `;`
# is a SyntaxError.
plt.legend(loc="upper left");

### Heterocedasticidad

In [None]:
# Model 4: log_selling_price on present_price alone.
formula_m4 = 'log_selling_price ~ present_price'

ols_m4 = smf.ols(formula=formula_m4, data=Data)
model_4 = ols_m4.fit()
print(model_4.summary())

In [None]:
# Residuals of model_4 against its single regressor, present_price.
sns.scatterplot(
    x=Data.present_price,
    y=model_4.resid,
)

In [None]:
# Model 5: log_selling_price on the logarithm of present_price; the log is
# applied inside the formula through np.log.
formula_m5 = 'log_selling_price ~ np.log(present_price)'

ols_m5 = smf.ols(formula=formula_m5, data=Data)
model_5 = ols_m5.fit()
print(model_5.summary())

In [None]:
# Residuals of model_5 against its own regressor, log(present_price).
# The residuals must come from model_5: plotting model_4.resid here would
# only redraw the model_4 diagnostic on a rescaled x axis.
# x and y are passed as vectors, so no `data=` argument is needed.
sns.scatterplot(x=np.log(Data.present_price), y=model_5.resid)

### Multicolinealidad

In [None]:
# Fetch the "mtcars" dataset from the R "datasets" package through
# statsmodels (cache=True is passed to get_rdataset) and wrap its data
# in a DataFrame.
mtcars_rdataset = sm.datasets.get_rdataset("mtcars", "datasets", cache=True)
mtcars = pd.DataFrame(mtcars_rdataset.data)

# Display the table.
mtcars

In [None]:
# Pairwise correlations between the mtcars columns.
correlations = mtcars.corr()

plt.figure(figsize=(6, 5))

# Annotated heatmap; the colour scale is anchored at -0.8 and 0.8 and each
# cell is labelled with a signed value to one decimal place.
sns.heatmap(
    correlations,
    cmap="RdYlBu",
    annot=True,
    fmt="+.1f",
    square=True,
    vmin=-0.8,
    vmax=0.8,
)
plt.title("Correlations between predictors")

In [None]:
# Same correlation matrix, shown as a table with a 'coolwarm' background gradient.
correlation_table = mtcars.corr()
correlation_table.style.background_gradient(cmap='coolwarm')