## Desafío Miércoles Semana 5 - Regresión desde el aprendizaje de máquinas
###  Gustavo Morales, G10 - 09.Oct.2019

#### Ejercicio 1

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing

In [2]:
df = pd.read_csv("boston.csv")
df = df.drop(columns='Unnamed: 0')

In [3]:
df.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
crim       506 non-null float64
zn         506 non-null float64
indus      506 non-null float64
chas       506 non-null int64
nox        506 non-null float64
rm         506 non-null float64
age        506 non-null float64
dis        506 non-null float64
rad        506 non-null int64
tax        506 non-null int64
ptratio    506 non-null float64
black      506 non-null float64
lstat      506 non-null float64
medv       506 non-null float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


#### Ejercicio 2

In [5]:
X = df.drop(['medv'], axis=1)  # features

In [6]:
y = df['medv']  # target

In [7]:
print(f'Sample size = {y.shape[0]}')

Sample size = 506


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
print(f'Train size = {X_train.shape[0]}')
print(f'Test size = {X_test.shape[0]}')
print(f'Number of features = {X_train.shape[1]}')

Train size = 339
Test size = 167
Number of features = 13


#### Ejercicio 3

In [10]:
X_train_scaled = preprocessing.scale(X_train)
y_train_scaled = preprocessing.scale(y_train)
X_test_scaled = preprocessing.scale(X_test)
y_test_scaled = preprocessing.scale(y_test)

In [11]:
lfit_unnormed = LinearRegression().fit(X_train, y_train)
lfit_normed = LinearRegression().fit(X_train_scaled, y_train_scaled)

In [12]:
# model evaluation for training and testing sets, normalized and unnormalized
y_train_predict_u = lfit_unnormed.predict(X_train)
y_test_predict_u = lfit_unnormed.predict(X_test)
y_train_predict_n = lfit_normed.predict(X_train_scaled)
y_test_predict_n = lfit_normed.predict(X_test_scaled)

#### Ejercicio 4

In [13]:
def report_scores(true, predicted):
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2 = r2_score(true, predicted)
    print(f'RMSE is {rmse}')
    print(f'R2 score is {r2}')

In [14]:
report_scores(y_train, y_train_predict_u)

RMSE is 4.794269062151269
R2 score is 0.7392344370995025


In [15]:
report_scores(y_train_scaled, y_train_predict_n)

RMSE is 0.510652095756492
R2 score is 0.7392344370995027


In [16]:
report_scores(y_test, y_test_predict_u)

RMSE is 4.5523645984630585
R2 score is 0.726157083655248


In [17]:
report_scores(y_test_scaled, y_test_predict_n)

RMSE is 0.5166090326343593
R2 score is 0.7331151074005915


**(R)** El mejor modelo es el normalizado, por que tiene menor RMSE. De hecho es lógico que el $R^2$ no cambie, porque es independiente de la normalización.

#### Ejercicio 5

In [18]:
def fetch_features(dataframe, target='medv'):
    df_c = dataframe.corr()[target].abs().sort_values(ascending=False)
    return df_c[1:].head(6)

In [19]:
fetch_features(df)

lstat      0.737663
rm         0.695360
ptratio    0.507787
indus      0.483725
tax        0.468536
nox        0.427321
Name: medv, dtype: float64

#### Ejercicio 6

In [20]:
yy = df['medv']
XX = df[fetch_features(df).index.tolist()]

In [21]:
XX_train, XX_test, yy_train, yy_test = train_test_split(XX, yy, test_size=0.33, random_state=42)

In [22]:
XX_train_scaled = preprocessing.scale(XX_train)
yy_train_scaled = preprocessing.scale(yy_train)
XX_test_scaled = preprocessing.scale(XX_test)
yy_test_scaled = preprocessing.scale(yy_test)

In [23]:
lfit_normed2 = LinearRegression().fit(XX_train_scaled, yy_train_scaled)

In [24]:
# model evaluation for training and testing sets, normalized
yy_train_predict_n = lfit_normed2.predict(XX_train_scaled)
yy_test_predict_n = lfit_normed2.predict(XX_test_scaled)

In [25]:
report_scores(yy_train_scaled, yy_train_predict_n)

RMSE is 0.5641405337751357
R2 score is 0.6817454581519049


In [26]:
report_scores(yy_test_scaled, yy_test_predict_n)

RMSE is 0.5719847476385094
R2 score is 0.6728334484689107


## **(R)** Vemos que

#### Ejercicio 7