# Boston housing price data

## [1] 데이터 불러오기

In [82]:
import pandas as pd
import numpy as np

# Location of the whitespace-delimited text file that is parsed below.
data_url = "http://lib.stat.cmu.edu/datasets/boston"

# Fields are split on runs of whitespace. The pattern is a raw string because
# "\s" in a plain literal is an invalid escape sequence and triggers a warning
# on recent Python versions.
# skiprows=22 skips the leading lines of the file (presumably the descriptive
# header that precedes the numeric rows -- confirm against the file).
boston_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Each record is assembled from two consecutive rows of the parsed file:
# all fields of an even-numbered row followed by the first three fields of the
# odd-numbered row after it.
# drop=True renumbers both halves from 0 so they line up row-by-row in the
# concat below, without adding the old row labels as extra "index" columns
# that would then have to be removed by position.
a = boston_df.iloc[::2, :].reset_index(drop=True)
b = boston_df.iloc[1::2, :3].reset_index(drop=True)

# Join the two halves side by side; ignore_index=True renumbers the resulting
# columns consecutively from 0.
boston_df = pd.concat((a, b), axis=1, ignore_index=True)

In [75]:
# Column names for the assembled data frame, in file order, each with the
# description given for it in the data set's documentation.
columns = [
    "CRIM",     # per capita crime rate by town
    "ZN",       # proportion of residential land zoned for lots over 25,000 sq.ft.
    "INDUS",    # proportion of non-retail business acres per town
    "CHAS",     # Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
    "NOX",      # nitric oxides concentration (parts per 10 million)
    "RM",       # average number of rooms per dwelling
    "AGE",      # proportion of owner-occupied units built prior to 1940
    "DIS",      # weighted distances to five Boston employment centres
    "RAD",      # index of accessibility to radial highways
    "TAX",      # full-value property-tax rate per $10,000
    "PTRATIO",  # pupil-teacher ratio by town
    "B",        # 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
    "LSTAT",    # % lower status of the population
    "MEDV",     # Median value of owner-occupied homes in $1000's
]

In [85]:
# Replace the frame's column labels with the descriptive names in `columns`.
boston_df.columns = columns

In [90]:
# NOTE(review): the regression target is CRIM (crime rate), not MEDV (the
# house-price column) -- confirm this is the intended target.
target = boston_df["CRIM"]

# Select the features by name, like the target, rather than by position, so
# the target cannot end up inside the feature matrix if the column order
# ever changes.
data = boston_df.drop(columns="CRIM")

# Plain NumPy arrays for scikit-learn.
X = data.to_numpy()
y = target.to_numpy()

## [2] linear_regression 사용

In [151]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.linear_model import LinearRegression, Ridge

# Hold out a test set; the fixed seed keeps the split (and the scores printed
# below) reproducible between runs.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=42)

# Standardize the features, then fit an ordinary least-squares model.
linear_steps = [
    ("scaler", StandardScaler()),
    ("linear_regression", LinearRegression()),
]
pipe_linear = Pipeline(steps=linear_steps).fit(train_X, train_y)

# Score the fitted pipeline on the training split and on the held-out split.
train_score, test_score = (
    pipe_linear.score(features, labels)
    for features, labels in ((train_X, train_y), (test_X, test_y))
)

print(train_score, test_score)

0.4425127743864863 0.4523810610223854


## [3] ridge 사용

In [153]:
# Standardize the features, then fit a Ridge regressor with alpha=0.1.
ridge_steps = [
    ("scaler", StandardScaler()),
    ("Ridge", Ridge(alpha=0.1)),
]
pipe_ridge = Pipeline(steps=ridge_steps).fit(train_X, train_y)

# Score the fitted pipeline on the training split and on the held-out split.
train_score, test_score = (
    pipe_ridge.score(features, labels)
    for features, labels in ((train_X, train_y), (test_X, test_y))
)

print(train_score, test_score)

0.44251244006084944 0.45240678868028816


In [155]:
# Compare the fitted coefficients across several alpha values.
def _scaled_ridge(alpha):
    """Return an unfitted scaler + Ridge pipeline using the given alpha."""
    return Pipeline(
        [("scaler", StandardScaler()), ("Ridge", Ridge(alpha=alpha))]
    )


pipe_ridge_1, pipe_ridge_2, pipe_ridge_3 = (
    _scaled_ridge(alpha) for alpha in (0.1, 1, 10)
)

# Fit every variant on the same training split so the coefficients are
# directly comparable.
for _ridge_pipeline in (pipe_ridge_1, pipe_ridge_2, pipe_ridge_3):
    _ridge_pipeline.fit(train_X, train_y)

# Bare expression kept last so a notebook still echoes the final pipeline.
pipe_ridge_3

Pipeline(steps=[('scaler', StandardScaler()), ('Ridge', Ridge(alpha=10))])

In [160]:
# Coefficients of the Ridge step fitted with alpha=0.1.
pipe_ridge_1.named_steps["Ridge"].coef_

array([ 0.94319593, -0.13435683, -0.25263938, -1.37058934,  0.41960503,
        0.19004106, -2.11290447,  5.16508069, -0.433922  , -0.73096995,
        0.02681296,  0.37549664, -2.07478295])

In [162]:
# Coefficients of the Ridge step fitted with alpha=1.
pipe_ridge_2.named_steps["Ridge"].coef_

array([ 0.92534332, -0.1532928 , -0.25203058, -1.3358872 ,  0.41504691,
        0.18951162, -2.07713989,  5.06873882, -0.34561684, -0.71092946,
        0.01927789,  0.38972566, -2.03839   ])

In [163]:
# Coefficients of the Ridge step fitted with alpha=10.
pipe_ridge_3.named_steps["Ridge"].coef_

array([ 0.79299492, -0.26388195, -0.25011443, -1.05116507,  0.37382597,
        0.18694564, -1.78451991,  4.35818216,  0.26621402, -0.54692191,
       -0.04245037,  0.48992803, -1.75130563])