In [0]:
# Mount Google Drive at /content/drive so that later cells can read files
# stored there.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Copy the `real-estate` folder from the mounted Drive into the current working
# directory, then list it to confirm the copy.
# NOTE(review): the source path is relative, so this assumes the working
# directory is the parent of the `drive` mount point (/content) — confirm.
!cp -r drive/My\ Drive/real-estate .
!ls real-estate

In [0]:
import os
import glob
import re
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend to emit figures in the 'retina' format.
%config InlineBackend.figure_format = 'retina'
# Register pandas' date/time converters with matplotlib.
pd.plotting.register_matplotlib_converters()

In [0]:
# Collect every file in ./real-estate. glob returns paths in arbitrary
# (filesystem-dependent) order, so sort them: the row order of `price`
# determines which rows a seeded train/test split selects, and must therefore
# be stable from run to run.
filenames = sorted(glob.glob(str(Path() / 'real-estate' / '*')))
if not filenames:
    # Fail early with a clear message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        "No input files found in the 'real-estate' directory."
    )

# Read each workbook into its own frame. The column header sits on row index
# 16, only the listed columns are kept, ',' is treated as a thousands
# separator, and rows with any missing value are dropped.
prices = []
for filename in filenames:
    prices.append(pd.read_excel(
        filename,
        header=16,
        usecols=[
            '시군구',
            '주택유형',
            '도로조건',
            '연면적(㎡)',
            '대지면적(㎡)',
            '계약년월',
            '거래금액(만원)',
            '건축년도',
        ],
        thousands=',',
        dtype={
            '주택유형': 'category',
        }
    ).dropna())

# Stack all files into a single frame with a fresh 0..n-1 index.
price = pd.concat(prices).reset_index(drop=True)

# Rename the Korean source columns to English identifiers. An explicit mapping
# is used instead of positional assignment to `price.columns`: read_excel
# returns columns in sheet order, not in `usecols` order, so a positional
# rename would silently mislabel columns if the sheet layout ever differed.
price = price.rename(columns={
    '시군구': 'si_gun_gu',
    '주택유형': 'housing_type',
    '도로조건': 'distance_to_road',
    '연면적(㎡)': 'total_floor_area',
    '대지면적(㎡)': 'plottage',
    '계약년월': 'date_of_contract',
    '거래금액(만원)': 'price',
    '건축년도': 'construction_year',
})

# Keep only rows with a construction year after 1900 and store it as int.
price = price[price['construction_year'] > 1900].astype(
    {'construction_year': int}
)

# Replace the categorical road-condition labels with numeric values; '-' is
# turned into NaN so the row is removed by the dropna() below.
# NOTE(review): the numbers look like representative widths for each bracket
# (e.g. '12m미만' -> 10.0) — confirm the intended encoding.
price['distance_to_road'] = price['distance_to_road'].replace(
    ['8m미만', '12m미만', '25m미만', '-', '25m이상'],
    [4.0, 10.0, 18.5, np.nan, 50.0],
)
price = price.dropna().reset_index(drop=True)

# Parse the YYYYMM contract month. Cast to int first so that a float-typed
# column (e.g. 201901.0) does not stringify with a trailing '.0' and break
# the '%Y%m' format.
price['date_of_contract'] = pd.to_datetime(
    price['date_of_contract'].astype(int).astype(str),
    format='%Y%m'
)

# Split the address into at most three whitespace-separated parts. reindex
# guarantees exactly three columns even when no address has all three parts,
# so the column assignment below cannot fail; missing parts become ''.
si_gun_gu = (
    price['si_gun_gu']
    .str.split(n=2, expand=True)
    .reindex(columns=range(3))
    .fillna('')
)
si_gun_gu.columns = ['sido', 'sigungu', 'dongli']

# Attach the address parts and drop the original combined address column.
price = pd.concat([price, si_gun_gu], axis=1).drop(columns='si_gun_gu')

In [0]:
# Model inputs: two area measures and the construction year.
features = ['total_floor_area', 'plottage', 'construction_year']

# Boolean row mask: Seoul ('서울특별시') transactions only, with upper bounds
# on price, plot area and total floor area.
index = (
    price['sido'].eq('서울특별시')
    & price['price'].lt(100000)
    & price['plottage'].lt(1000)
    & price['total_floor_area'].lt(250)
)
print('Number of samples:', index.sum())

# Feature matrix and regression target for the selected rows.
data = price.loc[index, features]
target = price.loc[index, 'price']

# Scikit-learn

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# the same on every run (for the same input row order).
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)
print('Number of training set:', y_train.shape[0])
print('Number of test set:', y_test.shape[0])

In [0]:
# Preview the first rows of the training features.
X_train.head()

In [0]:
# Preview the first rows of the training target.
y_train.head()

In [0]:
from sklearn.metrics import mean_squared_error

def rooted_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def prediction_plot(y_true, y_pred):
    """Print the RMSE and draw a predicted-versus-given scatter plot.

    The RMSE is printed with three decimals. The plot uses a square figure,
    semi-transparent point markers, a diagonal reference line (prediction
    equals given value) and identical limits on both axes.

    Parameters
    ----------
    y_true : array-like
        Given (ground-truth) target values, drawn on the x axis.
    y_pred : array-like
        Predicted target values, drawn on the y axis.
    """
    rmse = rooted_mean_squared_error(y_true, y_pred)
    print(f'RMSE: {rmse:.3f}')

    fig, ax = plt.subplots(figsize=plt.figaspect(1))
    ax.plot(y_true, y_pred, '.', alpha=0.2)

    # Use the widest range seen on either axis so both axes share one scale.
    x_low, x_high = ax.get_xlim()
    y_low, y_high = ax.get_ylim()
    lower = min(x_low, y_low)
    upper = max(x_high, y_high)

    # Diagonal reference line across the shared range.
    ax.plot([lower, upper], [lower, upper])

    ax.set_xlim([lower, upper])
    ax.set_ylim([lower, upper])
    ax.set_xlabel('Given')
    ax.set_ylabel('Predicted')

In [0]:
from sklearn.linear_model import LinearRegression

# Baseline: plain linear regression on the unscaled features.
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out test split.
predicted = model.predict(X_test)
prediction_plot(y_test, predicted)

In [0]:
from sklearn.svm import LinearSVR

# Linear support vector regression with a generous iteration budget.
# The solver draws pseudo-random numbers while fitting, so the seed is pinned
# (same value as the train/test split) to make the result reproducible.
model = LinearSVR(max_iter=100_000, random_state=42)

# ==========

# Train on the training split and evaluate on the held-out test split.
model.fit(X_train, y_train)
predicted = model.predict(X_test)

prediction_plot(y_test, predicted)

In [0]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regression with default settings on the raw features.
# fit() returns the estimator itself, so construction and training are chained.
model = KNeighborsRegressor().fit(X_train, y_train)

# Evaluate on the held-out test split.
predicted = model.predict(X_test)
prediction_plot(y_test, predicted)

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor

# The same k-NN regressor, now wrapped as the only step of a Pipeline.
model = Pipeline(steps=[
    ('regressor', KNeighborsRegressor()),
])

# Train the pipeline, then evaluate on the held-out test split.
model.fit(X_train, y_train)
predicted = model.predict(X_test)
prediction_plot(y_test, predicted)

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Standardise the features before the k-NN regressor.
model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', KNeighborsRegressor()),
])

# Train the pipeline, then evaluate on the held-out test split.
model.fit(X_train, y_train)
predicted = model.predict(X_test)
prediction_plot(y_test, predicted)

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Keep the two features with the highest f_regression score, standardise
# them, and feed them to the k-NN regressor.
model = Pipeline(steps=[
    ('selector', SelectKBest(score_func=f_regression, k=2)),
    ('scaler', StandardScaler()),
    ('regressor', KNeighborsRegressor()),
])

# Train the pipeline, then evaluate on the held-out test split.
model.fit(X_train, y_train)
predicted = model.predict(X_test)
prediction_plot(y_test, predicted)

In [0]:
# Inspect the fitted selector step: the per-feature scores, then the boolean
# mask of the features it kept.
print(model.named_steps['selector'].scores_)
print(model.named_steps['selector'].get_support())

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Pipeline to tune: feature selection -> standardisation -> k-NN regression.
pipe = Pipeline(steps=[
    ('selector', SelectKBest(score_func=f_regression, k=2)),
    ('scaler', StandardScaler()),
    ('regressor', KNeighborsRegressor()),
])

# Grid of step parameters, addressed as '<step name>__<parameter>'.
param_grid = dict(
    scaler__with_mean=[True, False],
    scaler__with_std=[True, False],
    regressor__n_neighbors=[3, 5, 10],
)

# Exhaustive search with 5-fold cross-validation. RMSE is an error, so the
# scorer is built with greater_is_better=False; the search still maximises
# the score it is given.
model = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=make_scorer(rooted_mean_squared_error, greater_is_better=False),
    cv=KFold(n_splits=5),
    n_jobs=-1,
)

# Run the search on the training split, then evaluate on the test split.
model.fit(X_train, y_train)
predicted = model.predict(X_test)
prediction_plot(y_test, predicted)

In [0]:
# Parameter combination selected by the grid search.
model.best_params_

In [0]:
# Show the grid search's cross-validation results as a table.
pd.DataFrame(model.cv_results_)