# サンプル

## データの前処理

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the training data into a pandas DataFrame
train_path = 'data/train.csv'
df = pd.read_csv(train_path)

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Check how many rows and columns the frame has: (n_rows, n_columns)
df.shape

In [None]:
# List the column labels
df.columns

In [None]:
# Show the summary statistics produced by DataFrame.describe()
df.describe()

In [None]:
# Check the distribution of SalePrice.
# Title and axis labels are set so the figure can be read on its own;
# the trailing semicolon keeps the cell from echoing the return value of ax.set().
ax = df['SalePrice'].hist()
ax.set(title='Distribution of SalePrice', xlabel='SalePrice', ylabel='Count');

In [None]:
# Scatter plot to eyeball the correlation between two variables
df.plot.scatter(x='YearBuilt', y='SalePrice')

In [None]:
# Count missing values per column and show only the columns that have at least one
null_counts = df.isnull().sum()
null_counts[null_counts >= 1]

In [None]:
# Fill missing LotFrontage values with the column mean.
# NOTE: the fill value is the mean, not the median; use .median() here
# if a median fill is what is actually wanted.
lot_frontage_mean = df["LotFrontage"].mean()
df['LotFrontage'] = df["LotFrontage"].fillna(lot_frontage_mean)

# Alternative: drop the records that have no LotFrontage instead
#df = df.dropna(subset=['LotFrontage'])

In [None]:
# Create dummy (one-hot) variables.
# Missing Alley values are labelled "Other" first so that they end up in
# a dummy column of their own rather than being ignored by get_dummies.
df['Alley'] = df["Alley"].fillna("Other")
df = pd.get_dummies(data=df, columns=["SaleType", "Alley"])

## モデルの構築

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Column the model is trained to predict
target = "SalePrice"

# Columns that must NOT be used as features (includes the target itself)
exclude = [
    'Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
    'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'OverallQual', 'OverallCond', 'YearRemodAdd',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
    'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
    'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
    'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
    'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
    'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
    'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
    'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleCondition',
    'SalePrice',
]

# Every remaining column of df, in its original order, becomes a feature
features = [name for name in df.columns if name not in exclude]

In [None]:
# Feature matrix and target vector taken from the prepared frame
X, y = df[features], df[target]

In [None]:
# Split the data 7:3 into a training part and a held-out evaluation part.
# A fixed random_state makes the split, and therefore the score computed
# from it, reproducible after a kernel restart.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

In [None]:
# Fit a linear regression on the training split
lm = LinearRegression()
lm.fit(X_train, y_train)

# Score it on the held-out split.
# mean_squared_error returns the MSE; the RMSE is its square root.
y_pred = lm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(rmse)

In [None]:
# Table of the fitted coefficients, one row per feature name
coef = pd.DataFrame({"coef": lm.coef_}, index=features)
coef

In [None]:
# Bar chart of the coefficient table
coef.plot.bar()

## 予測結果の出力

In [None]:
# Load the test data
test_path = 'data/test.csv'
test = pd.read_csv(test_path)

In [None]:
# Apply the same preprocessing that was applied to the training frame.
test['LotFrontage'] = test["LotFrontage"].fillna(test["LotFrontage"].mean())

# One-hot encode SaleType and Alley exactly as for train (missing Alley -> "Other").
# Without this step test has no SaleType_* / Alley_* columns holding real values,
# so the model would be fed placeholder values for those features.
# The membership checks keep the cell re-runnable once the raw columns
# have been replaced by their dummy columns.
if "Alley" in test.columns:
    test['Alley'] = test["Alley"].fillna("Other")
columns_to_encode = [name for name in ("SaleType", "Alley") if name in test.columns]
if columns_to_encode:
    test = pd.get_dummies(data=test, columns=columns_to_encode)

In [None]:
# Add the columns that exist in train (df) but are missing from test
dif = [name for name in df.columns if name not in test.columns]
# A zero-row slice of df contributes only the column labels, so the new
# columns arrive empty; every missing value in test is then replaced by 0.
empty_train_columns = df[dif].head(0)
test = pd.concat([test, empty_train_columns], axis=1).fillna(0)

In [None]:
# Select the same feature columns the model was trained on
testX = test[features]

# Predict, and store the result in the SalePrice column
pred = lm.predict(testX)
test['SalePrice'] = pred

# Write Id and the predicted SalePrice to a CSV file
submission = test[['Id','SalePrice']]
submission.to_csv('submission.csv',index=False)