In [39]:
#ライブラリのインポート
import numpy as np
import pandas as pd
import seaborn as sns

In [40]:
pip install japanize-matplotlib



In [41]:
import japanize_matplotlib

In [42]:
#データの読み込み
df = pd.read_csv('/content/california_housing_cleansing.csv')
df = df.drop(columns = ['Unnamed: 0'])
df.head()

Unnamed: 0,所得,築年数,地域人口,緯度,経度,住宅価格,部屋数/人,寝室数/人
0,8.3252,41.0,322.0,37.88,-122.23,4.526,2.732919,0.400621
1,8.3014,21.0,2401.0,37.86,-122.22,3.585,2.956685,0.460641
2,2.0804,42.0,1206.0,37.84,-122.26,2.267,2.118574,0.55141
3,2.125,50.0,697.0,37.85,-122.26,1.4,1.606887,0.406026
4,1.9911,50.0,990.0,37.84,-122.26,1.587,2.261616,0.459596


In [43]:
# 説明変数の定義
X = df.drop(columns=['住宅価格']).to_numpy()
# 目的変数の定義
y = df['住宅価格'].to_numpy()

# 学習データとテストデータに分割
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [44]:
#標準化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

#標準化の変換モデルを利用したスケールの変換
X_train_scaled = scaler.transform(X_train)

#変換後の状態を表示
df_X_train_scaled = pd.DataFrame(X_train_scaled, columns=['所得', '築年数', '地域人口', '緯度', '経度', '部屋数', '寝室数'])
df_X_train_scaled.head()

df_X_train_scaled.describe()

Unnamed: 0,所得,築年数,地域人口,緯度,経度,部屋数,寝室数
count,13000.0,13000.0,13000.0,13000.0,13000.0,13000.0,13000.0
mean,4.533263e-15,7.651999000000001e-17,8.280556000000001e-17,-3.489093e-14,-8.354643e-14,1.134163e-14,-1.848504e-15
std,1.000038,1.000038,1.000038,1.000038,1.000038,1.000038,1.000038
min,-2.038837,-2.26267,-1.268592,-1.41863,-2.420081,-1.850904,-1.851601
25%,-0.7374752,-0.8678601,-0.5701945,-0.7793341,-1.034727,-0.4232155,-0.3998158
50%,-0.1394514,0.003896079,-0.229197,-0.6300096,0.5225325,-0.02454245,-0.1384252
75%,0.5805695,0.7884766,0.2622279,0.9752284,0.7652223,0.2989548,0.185558
max,7.211958,2.096111,29.52823,2.935112,2.615732,37.90064,39.16628


In [45]:
#説明変数のテストデータX_testを変換
X_test_scaled  = scaler.transform(X_test)

df_X_test = pd.DataFrame(X_test, columns=['所得', '築年数', '地域人口', '緯度', '経度', '部屋数', '寝室数'])
df_X_test.head()

df_X_test_scaled = pd.DataFrame(X_test_scaled, columns=['所得', '築年数', '地域人口', '緯度', '経度', '部屋数', '寝室数'])
df_X_test_scaled.head()

#各要素の平均値と標準偏差を確認
df_X_test_scaled.describe()

Unnamed: 0,所得,築年数,地域人口,緯度,経度,部屋数,寝室数
count,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0
mean,-0.005149,0.008292,-0.020433,0.004869,-0.002439,0.030341,0.037607
std,1.005489,0.988514,1.01297,1.007327,1.005098,1.245476,1.404714
min,-2.038837,-2.175494,-1.272908,-1.423296,-2.435249,-1.752933,-1.772656
25%,-0.748863,-0.780684,-0.583144,-0.784,-1.065063,-0.407443,-0.38038
50%,-0.154763,0.003896,-0.253369,-0.63001,0.517476,-0.011944,-0.123381
75%,0.592882,0.788477,0.258559,0.965896,0.765222,0.303824,0.197324
max,7.211958,2.096111,23.385096,2.967777,2.534835,51.258832,64.690468


In [46]:
#予測モデルの学習
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR

class ModelSet:
    def __init__(self):
        self.L = LinearRegression()
        self.R = Ridge()
        self.LS = LinearSVR()

model = ModelSet()

# 学習
model.L.fit(X_train_scaled, y_train)
model.R.fit(X_train_scaled, y_train)
model.LS.fit(X_train_scaled, y_train)

# 各モデルのスコアを計算
score_L = model.L.score(X_test, y_test)
score_R = model.R.score(X_test, y_test)
score_LS = model.LS.score(X_test, y_test)

# 比較して最も良いモデルを選択
if score_L > score_R and score_L > score_LS:
    model_best = model.L
    print("線形回帰が最も精度が良いモデル")
elif score_R > score_L and score_R > score_LS:
    model_best = model.R
    print("リッジ回帰が最も精度が良いモデル")
else:
    model_best = model.LS
    print("サポートベクター回帰が最も精度が良いモデル")

# 絞り込まれたモデルのスコアを確認
print("最も良いモデルのスコア:", model_best.score(X_train_scaled, y_train))


リッジ回帰が最も精度が良いモデル
最も良いモデルのスコア: 0.5967952622069931




In [47]:
#予測モデルの評価
model_best.score(X_train_scaled, y_train)
model_best.score(X_test_scaled, y_test)

0.5752727372946504

In [48]:
#予測
X_new = np.array([[8, 41, 500, 37, -120, 1, 0.2],
                  [2, 10, 2000, 38, -122, 1.5, 0.5],
                  [1, 25, 1000, 38, -121, 2, 1]])

X_new_scaled = scaler.transform(X_new)
X_new_scaled

model_best.predict(X_new_scaled)


#住宅価格を予測する計算式
print(model_best.coef_)
print(model_best.intercept_)


[ 0.63968323  0.07949608  0.01978869 -0.86499759 -0.81076497 -0.21657391
  0.36148728]
1.8977734592306716
