In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import numpy as np

Two ways to one-hot encode a categorical feature:

- `pd.get_dummies()`
- `OneHotEncoder()`

In [4]:
# Numeric columns that get rescaled before modelling.
NUMERIC_FEAT_COLS = [
    'sqft_living',
    'sqft_above',
    'sqft_basement',
    'long',
    'lat',
]
# Categorical columns that get one-hot encoded.
CATEGORY_FEAT_COLS = ['waterfront']

In [9]:
# Load only the feature columns and the `price` target from the CSV.
data_df = pd.read_csv(
    '../data/house_data.csv',
    usecols=[*NUMERIC_FEAT_COLS, *CATEGORY_FEAT_COLS, 'price'],
)
data_df.head()

Unnamed: 0,price,sqft_living,waterfront,sqft_above,sqft_basement,lat,long
0,221900.0,1180,0,1180,0,47.5112,-122.257
1,538000.0,2570,0,2170,400,47.721,-122.319
2,180000.0,770,0,770,0,47.7379,-122.233
3,604000.0,1960,0,1050,910,47.5208,-122.393
4,510000.0,1680,0,1680,0,47.6168,-122.045


In [10]:
# Count how many rows fall into each `waterfront` category.
data_df['waterfront'].value_counts()

0    21450
1      163
Name: waterfront, dtype: int64

In [41]:
# One-hot encode the `waterfront` column into indicator columns prefixed `wf_`.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas 2.0 changed the default dummy dtype to bool).
wf_df = pd.get_dummies(data_df['waterfront'], prefix='wf', dtype=int)
wf_df.head()

Unnamed: 0,wf_0,wf_1
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [28]:
# Min-max scale the numeric feature columns.
# The columns are read from `data_df`: the previously referenced `df` is never
# assigned anywhere in this notebook, so the cell raised NameError on a fresh kernel.
# NOTE: the scaler is fitted on the full dataset, i.e. before the train/test
# split performed further down.
scaler = MinMaxScaler()
scaled_arr = scaler.fit_transform(data_df[NUMERIC_FEAT_COLS])
scaled_arr

array([[0.06716981, 0.09758772, 0.        , 0.21760797, 0.57149751],
       [0.17207547, 0.20614035, 0.08298755, 0.16611296, 0.90895931],
       [0.03622642, 0.05263158, 0.        , 0.23754153, 0.93614283],
       ...,
       [0.05509434, 0.08004386, 0.        , 0.18272425, 0.70532411],
       [0.09886792, 0.14364035, 0.        , 0.37375415, 0.60897539],
       [0.05509434, 0.08004386, 0.        , 0.18272425, 0.70484156]])

In [37]:
# Put the scaled array back into a labelled frame, then append the dummy columns.
scaled_df = pd.DataFrame(data=scaled_arr, columns=NUMERIC_FEAT_COLS)
all_df = pd.concat([scaled_df, wf_df], axis='columns')
all_df.head()

Unnamed: 0,sqft_living,sqft_above,sqft_basement,long,lat,wf_0,wf_1
0,0.06717,0.097588,0.0,0.217608,0.571498,1,0
1,0.172075,0.20614,0.082988,0.166113,0.908959,1,0
2,0.036226,0.052632,0.0,0.237542,0.936143,1,0
3,0.126038,0.083333,0.188797,0.104651,0.586939,1,0
4,0.104906,0.152412,0.0,0.393688,0.741354,1,0


In [38]:
# Regression target, taken from the raw frame.
y = data_df.price
# Feature matrix: scaled numeric columns followed by the waterfront dummies.
X = all_df.values

In [39]:
# Hold out one third of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=1/3,
    random_state=10,
)

In [48]:
# Fit on the preprocessed training features, then score on the held-out test rows.
lr_model = LinearRegression().fit(X_train, y_train)
r2_proc = lr_model.score(X_test, y_test)
r2_proc

0.6273258697491765

In [42]:
# Baseline feature matrix: the same columns, without scaling or one-hot encoding.
X1 = data_df[[*NUMERIC_FEAT_COLS, *CATEGORY_FEAT_COLS]].values

In [44]:
# Same test fraction and seed as the split of the preprocessed features.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y,
    test_size=1/3,
    random_state=10,
)

In [49]:
# Fit the baseline model on the unprocessed features and score it on its test rows.
lr_model1 = LinearRegression().fit(X1_train, y1_train)
r2 = lr_model1.score(X1_test, y1_test)
r2

0.6271257979584459

In [54]:
# Relative change of the preprocessed model's score over the baseline, in percent.
(r2_proc - r2) / r2 * 100

0.031902975668020195

In [None]:
```python
"""
    任务：房屋价格预测
"""
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import numpy as np

# Path of the house-price CSV file.
DATA_FILE = './data_ai/house_data.csv'

# Feature columns used by the model.
NUMERIC_FEAT_COLS = [
    'sqft_living',
    'sqft_above',
    'sqft_basement',
    'long',
    'lat',
]
CATEGORY_FEAT_COLS = ['waterfront']


def process_features(X_train, X_test):
    """
        Preprocess the train and test feature frames.

        The columns in CATEGORY_FEAT_COLS are one-hot encoded and the columns in
        NUMERIC_FEAT_COLS are min-max scaled. Both transformers are fitted on
        X_train only and then applied to X_test.

        Parameters:
            X_train: frame indexable by the column lists above (training rows).
            X_test: frame with the same columns (test rows).

        Returns:
            (X_train_proc, X_test_proc): two arrays, each holding the one-hot
            columns followed by the scaled numeric columns.
    """
    # 1. One-hot encode the categorical features.
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword, so try the new name first and fall back for older releases.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    encoded_tr_feat = encoder.fit_transform(X_train[CATEGORY_FEAT_COLS])
    encoded_te_feat = encoder.transform(X_test[CATEGORY_FEAT_COLS])

    # 2. Min-max scale the numeric features.
    scaler = MinMaxScaler()
    scaled_tr_feat = scaler.fit_transform(X_train[NUMERIC_FEAT_COLS])
    scaled_te_feat = scaler.transform(X_test[NUMERIC_FEAT_COLS])

    # 3. Merge the features: encoded columns first, scaled columns after.
    X_train_proc = np.hstack((encoded_tr_feat, scaled_tr_feat))
    X_test_proc = np.hstack((encoded_te_feat, scaled_te_feat))

    return X_train_proc, X_test_proc


def main():
    """
        Entry point: compare linear regression on raw and preprocessed features.

        Reads the feature columns and `price` from DATA_FILE, splits the rows
        into train and test sets, fits one model on the raw features and one on
        the output of `process_features`, and prints both test scores together
        with their relative difference in percent.
    """
    # Imported here because the script's import block never brings
    # train_test_split into scope; run on its own, the script raised NameError.
    from sklearn.model_selection import train_test_split

    house_data = pd.read_csv(DATA_FILE, usecols=NUMERIC_FEAT_COLS + CATEGORY_FEAT_COLS + ['price'])

    X = house_data[NUMERIC_FEAT_COLS + CATEGORY_FEAT_COLS]
    y = house_data['price']

    # Split the dataset: one third of the rows is held out for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=10)

    # Baseline: build, train and evaluate a linear regression on the raw features.
    linear_reg_model = LinearRegression()
    linear_reg_model.fit(X_train, y_train)
    r2_score = linear_reg_model.score(X_test, y_test)
    print('模型的R2值', r2_score)

    # Preprocess the features.
    X_train_proc, X_test_proc = process_features(X_train, X_test)
    # Build, train and evaluate a second model on the preprocessed features.
    linear_reg_model2 = LinearRegression()
    linear_reg_model2.fit(X_train_proc, y_train)
    r2_score2 = linear_reg_model2.score(X_test_proc, y_test)
    print('特征处理后，模型的R2值', r2_score2)

    # Relative change of the second score over the baseline, in percent.
    print('模型提升了{:.2f}%'.format( (r2_score2 - r2_score) / r2_score * 100) )


# Run the experiment only when this file is executed as a script.
if __name__ == '__main__':
    main()
```