In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [2]:
# Load the diabetes dataset; the relative path is resolved against the
# notebook's working directory.
data_df = pd.read_csv('../data/diabetes.csv')
# Preview the first rows.
data_df.head()

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135


One-hot encode the categorical SEX column with `pd.get_dummies`.

In [3]:
# Build one indicator column per SEX value, append them to the raw frame and
# remove the original SEX column in a single expression.
sex_df = pd.get_dummies(data_df['SEX'], prefix='sex')
df = pd.concat((data_df, sex_df), axis=1).drop(columns=['SEX'])
df.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,Y,sex_1,sex_2
0,59,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151,0,1
1,48,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75,1,0
2,72,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141,0,1
3,24,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206,1,0
4,50,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135,1,0


In [4]:
# Summary statistics for AGE (count, mean, spread, min/max and quartiles).
df['AGE'].describe()

count    442.000000
mean      48.518100
std       13.109028
min       19.000000
25%       38.250000
50%       50.000000
75%       59.000000
max       79.000000
Name: AGE, dtype: float64

In [5]:
def age_level(age):
    """Return the age-bracket label for a numeric age.

    Brackets are ten years wide and exclude their upper bound, so an age that
    sits exactly on a boundary falls into the higher bracket (50 -> '50-60').

    Args:
        age: Numeric age; it is compared against each bracket bound with ``<``.

    Returns:
        One of '<30', '30-40', '40-50', '50-60', '60-70' or '70-80'.
        '70-80' is the catch-all: every age of 70 or more (including values
        above 80) receives it.
    """
    # (exclusive upper bound, label), in ascending order. There is
    # deliberately no separate '20-30' bracket: everything below 30 is '<30'.
    brackets = (
        (30, '<30'),
        (40, '30-40'),
        (50, '40-50'),
        (60, '50-60'),
        (70, '60-70'),
    )
    for upper, label in brackets:
        if age < upper:
            return label
    return '70-80'
        
# Label every row with its age bracket.
df['age_label'] = df['AGE'].map(age_level)
df.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,Y,sex_1,sex_2,age_label
0,59,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151,0,1,50-60
1,48,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75,1,0,40-50
2,72,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141,0,1,70-80
3,24,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206,1,0,<30
4,50,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135,1,0,50-60


In [6]:
# Number of rows that fall into each age bracket.
df['age_label'].value_counts()

50-60    125
40-50     97
60-70     90
30-40     73
<30       44
70-80     13
Name: age_label, dtype: int64

In [7]:
# AGE is now represented by age_label: build one indicator column per bracket
# and attach them to the frame with the numeric AGE column removed.
age_df = pd.get_dummies(df['age_label'], prefix='age')
df = pd.concat((df.drop(columns=['AGE']), age_df), axis=1)
df.columns

Index(['BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'Y', 'sex_1', 'sex_2',
       'age_label', 'age_30-40', 'age_40-50', 'age_50-60', 'age_60-70',
       'age_70-80', 'age_<30'],
      dtype='object')

In [13]:
# The text label is redundant once its indicator columns exist.
df = df.drop(columns=['age_label'])
df.columns

Index(['BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'Y', 'sex_1', 'sex_2',
       'age_30-40', 'age_40-50', 'age_50-60', 'age_60-70', 'age_70-80',
       'age_<30'],
      dtype='object')

Scale the numeric feature columns with `MinMaxScaler`.

In [14]:
# Preview the frame before the numeric columns are scaled.
df.head()

Unnamed: 0,BMI,BP,S1,S2,S3,S4,S5,S6,Y,sex_1,sex_2,age_30-40,age_40-50,age_50-60,age_60-70,age_70-80,age_<30
0,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151,0,1,0,0,1,0,0,0
1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75,1,0,0,1,0,0,0,0
2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141,0,1,0,0,0,0,1,0
3,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206,1,0,0,0,0,0,0,1
4,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135,1,0,0,0,1,0,0,0


In [16]:
# Numeric measurement columns to rescale. They are listed by name rather than
# taken as the first eight columns of df, so the selection does not silently
# change if the column order of df changes.
NUM_COLS = ['BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']
NUM_COLS

['BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']

In [19]:
# Rescale each numeric column with MinMaxScaler (default range [0, 1]).
# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so test rows influence the min/max applied to the training
# features. Fit on the training rows only to avoid that leakage.
scaler = MinMaxScaler()
scaled_arr = scaler.fit_transform(df[NUM_COLS])
# Reuse df's index so that a column-wise pd.concat with df lines up row by row
# even if df does not carry the default 0..n-1 index.
scaled_df = pd.DataFrame(scaled_arr, columns=NUM_COLS, index=df.index)
scaled_df.head()

Unnamed: 0,BMI,BP,S1,S2,S3,S4,S5,S6
0,0.582645,0.549296,0.294118,0.256972,0.207792,0.282087,0.562217,0.439394
1,0.14876,0.352113,0.421569,0.306773,0.623377,0.141044,0.222437,0.166667
2,0.516529,0.43662,0.289216,0.258964,0.246753,0.282087,0.496578,0.409091
3,0.301653,0.309859,0.495098,0.447211,0.233766,0.423131,0.572923,0.469697
4,0.206612,0.549296,0.465686,0.417331,0.38961,0.282087,0.362385,0.333333


In [24]:
# Swap the raw numeric columns for their scaled versions. drop() returns a new
# frame here instead of mutating df in place, so this cell and the scaling cell
# that reads df[NUM_COLS] can be re-run without raising a KeyError.
all_df = pd.concat((df.drop(columns=NUM_COLS), scaled_df), axis=1)
all_df.head()

Unnamed: 0,Y,sex_1,sex_2,age_30-40,age_40-50,age_50-60,age_60-70,age_70-80,age_<30,BMI,BP,S1,S2,S3,S4,S5,S6
0,151,0,1,0,0,1,0,0,0,0.582645,0.549296,0.294118,0.256972,0.207792,0.282087,0.562217,0.439394
1,75,1,0,0,1,0,0,0,0,0.14876,0.352113,0.421569,0.306773,0.623377,0.141044,0.222437,0.166667
2,141,0,1,0,0,0,0,1,0,0.516529,0.43662,0.289216,0.258964,0.246753,0.282087,0.496578,0.409091
3,206,1,0,0,0,0,0,0,1,0.301653,0.309859,0.495098,0.447211,0.233766,0.423131,0.572923,0.469697
4,135,1,0,0,0,1,0,0,0,0.206612,0.549296,0.465686,0.417331,0.38961,0.282087,0.362385,0.333333


In [25]:
# Every column except the target 'Y' is a model feature. Selecting by name
# rather than by position keeps this correct if 'Y' is not the first column.
FEAT_COLS = [col for col in all_df.columns if col != 'Y']
FEAT_COLS

['sex_1',
 'sex_2',
 'age_30-40',
 'age_40-50',
 'age_50-60',
 'age_60-70',
 'age_70-80',
 'age_<30',
 'BMI',
 'BP',
 'S1',
 'S2',
 'S3',
 'S4',
 'S5',
 'S6']

In [26]:
# Feature matrix as a NumPy array; the target stays a pandas Series.
X, y = all_df[FEAT_COLS].values, all_df['Y']

In [27]:
# Hold out one fifth of the rows for testing (fixed seed for reproducibility),
# fit an ordinary least-squares model and report its R2 on the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/5, random_state=20
)
lr_model = LinearRegression().fit(X_train, y_train)
r2 = lr_model.score(X_test, y_test)
r2

  linalg.lstsq(X, y)


0.43389544766059696

Baseline: fit the same model on the raw data with no feature preprocessing.

In [28]:
# The frame as loaded from the CSV; none of the preprocessing cells modify it.
data_df.head()

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135


In [31]:
# Baseline: same split settings and model, but on the untouched columns
# (AGE stays numeric and SEX stays a numeric code).
X1 = data_df[['AGE', 'SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']].values
y1 = data_df['Y'].values
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=1/5, random_state=20
)
lr_model1 = LinearRegression().fit(X1_train, y1_train)
r2_raw = lr_model1.score(X1_test, y1_test)
r2_raw

0.4179775463198646

In [35]:
# Relative change in test R2 of the preprocessed model versus the raw
# baseline, expressed as a percentage of the baseline R2.
print('feature processing promoted: {:.2f}%'.format((r2-r2_raw)/r2_raw*100))

feature processing promoted: 3.81%


```python

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import numpy as np


# Feature columns used by the model. process_features() min-max scales the
# numeric columns and one-hot encodes the categorical ones.
NUMERIC_FEAT_COLS = ['AGE', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']
CATEGORY_FEAT_COLS = ['SEX']


def process_features(X_train, X_test):
    """
        Feature preprocessing: one-hot encode the categorical columns and
        min-max scale the numeric columns.

        Both transformers are fitted on the training split only and then
        applied to the test split.

        Args:
            X_train: training features, indexable by CATEGORY_FEAT_COLS and
                NUMERIC_FEAT_COLS (main() passes a pandas DataFrame).
            X_test: test features with the same columns.

        Returns:
            (X_train_proc, X_test_proc): 2-D arrays with the encoded
            categorical columns first, followed by the scaled numeric columns.
    """
    # 1. One-hot encode the categorical features.
    #    The `sparse` argument was renamed to `sparse_output` in scikit-learn
    #    1.2 and the old name was removed in 1.4, so try the new name first and
    #    fall back for older releases. Dense output is required for np.hstack.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    encoded_tr_feat = encoder.fit_transform(X_train[CATEGORY_FEAT_COLS])
    encoded_te_feat = encoder.transform(X_test[CATEGORY_FEAT_COLS])

    # 2. Min-max scale the numeric features (fit on train, apply to test).
    scaler = MinMaxScaler()
    scaled_tr_feat = scaler.fit_transform(X_train[NUMERIC_FEAT_COLS])
    scaled_te_feat = scaler.transform(X_test[NUMERIC_FEAT_COLS])

    # 3. Merge the features column-wise: encoded columns, then scaled columns.
    X_train_proc = np.hstack((encoded_tr_feat, scaled_tr_feat))
    X_test_proc = np.hstack((encoded_te_feat, scaled_te_feat))

    return X_train_proc, X_test_proc


def _fit_and_score(X_train, X_test, y_train, y_test):
    """Fit a LinearRegression on the training split and return its R2 on the test split."""
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)


def main(data_path='./diabetes.csv'):
    """
        Main function: compare a linear regression trained on the raw
        features with one trained on the preprocessed features.

        Args:
            data_path: path of the CSV file to load; it must contain the
                columns in NUMERIC_FEAT_COLS and CATEGORY_FEAT_COLS plus the
                target column 'Y'. Defaults to './diabetes.csv'.

        Prints the test R2 of both models and the relative change between them.
    """
    diabetes_data = pd.read_csv(data_path)

    X = diabetes_data[NUMERIC_FEAT_COLS + CATEGORY_FEAT_COLS]
    y = diabetes_data['Y']

    # Split the dataset (one fifth held out for testing, fixed seed).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/5, random_state=10)

    # Baseline: train and evaluate on the raw features.
    baseline_r2 = _fit_and_score(X_train, X_test, y_train, y_test)
    print('模型的R2值', baseline_r2)

    # Preprocess the features, then train and evaluate again on the same split.
    X_train_proc, X_test_proc = process_features(X_train, X_test)
    processed_r2 = _fit_and_score(X_train_proc, X_test_proc, y_train, y_test)
    print('特征处理后，模型的R2值', processed_r2)

    # Relative change as a percentage of the baseline. abs() keeps the sign
    # meaningful when the baseline R2 is negative (R2 can be below zero on a
    # test set); for a positive baseline the value is unchanged.
    print('模型提升了{:.2f}%'.format((processed_r2 - baseline_r2) / abs(baseline_r2) * 100))


# Run the comparison only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
```