# 共享单车数据集上的特征工程

## 导入必要的工具包

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

## 读取数据

In [3]:
# Load the bike-sharing records; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='data/day.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


## 对类别型特征进行独热编码

In [8]:
# Categorical features to one-hot encode.
categorical_features = ['season', 'mnth', 'weathersit', 'weekday']

# Without an explicit `columns=` argument, get_dummies only encodes columns
# with object, string or category dtype, so cast these columns to object first.
df = df.astype({col: 'object' for col in categorical_features})

# Keep the categorical columns on their own so they can be recombined later.
df_cat_X = df[categorical_features]

# One-hot encode. The dtype is pinned to uint8 because pandas >= 2.0 defaults
# to bool, which would turn the 0/1 indicator columns into True/False.
df_cat_X = pd.get_dummies(df_cat_X, dtype='uint8')

df_cat_X.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,weathersit_1,weathersit_2,weathersit_3,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
2,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


## 对数值型特征进行去量纲处理
用sklearn的preprocessing模块进行预处理

In [6]:
# The data may already have been preprocessed (values all lie within 0-1);
# MinMaxScaler is applied once more here anyway.
from sklearn.preprocessing import MinMaxScaler

# Numerical features to rescale.
numerical_features = ['temp', 'atemp', 'hum', 'windspeed']

# Fit the scaler on the numerical columns, then rescale those same columns.
scaler = MinMaxScaler()
scaler.fit(df[numerical_features])
arr_num_X = scaler.transform(df[numerical_features])

# Wrap the resulting numpy.ndarray back into a DataFrame, reusing the original
# row index and column names.
df_num_X = pd.DataFrame(
    data=arr_num_X,
    index=df.index,
    columns=numerical_features,
)
df_num_X.head()

Unnamed: 0,temp,atemp,hum,windspeed
0,0.35517,0.373517,0.82862,0.284606
1,0.379232,0.360541,0.715771,0.466215
2,0.171,0.14483,0.449638,0.46574
3,0.17553,0.174649,0.607131,0.284297
4,0.20912,0.197158,0.449313,0.339143


## 合并数据并保存到文件

In [9]:
# Combine only the columns needed as input features X and as output y,
# placed side by side in this order.
df_FE_Xy = pd.concat(
    [
        df[['instant', 'yr']],
        df_cat_X,
        df_num_X,
        df[['holiday', 'workingday', 'cnt']],
    ],
    axis='columns',
)

# Save the engineered table without writing the row index.
df_FE_Xy.to_csv(path_or_buf='data/FE_day.csv', index=False)

df_FE_Xy.head()

Unnamed: 0,instant,yr,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,...,weekday_4,weekday_5,weekday_6,temp,atemp,hum,windspeed,holiday,workingday,cnt
0,1,0,1,0,0,0,1,0,0,0,...,0,0,1,0.35517,0.373517,0.82862,0.284606,0,0,985
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0.379232,0.360541,0.715771,0.466215,0,0,801
2,3,0,1,0,0,0,1,0,0,0,...,0,0,0,0.171,0.14483,0.449638,0.46574,0,1,1349
3,4,0,1,0,0,0,1,0,0,0,...,0,0,0,0.17553,0.174649,0.607131,0.284297,0,1,1562
4,5,0,1,0,0,0,1,0,0,0,...,0,0,0,0.20912,0.197158,0.449313,0.339143,0,1,1600


In [10]:
# Print a summary of the merged frame (columns, non-null counts, dtypes).
df_FE_Xy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 35 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   instant       731 non-null    int64  
 1   yr            731 non-null    int64  
 2   season_1      731 non-null    uint8  
 3   season_2      731 non-null    uint8  
 4   season_3      731 non-null    uint8  
 5   season_4      731 non-null    uint8  
 6   mnth_1        731 non-null    uint8  
 7   mnth_2        731 non-null    uint8  
 8   mnth_3        731 non-null    uint8  
 9   mnth_4        731 non-null    uint8  
 10  mnth_5        731 non-null    uint8  
 11  mnth_6        731 non-null    uint8  
 12  mnth_7        731 non-null    uint8  
 13  mnth_8        731 non-null    uint8  
 14  mnth_9        731 non-null    uint8  
 15  mnth_10       731 non-null    uint8  
 16  mnth_11       731 non-null    uint8  
 17  mnth_12       731 non-null    uint8  
 18  weathersit_1  731 non-null    