# Label and OneHot Encoding

## 1 导入测试数据

In [1]:
import pandas as pd

In [4]:
data = pd.read_csv('test_data.csv')

In [5]:
data

Unnamed: 0,ID,Country,Population
0,1,Japan,127185332
1,2,U.S,326766748
2,3,India,1354051854
3,4,China,1415045928
4,5,U.S,326766748
5,6,India,1354051854


In [6]:
data.shape

(6, 3)

### 1.1 分割数据集 test file

In [8]:
data_train = pd.read_csv('insurance.csv')

In [9]:
data_train.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [10]:
data_train.shape

(1338, 7)

In [22]:
list1 = [1,2,3,4,5,6,7]
list1[-3:]

[5, 6, 7]

**选取数据集中的后几行作为测试数据**
* df.iloc[-k:]
* df.tail(k)

In [27]:
data_test = data_train.iloc[-137:]

In [29]:
data_test.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1201,46,male,40.375,2,no,northwest,8733.22925
1202,22,male,32.11,0,no,northwest,2055.3249
1203,51,male,32.3,1,no,northeast,9964.06
1204,18,female,27.28,3,yes,southeast,18223.4512
1205,35,male,17.86,1,no,northwest,5116.5004


In [30]:
data_test.to_csv('insuranceTest.csv')

## 2 数据集预处理

**合并数据集并做统一预处理**

In [31]:
mergedata = data_train.append(data_test)

In [32]:
mergedata.shape

(1475, 7)

In [33]:
test_count = len(data_test)

In [34]:
train_count = len(mergedata)-test_count

In [35]:
print("Train count number: ",train_count," Test count number: ",test_count," Total count number: ",train_count+test_count)

Train count number:  1338  Test count number:  137  Total count number:  1475


**处理字符型的列  选择---》字符-->数值**

* 采用pandas的select_dtypes 进行选择

In [41]:
X_copy1 = mergedata.copy()

In [42]:
X_copy1.shape

(1475, 7)

In [43]:
X_copy2 = mergedata.select_dtypes(include=['object'])

In [44]:
X_copy2.head()

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast
3,male,no,northwest
4,male,no,northwest


In [45]:
X_encoder = X_copy2.copy()

In [46]:
X_encoder.shape

(1475, 3)

In [47]:
import numpy as np
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from math import sqrt
from sklearn.metrics import mean_squared_error

## 3 Label Encoder

**LabelEncoder encode labels with a value between 0 and n_classes-1 where n is the number of distinct labels. If a label repeats it assigns the same value to as assigned earlier**

**如果需要用到字符型到model中，就需要通过label Encoder将字符转换为数值**

In [48]:
X_encoder = X_encoder.apply(LabelEncoder().fit_transform)


In [49]:
X_encoder.head()

Unnamed: 0,sex,smoker,region
0,0,1,3
1,1,0,2
2,1,0,2
3,1,0,1
4,1,0,1


In [50]:
mergedata_numercial = mergedata.drop(X_copy2.columns,axis=1)

In [51]:
mergedata_numercial.head()

Unnamed: 0,age,bmi,children,charges
0,19,27.9,0,16884.924
1,18,33.77,1,1725.5523
2,28,33.0,3,4449.462
3,33,22.705,0,21984.47061
4,32,28.88,0,3866.8552


## 4 合并数据并划分训练集和测试集

In [52]:
FinalData = pd.concat([mergedata_numercial,X_encoder],axis=1)

In [53]:
FinalData.head()

Unnamed: 0,age,bmi,children,charges,sex,smoker,region
0,19,27.9,0,16884.924,0,1,3
1,18,33.77,1,1725.5523,1,0,2
2,28,33.0,3,4449.462,1,0,2
3,33,22.705,0,21984.47061,1,0,1
4,32,28.88,0,3866.8552,1,0,1


In [54]:
train_data = FinalData[:train_count]

In [55]:
test_data = FinalData[train_count:]

In [57]:
train_data_y = train_data['charges']

In [58]:
train_data_x = train_data.drop(['charges'],axis=1)

In [59]:
train_data_x.head()

Unnamed: 0,age,bmi,children,sex,smoker,region
0,19,27.9,0,0,1,3
1,18,33.77,1,1,0,2
2,28,33.0,3,1,0,2
3,33,22.705,0,1,0,1
4,32,28.88,0,1,0,1


In [60]:
test_data = test_data.drop(['charges'],axis=1)

In [61]:
X_train,X_test,y_train,y_test = train_test_split(train_data_x,train_data_y,test_size=0.3)

In [62]:
X_test.head()

Unnamed: 0,age,bmi,children,sex,smoker,region
153,42,23.37,0,0,1,0
5,31,25.74,0,0,0,2
167,32,33.155,3,0,0,1
226,28,38.06,0,1,0,2
980,54,25.46,1,1,0,0


## 5 导入模型并进行训练

In [63]:
clf = xgboost.XGBRegressor()
clf.fit(X_train,y_train)

  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [64]:
y_testpred = clf.predict(X_test)

In [66]:
y_pred = clf.predict(test_data)

In [67]:
y_pred

array([10586.381 ,  3226.0432, 10294.449 , 20134.17  ,  5058.389 ,
       19660.812 , 39245.008 , 19832.764 , 16949.72  ,  6268.9546,
        7237.955 ,  1957.9291, 13781.33  ,  5098.109 ,  6028.739 ,
        6533.618 ,  6589.3096, 42698.34  ,  8271.821 ,  6375.398 ,
        8180.7397,  9337.903 , 18248.75  ,  7494.0396,  5355.5396,
        6628.201 ,  7723.7773, 11179.941 , 11518.653 , 49593.19  ,
       17075.68  , 13495.032 , 11223.122 , 10530.978 ,  3460.0693,
       15210.431 , 12230.452 ,  7869.3247,  6053.183 , 47783.76  ,
       48671.355 ,  5009.6826,  4780.2295,  3004.7024,  6320.687 ,
        9279.299 ,  6474.276 ,  3143.4014, 39259.08  , 20831.719 ,
        1344.7002, 19069.875 ,  7463.878 ,  6402.8354,  7452.7275,
       12094.227 , 14104.661 , 14072.281 , 13239.888 ,  4713.409 ,
        5472.204 ,  7247.1567,  8053.16  , 11785.233 , 26818.973 ,
       12256.995 , 35314.965 ,  3520.4294, 10271.51  ,  4464.79  ,
        3625.7922,  7580.8467,  7234.275 , 19345.234 , 11223.1

In [68]:
rms = sqrt(mean_squared_error(y_test,y_testpred))

In [69]:
print("RMSE of Label Encoder:",rms)

RMSE of Label Encoder: 4638.548305424293


## One-Hot-Encoder 处理方式

In [73]:
X_one_hot = X_copy2.copy()

In [74]:
X_one_hot = pd.get_dummies(X_one_hot,columns=['sex','region','smoker'])

In [76]:
X_one_hot.head(10)

Unnamed: 0,sex_female,sex_male,region_northeast,region_northwest,region_southeast,region_southwest,smoker_no,smoker_yes
0,1,0,0,0,0,1,0,1
1,0,1,0,0,1,0,1,0
2,0,1,0,0,1,0,1,0
3,0,1,0,1,0,0,1,0
4,0,1,0,1,0,0,1,0
5,1,0,0,0,1,0,1,0
6,1,0,0,0,1,0,1,0
7,1,0,0,1,0,0,1,0
8,0,1,1,0,0,0,1,0
9,1,0,0,1,0,0,1,0


In [77]:
mergedata_numercial = mergedata.drop(['sex','region','smoker'],axis=1)

**合并数据**

In [78]:
FinalData_hot = pd.concat([mergedata_numercial,X_one_hot],axis=1)

In [79]:
FinalData_hot.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,region_northeast,region_northwest,region_southeast,region_southwest,smoker_no,smoker_yes
0,19,27.9,0,16884.924,1,0,0,0,0,1,0,1
1,18,33.77,1,1725.5523,0,1,0,0,1,0,1,0
2,28,33.0,3,4449.462,0,1,0,0,1,0,1,0
3,33,22.705,0,21984.47061,0,1,0,1,0,0,1,0
4,32,28.88,0,3866.8552,0,1,0,1,0,0,1,0


In [80]:
train_data = FinalData_hot[:train_count]

In [81]:
test_data = FinalData_hot[train_count:]

In [82]:
trainy = train_data['charges']

In [83]:
trainx = train_data.drop(['charges'],axis=1)

In [85]:
test_data = test_data.drop(['charges'],axis=1)

In [86]:
test_data.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,region_northeast,region_northwest,region_southeast,region_southwest,smoker_no,smoker_yes
1201,46,40.375,2,0,1,0,1,0,0,1,0
1202,22,32.11,0,0,1,0,1,0,0,1,0
1203,51,32.3,1,0,1,1,0,0,0,1,0
1204,18,27.28,3,1,0,0,0,1,0,0,1
1205,35,17.86,1,0,1,0,1,0,0,1,0


In [87]:
X_train,X_test,y_train,y_test = train_test_split(trainx,trainy,test_size=0.3)

In [88]:
clf = xgboost.XGBRegressor()
clf.fit(X_train,y_train)

  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [89]:
y_testpred = clf.predict(X_test)
y_pred = clf.predict(test_data)

In [90]:
rms = sqrt(mean_squared_error(y_test,y_testpred))

In [92]:
print("RMSE of One Hot Encoder:",rms)

RMSE of One Hot Encoder: 4897.616109376038
