random forest for the most possible original dataset

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

import scipy

In [3]:
plt.rcParams["figure.figsize"] = (15, 10)

## load data

In [39]:
pth_data = '../data/train.csv'
df_origin = pd.read_csv(pth_data)

In [40]:
df_origin

Unnamed: 0,time,height,weight,gender,bmi,age,env_temp,rh,heart_rate,stress_level,skin_temp,eda,TC,TS,Clo,Act
0,2020-10-19 14:00:00,1.64,54.0,1,20.077335,24,77.990,62.122,,17.0,,,4.0,4.0,0.74,1.0
1,2020-10-19 14:30:00,1.64,54.0,1,20.077335,24,78.208,62.321,,60.0,30.748,0.926320,4.0,4.0,0.74,1.1
2,2020-10-19 15:00:00,1.64,54.0,1,20.077335,24,78.514,62.255,,46.0,31.327,1.232405,4.0,4.0,0.74,1.1
3,2020-10-19 15:30:00,1.64,54.0,1,20.077335,24,79.041,61.491,,91.0,31.458,0.375005,4.0,4.0,0.74,1.0
4,2020-10-19 16:00:00,1.64,54.0,1,20.077335,24,79.435,61.071,,71.0,32.085,0.081127,4.0,4.0,0.74,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2791,2020-10-28 21:30:00,1.63,50.0,1,18.818924,23,76.528,40.530,73.0,22.0,31.850,0.114876,2.0,3.0,0.50,1.0
2792,2020-10-28 22:00:00,1.63,50.0,1,18.818924,23,76.876,39.890,,10.0,32.215,0.146231,2.0,3.0,0.50,1.0
2793,2020-10-28 22:30:00,1.63,50.0,1,18.818924,23,77.050,39.710,,5.0,32.656,0.153838,2.0,3.0,0.50,1.0
2794,2020-10-28 23:00:00,1.63,50.0,1,18.818924,23,76.703,39.980,,0.0,31.902,0.160930,2.0,3.0,0.50,1.0


In [41]:
df_origin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2796 entries, 0 to 2795
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   time          2796 non-null   object 
 1   height        2796 non-null   float64
 2   weight        2796 non-null   float64
 3   gender        2796 non-null   int64  
 4   bmi           2796 non-null   float64
 5   age           2796 non-null   int64  
 6   env_temp      2796 non-null   float64
 7   rh            2796 non-null   float64
 8   heart_rate    1669 non-null   float64
 9   stress_level  1848 non-null   float64
 10  skin_temp     1874 non-null   float64
 11  eda           2396 non-null   float64
 12  TC            2717 non-null   float64
 13  TS            2712 non-null   float64
 14  Clo           2737 non-null   float64
 15  Act           2718 non-null   float64
dtypes: float64(13), int64(2), object(1)
memory usage: 349.6+ KB


## data process

### choose data: TS >= 3.0

In [42]:
df_origin[df_origin[['TS']].isna().TS].shape
# some na in clo, means unrecorded data

(84, 16)

In [43]:
df_record = df_origin[df_origin['TS'].isna() ^ True]

In [44]:
df_record.shape

(2712, 16)

In [45]:
df_comfort = df_record[df_record['TC'] >= 3.0]

In [46]:
df_comfort.shape

(2528, 16)

### shuffle

In [47]:
df_shuffle = df_comfort.sample(frac=1, random_state=1208).reset_index(drop=True)

In [48]:
X_data = df_shuffle[['height', 'weight', 'gender', 'bmi', 'age', 'rh', \
                      'heart_rate', 'stress_level', 'skin_temp', 'eda', \
                      'TC', 'TS', 'Clo', 'Act']]

In [49]:
X_cols_name = X_data.columns

In [50]:
y_data = df_shuffle['env_temp']

### imputation

In [51]:
# checking for missing data
NAs = pd.concat([df_comfort.isnull().sum()], axis=1, keys=['col'])
NAs[NAs.sum(axis=1) > 0]

Unnamed: 0,col
heart_rate,1007
stress_level,830
skin_temp,752
eda,327


In [52]:
imp = SimpleImputer(strategy='mean')

In [53]:
X_data = pd.DataFrame(imp.fit_transform(X_data), columns=X_cols_name)

In [54]:
X_data

Unnamed: 0,height,weight,gender,bmi,age,rh,heart_rate,stress_level,skin_temp,eda,TC,TS,Clo,Act
0,1.60,52.5,1.0,20.507812,24.0,62.985,83.000000,19.500000,32.003449,0.176971,4.0,4.0,0.61,1.1
1,1.77,58.0,-1.0,18.513199,23.0,48.702,79.000000,2.000000,30.933000,0.038822,4.0,4.0,0.36,1.1
2,1.71,64.0,-1.0,21.887076,27.0,47.990,79.333991,33.000000,33.199000,0.475946,4.0,4.0,0.61,1.0
3,1.77,58.0,-1.0,18.513199,23.0,52.445,92.000000,53.000000,30.431000,0.009852,4.0,4.0,0.36,1.0
4,1.77,58.0,-1.0,18.513199,23.0,56.396,79.333991,80.000000,30.087000,0.003597,4.0,4.0,0.36,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2523,1.66,57.0,1.0,20.685150,25.0,52.940,79.333991,2.000000,30.854000,0.008716,4.0,4.0,0.54,1.0
2524,1.69,75.0,-1.0,26.259585,24.0,47.010,79.333991,32.111013,31.300000,0.025626,3.0,4.0,0.36,1.0
2525,1.60,63.0,1.0,24.609375,24.0,45.760,105.000000,32.111013,32.003449,0.013018,4.0,4.0,0.50,1.1
2526,1.74,65.0,-1.0,21.469150,23.0,53.640,79.333991,32.111013,34.150000,0.131368,4.0,7.0,0.54,0.8


### normalize

In [23]:
X_array_normalize, X_norm = normalize(X_data, axis=0, return_norm=True)

In [24]:
X_data = pd.DataFrame(X_array_normalize, columns=X_cols_name)

In [25]:
X_data

Unnamed: 0,height,weight,gender,bmi,age,rh,heart_rate,stress_level,skin_temp,eda,TC,TS,Clo,Act
0,0.018674,0.016305,0.019889,0.018573,0.019347,0.024127,0.020524,0.010107,0.019882,0.000866,0.020182,0.018018,0.022576,0.018588
1,0.020659,0.018013,-0.019889,0.016766,0.018541,0.018656,0.019535,0.001037,0.019217,0.000190,0.020182,0.018018,0.013323,0.018588
2,0.019958,0.019877,-0.019889,0.019822,0.021765,0.018383,0.019618,0.017104,0.020624,0.002329,0.020182,0.018018,0.022576,0.016898
3,0.020659,0.018013,-0.019889,0.016766,0.018541,0.020090,0.022750,0.027470,0.018905,0.000048,0.020182,0.018018,0.013323,0.016898
4,0.020659,0.018013,-0.019889,0.016766,0.018541,0.021603,0.019618,0.041464,0.018691,0.000018,0.020182,0.018018,0.013323,0.016898
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2523,0.019375,0.017703,0.019889,0.018733,0.020153,0.020279,0.019618,0.001037,0.019168,0.000043,0.020182,0.018018,0.019985,0.016898
2524,0.019725,0.023293,-0.019889,0.023782,0.019347,0.018008,0.019618,0.016643,0.019445,0.000125,0.015137,0.018018,0.013323,0.016898
2525,0.018674,0.019566,0.019889,0.022287,0.019347,0.017529,0.025964,0.016643,0.019882,0.000064,0.020182,0.018018,0.018505,0.018588
2526,0.020308,0.020188,-0.019889,0.019443,0.018541,0.020548,0.019618,0.016643,0.021215,0.000643,0.020182,0.031532,0.019985,0.013519


In [26]:
X_norm

array([  85.67889413, 3219.80907819,   50.27922036, 1104.20171125,
       1240.52690418, 2610.52082164, 4044.03947436, 1929.40521588,
       1609.69440491,  204.39639127,  198.19434906,  221.99549545,
         27.02021465,   59.17786748])

### train test split

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=1208)

## random forest

In [56]:
rfr = RandomForestRegressor()

In [57]:
rfr.fit(X_train, y_train)

RandomForestRegressor()

In [58]:
y_pred = rfr.predict(X_test)

In [59]:
# mean squared error
mean_squared_error(y_test, y_pred)

2.8960918317701094

In [60]:
# R2 score
r2_score(y_test, y_pred)

0.6017344110285625