In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error,accuracy_score
from sklearn.model_selection import GridSearchCV
import tensorflow as tf 
seed = 2021 
np.random.seed(seed)
tf.random.set_seed(seed)

In [2]:
df = pd.read_csv('reg2/y_staff.csv')
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   alone_household  1728 non-null   int64  
 1   alone_hh_ratio   1728 non-null   float64
 2   silver_hall      1728 non-null   int64  
 3   welfare_facil_w  1728 non-null   float64
 4   jobforpay        1728 non-null   float64
 5   stress           1728 non-null   float64
 6   melancholy       1728 non-null   float64
 7   poor_60          1728 non-null   float64
 8   ratio_old        1728 non-null   float64
 9   infra            1728 non-null   int64  
 10  welfare_staff    1728 non-null   float64
dtypes: float64(8), int64(3)
memory usage: 148.6 KB


Unnamed: 0,alone_household,alone_hh_ratio,silver_hall,welfare_facil_w,jobforpay,stress,melancholy,poor_60,ratio_old,infra,welfare_staff
0,12202,9.262183,515,0.004881,15053.853600,26.0,11.5,23155.860,12.6,438,0.999506
1,12202,9.262183,515,0.004881,15100.353360,26.0,11.5,23227.386,12.6,168,0.999506
2,12202,9.262183,515,0.004881,15154.307280,26.0,11.5,23310.378,12.7,132,0.999506
3,12202,9.262183,515,0.004881,15204.711600,26.0,11.5,23387.910,12.7,508,0.999506
4,12202,9.262183,515,0.004881,15256.890720,26.0,11.5,23468.172,12.7,136,0.999506
...,...,...,...,...,...,...,...,...,...,...,...
1723,5057,20.844153,525,0.005512,6884.335752,9.8,1.5,8754.984,39.2,77,0.999772
1724,5057,20.844153,525,0.005512,6893.054616,9.8,1.5,8766.072,39.4,72,0.999772
1725,5057,20.844153,525,0.005512,6900.980856,9.8,1.5,8776.152,39.4,67,0.999772
1726,5057,20.844153,525,0.005512,6907.718160,9.8,1.5,8784.720,39.5,98,0.999772


## Random Forest Regression

In [3]:
from sklearn.preprocessing import StandardScaler

std_df = StandardScaler().fit_transform(df[df.columns.difference(['welfare_staff',"alone_hh_ratio"]) ])
std_df = pd.DataFrame(std_df, index=df[df.columns.difference(['welfare_staff',"alone_hh_ratio"])].index, 
columns=df[df.columns.difference(['welfare_staff',"alone_hh_ratio"])].columns)
std_df.head()

Unnamed: 0,alone_household,infra,jobforpay,melancholy,poor_60,ratio_old,silver_hall,stress,welfare_facil_w
0,0.889008,0.646985,0.519054,1.45985,0.700293,-1.117329,0.656856,1.293084,-0.988856
1,0.889008,-0.212244,0.523872,1.45985,0.705735,-1.117329,0.656856,1.293084,-0.988856
2,0.889008,-0.326808,0.529463,1.45985,0.712049,-1.106476,0.656856,1.293084,-0.988856
3,0.889008,0.869748,0.534685,1.45985,0.717947,-1.106476,0.656856,1.293084,-0.988856
4,0.889008,-0.314078,0.540092,1.45985,0.724054,-1.106476,0.656856,1.293084,-0.988856


In [4]:
std_df=pd.concat([std_df,df[['welfare_staff',"alone_hh_ratio"]]],axis=1)
std_df

Unnamed: 0,alone_household,infra,jobforpay,melancholy,poor_60,ratio_old,silver_hall,stress,welfare_facil_w,welfare_staff,alone_hh_ratio
0,0.889008,0.646985,0.519054,1.459850,0.700293,-1.117329,0.656856,1.293084,-0.988856,0.999506,9.262183
1,0.889008,-0.212244,0.523872,1.459850,0.705735,-1.117329,0.656856,1.293084,-0.988856,0.999506,9.262183
2,0.889008,-0.326808,0.529463,1.459850,0.712049,-1.106476,0.656856,1.293084,-0.988856,0.999506,9.262183
3,0.889008,0.869748,0.534685,1.459850,0.717947,-1.106476,0.656856,1.293084,-0.988856,0.999506,9.262183
4,0.889008,-0.314078,0.540092,1.459850,0.724054,-1.106476,0.656856,1.293084,-0.988856,0.999506,9.262183
...,...,...,...,...,...,...,...,...,...,...,...
1723,-0.404958,-0.501836,-0.327426,-1.367186,-0.395305,1.769530,0.716775,-1.450691,0.715974,0.999772,20.844153
1724,-0.404958,-0.517747,-0.326523,-1.367186,-0.394461,1.791235,0.716775,-1.450691,0.715974,0.999772,20.844153
1725,-0.404958,-0.533659,-0.325702,-1.367186,-0.393694,1.791235,0.716775,-1.450691,0.715974,0.999772,20.844153
1726,-0.404958,-0.435007,-0.325003,-1.367186,-0.393042,1.802088,0.716775,-1.450691,0.715974,0.999772,20.844153


In [5]:
X=std_df[std_df.columns.difference(["alone_household","welfare_staff","alone_hh_ratio"])]
Y=std_df["welfare_staff"]

In [6]:
X

Unnamed: 0,infra,jobforpay,melancholy,poor_60,ratio_old,silver_hall,stress,welfare_facil_w
0,0.646985,0.519054,1.459850,0.700293,-1.117329,0.656856,1.293084,-0.988856
1,-0.212244,0.523872,1.459850,0.705735,-1.117329,0.656856,1.293084,-0.988856
2,-0.326808,0.529463,1.459850,0.712049,-1.106476,0.656856,1.293084,-0.988856
3,0.869748,0.534685,1.459850,0.717947,-1.106476,0.656856,1.293084,-0.988856
4,-0.314078,0.540092,1.459850,0.724054,-1.106476,0.656856,1.293084,-0.988856
...,...,...,...,...,...,...,...,...
1723,-0.501836,-0.327426,-1.367186,-0.395305,1.769530,0.716775,-1.450691,0.715974
1724,-0.517747,-0.326523,-1.367186,-0.394461,1.791235,0.716775,-1.450691,0.715974
1725,-0.533659,-0.325702,-1.367186,-0.393694,1.791235,0.716775,-1.450691,0.715974
1726,-0.435007,-0.325003,-1.367186,-0.393042,1.802088,0.716775,-1.450691,0.715974


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=seed
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1209, 8), (519, 8), (1209,), (519,))

In [8]:
params = { 'n_estimators' : [10, 50, 100],
           'max_depth' : [6, 8, 10, 12],
           'min_samples_leaf' : [8, 12, 16, 18],
           'min_samples_split' : [8, 16, 20]
            }

In [9]:
rf = RandomForestRegressor(random_state=0, n_jobs=-1)
grid = GridSearchCV(rf, param_grid=params, cv=2, n_jobs=-1)
grid.fit(X_train,y_train)

GridSearchCV(cv=2, estimator=RandomForestRegressor(n_jobs=-1, random_state=0),
             n_jobs=-1,
             param_grid={'max_depth': [6, 8, 10, 12],
                         'min_samples_leaf': [8, 12, 16, 18],
                         'min_samples_split': [8, 16, 20],
                         'n_estimators': [10, 50, 100]})

In [10]:
print(grid.best_score_)
print(grid.best_params_)

0.9778880961917631
{'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}


In [11]:
rf= RandomForestRegressor(n_estimators=100, max_depth=12, min_samples_leaf=8,
                            min_samples_split=8, random_state=seed)

In [12]:
rf.fit(X_train,y_train)

y_pred_staff = rf.predict(X_test)

In [13]:
mse = mean_squared_error(y_test, y_pred_staff)
rmse = np.sqrt(mse)
r2_points = r2_score(y_test, y_pred_staff)
print("RMSE:", rmse)
print("R^2:", r2_points)

RMSE: 5.766476894866039e-05
R^2: 0.9966009530030758


## Decision Tree Regressor

In [14]:
dtr = DecisionTreeRegressor()
param_grid = {'criterion':['mse'], 'max_depth':[None,2,3,4,5,6,8,10,20]}

In [15]:
grid = GridSearchCV(dtr, param_grid=param_grid)

grid.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'criterion': ['mse'],
                         'max_depth': [None, 2, 3, 4, 5, 6, 8, 10, 20]})

In [16]:
print(grid.best_score_)
print(grid.best_params_)

0.9970343453223464
{'criterion': 'mse', 'max_depth': 20}


In [17]:
dtr = DecisionTreeRegressor(criterion='mse', max_depth=20)
dtr.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=20)

In [22]:
dt_pred_staff = dtr.predict(X_test)

In [23]:
mse = mean_squared_error(y_test, dt_pred_staff)
rmse = np.sqrt(mse)
r2_points = r2_score(y_test, dt_pred_staff)
print("RMSE:", rmse)
print("R^2:", r2_points)

RMSE: 3.192519338537347e-05
R^2: 0.9989581549557668
