<a href="https://colab.research.google.com/github/danlingzhou16/stat390/blob/Danling/Danling_STAT390_rf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, train_test_split, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression

# stats models
import statsmodels.formula.api as smf
import time as time

In [2]:
df = pd.read_csv('/content/complete_top_10_imputation_rvsd.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,new_deceased,population,population_male,population_female,latitude,longitude,area_sq_km,life_expectancy,mobility_workplaces,...,stay_at_home_requirements,restrictions_on_internal_movement,international_travel_controls,income_support,debt_relief,public_information_campaigns,contact_tracing,testing_policy,facial_coverings,vaccination_policy
0,0,0.0,733391.0,424916.0,391925.0,64.0,-150.0,1717856.0,78.0,-10.333333,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
1,1,0.0,733391.0,424916.0,391925.0,64.0,-150.0,1717856.0,78.0,-10.333333,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
2,2,0.0,733391.0,424916.0,391925.0,64.0,-150.0,1717856.0,78.0,-10.333333,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
3,3,0.0,733391.0,424916.0,391925.0,64.0,-150.0,1717856.0,78.0,-10.333333,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
4,4,0.0,733391.0,424916.0,391925.0,64.0,-150.0,1717856.0,78.0,-10.333333,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0


In [8]:
# making the training and testing datasets according to the time
train = df.loc[df.date < '2022-01-01']
test = df.loc[df.date >= '2022-01-01']
X_train = train.drop(columns = ['Unnamed: 0', 'date','new_confirmed','country_name'])
y_train = train.new_confirmed
X_test = test.drop(columns = ['Unnamed: 0', 'date','new_confirmed','country_name'])
y_test = test.new_confirmed

In [9]:
X_train_dum= pd.get_dummies(X_train, drop_first = True)
X_test_dum = pd.get_dummies(X_test, drop_first = True)

Decision tree

In [12]:
# a basic one
clf = DecisionTreeRegressor(random_state=0, max_depth=10, max_features=80)
clf.fit(X_train_dum,y_train)

In [13]:
print(clf.get_n_leaves(), 'leaves and', clf.get_depth(), 'depth')

734 leaves and 10 depth


In [38]:
y_fit = clf.predict(X_train_dum)
y_pred = clf.predict(X_test_dum)
print('RMSE on train:', mean_squared_error(y_train,y_fit, squared = False))
print('RMSE on test:', mean_squared_error(y_test,y_pred, squared = False))
# this tree needs to be tuned

RMSE on train: 1230.7004388799885
RMSE on test: 7049.449653815186


In [47]:
start_time = time.time()
param_grid = {
    'max_depth': range(4,14),
    'max_leaf_nodes': range(6,150,4),
    'max_features': range(3, 10)
}

skf = StratifiedKFold(n_splits = 5)
#Minimizing FNR is equivalent to maximizing recall
grid_search = GridSearchCV(DecisionTreeRegressor(random_state=1), param_grid, scoring='neg_root_mean_squared_error',
                            cv=skf, n_jobs=-1, verbose = True)
grid_search.fit(X_train_dum, y_train)

# make the predictions
y_pred = grid_search.predict(X_test_dum)

print('Best RMSE Through Grid Search : %.3f'%-grid_search.best_score_)

print('Best params')
print(grid_search.best_params_)

print("Time taken = ", round((time.time()-start_time)/60), " minutes")

Fitting 5 folds for each of 2520 candidates, totalling 12600 fits




Best RMSE Through Grid Search : 1804.506
Best params
{'max_depth': 12, 'max_features': 9, 'max_leaf_nodes': 142}
Time taken =  4  minutes


In [49]:
clf1 = grid_search.best_estimator_
y_pred1 = clf1.predict(X_test_dum)
print('RMSE on test:', mean_squared_error(y_test,y_pred1, squared = False))

RMSE on test: 7235.850980766448


Tuning the tree does not improve the performance.

In [50]:
# try a finer tune
start_time = time.time()
param_grid2 = {
    'max_depth': np.arange(10,16),
    'max_leaf_nodes': np.arange(130,150,2),
    'max_features': np.arange(7,12),
    'ccp_alpha': np.arange(0,0.4,0.05)
}
grid_search2 = GridSearchCV(DecisionTreeRegressor(random_state=1), param_grid2, scoring='neg_root_mean_squared_error',
                            cv=skf, n_jobs=-1, verbose = True)
grid_search2.fit(X_train_dum, y_train)

# make the predictions
y_pred = grid_search2.predict(X_test_dum)

print('Best RMSE Through Grid Search : %.3f'%-grid_search2.best_score_)

print('Best params')
print(grid_search2.best_params_)

print("Time taken = ", round((time.time()-start_time)/60), " minutes")


Fitting 5 folds for each of 2400 candidates, totalling 12000 fits




Best RMSE Through Grid Search : 1804.506
Best params
{'ccp_alpha': 0.0, 'max_depth': 12, 'max_features': 9, 'max_leaf_nodes': 142}
Time taken =  4  minutes


This looks like the best decision tree.

In [61]:
# feature importance
importance_table = pd.DataFrame(data = [X_train_dum.columns, clf1.feature_importances_]).T
importance_table.columns = ['feature','importance']
ordered_importance_table = importance_table.sort_values('importance', ascending = False)

In [62]:
ordered_importance_table.loc[ordered_importance_table.importance != 0]

Unnamed: 0,feature,importance
0,new_deceased,0.425971
113,SP.DYN.LE00.IN,0.10193
7,life_expectancy,0.083669
159,cumulative_persons_fully_vaccinated,0.077934
8,mobility_workplaces,0.06123
158,new_persons_fully_vaccinated,0.04443
2,population_male,0.02775
1,population,0.025161
160,school_closing,0.023254
172,testing_policy,0.022


SP.DYN.LE00.IN: life expectancy at birth

EN.FSH.THRD.NO: fish species, theatened

NY.ADJ.DCO2.GN.ZS: carbon dioxide damage (% of GNI)

AG.SRF.TOTL.K2 : surface area of a country, including area under inland bodies of water and some coastal waterways.

TM.VAL.MRCH.WL.CD: Merchandise imports by the reporting economy