## XGBoost regressor

#### More Grid searches and continual parameter tuning

In [97]:
import os
import pandas as pd
import numpy as np
import seaborn as sns


from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

#### Read in merged data

In [53]:
# Relative path to the merged CSV that feeds the rest of the notebook.
filepath = '../../processed_data/all_merged.csv'
# Load the merged data into the `enviro` DataFrame.
enviro = pd.read_csv(filepath)

In [5]:
# Show column names, non-null counts and dtypes to see where values are missing.
enviro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25444 entries, 0 to 25443
Data columns (total 63 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   zip                          25444 non-null  int64  
 1   total population             25444 non-null  int64  
 2   ozone                        25333 non-null  float64
 3   ozone pctl                   25333 non-null  float64
 4   pm2.5                        25286 non-null  float64
 5   pm2.5 pctl                   25286 non-null  float64
 6   diesel pm                    25444 non-null  float64
 7   diesel pm pctl               25444 non-null  float64
 8   pesticides                   25444 non-null  float64
 9   pesticides pctl              25444 non-null  float64
 10  traffic                      25353 non-null  float64
 11  traffic pctl                 25353 non-null  float64
 12  cleanup sites                25444 non-null  float64
 13  cleanup sites pc

## Third XGBoost regression model with different grid-search parameters. The features chosen are those with high correlation; health effects that are themselves other target variables are removed.

##### cutting out percentiles, lat/lon, zip, census tract, "scores"

In [67]:
# Explicit list of predictor columns taken from `enviro`.
# The trailing space in 'pop. char. ' is kept exactly as written, since the
# string has to match the column label.
feature_columns = [
    'total population',
    'ozone',
    'pm2.5',
    'diesel pm',
    'pesticides',
    'traffic',
    'cleanup sites',
    'groundwater threats',
    'haz. waste',
    'imp. water bodies',
    'solid waste',
    'pollution burden',
    'low birth weight',
    'education',
    'linguistic isolation',
    'poverty',
    'pop. char. ',
    'drinking water',
    'tox. release',
    'unemployment',
    'ces_per',
    'cardiovascular disease',
    'housing burden',
    'est total',
    'est gen',
    'est cold',
    'est farm',
    'est other',
]
X = enviro[feature_columns]

# Regression target.
y = enviro['asthma']

In [68]:
### drop possible target health columns as well

In [69]:
# Remove the other health-outcome columns from the predictors.
# 'pop. char. ' and 'total population' were considered for removal but stay in X.
health_outcome_columns = ['cardiovascular disease', 'low birth weight']
X = X.drop(columns=health_outcome_columns)

In [70]:
## further cleanup

In [71]:
# One-hot encode any non-numeric columns, then fill gaps with column medians.
# NOTE(review): these medians are computed over all rows, before the
# train/test split -- confirm that is acceptable for the evaluation.
X = pd.get_dummies(X)
column_medians = X.median()
X = X.fillna(column_medians)

# Keep only the rows whose target value is present (rows, not columns).
has_target = y.notna()
X = X[has_target]
y = y[has_target]

#### Split into train test

In [72]:
# Hold out 22% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.22,
    random_state=42,
)

In [73]:
## Scaling data

In [74]:
# Impute both splits with medians learned from the training split only, so the
# test split is preprocessed with exactly the same statistics as the training
# split and no test-set information is used.
# (If the features were already imputed before the split, these fills change
# nothing; they remain as a safeguard.)
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Standardize the features: fit the scaler on the training split, then apply
# the same fitted transformation to the test split.
sc_X = StandardScaler()

# NOTE: from here on X_train / X_test hold the scaler's output rather than the
# original DataFrames, so re-running this cell requires re-running the
# train/test split cell first.
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [75]:
# Baseline model: XGBoost regressor with default hyperparameters.
xgreg = XGBRegressor()
xgreg = xgreg.fit(X_train, y_train)

In [76]:
# `score` on a regressor returns R^2 (coefficient of determination), not a
# classification accuracy; the printed label is kept as-is.
train_r2 = xgreg.score(X_train, y_train)
test_r2 = xgreg.score(X_test, y_test)
print(f'Train accuracy {train_r2}')
print(f'Test accuracy {test_r2}')

Train accuracy 0.9158999157626224
Test accuracy 0.7530615037770655


#### further refine with a Gridsearch and CV

In [77]:
# Exhaustive search over 3 * 5 * 4 * 3 * 2 = 360 hyperparameter combinations,
# each evaluated with 3-fold cross-validation on negative mean squared error.
xgbs = GridSearchCV(
    estimator=XGBRegressor(nthread=4),
    param_grid={
        "learning_rate": [0.10, 0.15, 0.20],
        "max_depth": [3, 4, 5, 6, 8],
        "min_child_weight": [1, 3, 5, 7],
        "gamma": [0.0, 0.1, 0.2],
        "colsample_bytree": [0.3, 0.4],
        'nthread': [4],
    },
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=1,
    verbose=1,
)

In [78]:
# Run the grid search on the training split (slow: 360 candidates x 3 folds).
outcome = xgbs.fit(X_train,y_train)

Fitting 3 folds for each of 360 candidates, totalling 1080 fits


In [79]:
# Refit a fresh regressor on the full training split using the best
# hyperparameters found by the grid search.
best_params = xgbs.best_params_
xgreg = XGBRegressor(**best_params)
xgreg = xgreg.fit(X_train, y_train)

In [92]:
# `score` on a regressor returns R^2 (coefficient of determination), not a
# classification accuracy; the printed label is kept as-is.
train_r2 = xgreg.score(X_train, y_train)
test_r2 = xgreg.score(X_test, y_test)
print(f'Train accuracy {train_r2}')
print(f'Test accuracy {test_r2}')

Train accuracy 0.9472151826696329
Test accuracy 0.7639090300436409


In [81]:
## RMSE score & r2_score metrics

In [93]:
# Predictions of the current `xgreg` model on the held-out test split.
preds = xgreg.predict(X_test)

In [96]:
#train_score = r2_score(y_train, xgreg.predict(X_train))
#test_score = r2_score(y_test, preds)

#### RMSE score

In [None]:
# Root-mean-squared error of the test-set predictions (same units as the target).
# The assignment must be live code: it was previously commented out, which left
# `rmse` undefined for the print below.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

#### 