> 4/26

#### `XGBOOST` 

In [9]:
import xgboost as xgb

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
path = '../processed_data/all_merged.csv'
df = pd.read_csv(path)

In [5]:
df.dropna(inplace=True)
df.shape

(14912, 63)

In [6]:
# FROM DAVID'S 6_NB: 
# cutting out percentiles , lat/lon, zip, census tract, "scores"

initial_columns_to_fit =[
 'total population',
 'ozone',
 'pm2.5',
 'diesel pm',
 'pesticides',
 'traffic',
 'cleanup sites',
 'groundwater threats',
 'haz. waste',
 'imp. water bodies',
 'solid waste',
 'pollution burden',
 'asthma',
 'low birth weight',
 'education',
 'linguistic isolation',
 'poverty',
 'pop. char. ',
 'drinking water',
 'tox. release',
 'unemployment',
 'cardiovascular disease',
 'housing burden',
 'est total',
 'est gen',
 'est cold',
 'est farm',
 'est other']

In [7]:
# make a dataframe with just the initial_columns_to_fit, and all NaNs filled with median.

y = df['asthma']
X = df[initial_columns_to_fit]

In [8]:
# dropping all health outcomes and strict counts in addition to percentiles 
X = X.drop(columns = [
    'asthma', 
    'pop. char. ', 
    'cardiovascular disease', 
    'low birth weight',
    'total population'
    ])
# X.head()

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [19]:
xg_reg = xgb.XGBRegressor(
    # objective ='reg:linear', 
    objective = 'reg:squarederror',
    colsample_bytree = 0.3, 
    learning_rate = 0.1,
    max_depth = 5, 
    alpha = 10, 
    n_estimators = 10)

In [15]:
xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_test)

In [16]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 32.832868


In [20]:
### CV VALIDATIONS

params = {
    'objective' : 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5, 
    'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, 
    params = params, 
    nfold = 3,
    num_boost_round = 50,
    early_stopping_rounds = 10,
    metrics = 'rmse', 
    as_pandas=True, 
    seed = 42)

In [22]:
# #cv_results contains train and test RMSE metrics for each boosting round.
# cv_results.head()

In [23]:
print((cv_results["test-rmse-mean"]).tail(1))

49    20.960534
Name: test-rmse-mean, dtype: float64


In [24]:
from sklearn.ensemble import GradientBoostingRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

gbr = GradientBoostingRegressor(random_state=0)
gbr.fit(X_train, y_train)

gbr.predict(X_test)
print(gbr.score(X_test, y_test))

xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)

0.4944453120772796


In [26]:
print(f'Train accuracy {gbr.score(X_train,y_train)}')
print(f'Test accuracy {gbr.score(X_test,y_test)}')

Train accuracy 0.5563905575162176
Test accuracy 0.4944453120772796


In [25]:
from sklearn.model_selection import GridSearchCV

import multiprocessing

print("Parallel Parameter optimization")

xgb_model = xgb.XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2)
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                        'n_estimators': [50, 100, 200]},
                         verbose=1,
                         n_jobs=2)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)

Parallel Parameter optimization
Fitting 5 folds for each of 9 candidates, totalling 45 fits
-0.08065429844418215
{'max_depth': 2, 'n_estimators': 100}


In [None]:
import xgboost as xgb
# read in data
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
dtest = xgb.DMatrix('demo/data/agaricus.txt.test')
# specify parameters via map
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
num_round = 2
bst = xgb.train(param, dtrain, num_round)
# make prediction
preds = bst.predict(dtest)