In [19]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Bike Rental Datasets

In [None]:
# import libararies 
import pandas as pd
import datetime as dt 
import zoneinfo
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, f1_score
import numpy as np
from xgboost import XGBRegressor

import warnings

warnings.filterwarnings('ignore')

In [None]:
df_bikes = pd.read_csv("data/bike_rentals.csv")
df_bikes.head()
df_bikes.describe()

In [None]:
df_bikes.info()

In [None]:
# finding number of null values 
df_bikes.isna().sum() # return cols name with null values 
df_bikes.isna().sum().sum()


In [None]:
# displaying null values 
df_bikes[df_bikes.isna().any(axis=1)] # .df_bikes.isna().any gathers any and all null values while (axis=1) specifies values in the columns.

In [None]:
df_bikes['windspeed'].fillna(df_bikes['windspeed'].median(), inplace=True)
# filling missing data with meidan is often better choice compared to mean, bcos median guarantees that half the
# data is greater than the given value and half that is lower. Mean on the other other hand is vulnerable to
# outliers

df_bikes[df_bikes.isna().any(axis=1)]

In [None]:
# group df by seanson with the median aggregate 
df_bikes.groupby(['season']).median()

In [None]:
# replaceing missing values in 'hum' column 
df_bikes['hum'] = df_bikes['hum'].fillna(df_bikes.groupby(['season'])['hum'].transform('median'))

In [None]:
df_bikes.iloc[[129, 213, 388]]

In [None]:
# finding null values in temp. 
df_bikes[df_bikes['temp'].isna()]

In [None]:
# getting mean of temp at instance 700 and 702
mean_temp = (df_bikes.iloc[702]['temp'] + df_bikes.iloc[702]['temp']) / 2
mean_atemp = (df_bikes.iloc[702]['atemp'] + df_bikes.iloc[702]['atemp']) / 2

# replace missing temp and atemp values 
df_bikes['temp'].fillna(mean_temp, inplace=True)
df_bikes['atemp'].fillna(mean_atemp, inplace=True)

df_bikes.iloc[[701]]

In [None]:
# converting 'dteday' to a datetime type 
df_bikes['dteday'] = pd.to_datetime(df_bikes['dteday'], infer_datetime_format=True)

# replacing in yr
df_bikes.loc[730, 'yr'] = 1.0 

We can now extrapolate dates for the null values using some different approaches. A standard approach is convert the 'mnth' column to the correct months extrapolated from the 'dteday' column. This has the advantage of correcting any additional errors that may have surfaced in conversions, assuming of course that the 'dteday' column is correct.



In [None]:
df_bikes['mnth'] = df_bikes['dteday'].dt.month
df_bikes.tail()

In [None]:
# droping non numerial objects
main_cols_save = df_bikes.columns.difference(['dteday','casual','registered'])
df_bikes[main_cols_save].to_csv('data/df_bikes_cleaned.csv', index=False)

In [None]:
main_cols = df_bikes.columns.difference(['dteday','casual','registered', 'cnt'])

In [None]:
X = df_bikes[main_cols]
y = df_bikes['cnt']
X, y

In [None]:
# splitting datasets 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [None]:
l_reg = LinearRegression()

l_reg.fit(X_train, y_train)

y_pred = l_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE: %0.2f" %(rmse))

In [None]:
df_bikes['cnt'].describe()

In [None]:
xg_reg = XGBRegressor()
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE: %0.2f" %(rmse))

In [None]:
# cross validation with linear reg 
from sklearn.model_selection import cross_val_score

model = LinearRegression()

scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=10)

rmse = np.sqrt(-scores)

print('Reg rmse:', np.round(rmse, 2))

print('RMSE mean: %0.2f' % (rmse.mean()))

## The Census

In [None]:
df_census = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None)

df_census.head()

In [None]:
df_census.columns=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

df_census.head()

In [None]:
df_census.info()

In [None]:
num_cols = df_census._get_numeric_data().columns.tolist()
cat_cols = [col for col in df_census.columns if col not in num_cols + ['education']]

num_cols
cat_cols

In [None]:
df_census = df_census.drop(['education'], axis=1)


In [None]:
df_census = pd.get_dummies(data = df_census, columns = cat_cols)
df_census.head()

In [None]:
main_cols_save = df_census.columns.difference(['education','income_ <=50K'])
df_census[main_cols_save].to_csv('data/df_census_cleaned.csv', index=False)

In [None]:
main_cols = df_census.columns.difference(['education','income_ <=50K', 'income_ >50K'])
main_cols

In [None]:
X = df_census[main_cols]
y = df_census['income_ >50K']

In [None]:
from sklearn.linear_model import LogisticRegression

def cross_val(classifier, num_splits=10):
    model = classifier
    scores = cross_val_score(model, X, y, cv=num_splits)
    print('Accuracy:', np.round(scores, 2))
    print('Accuarcy mean: %0.2f' %(scores.mean()))

cross_val(LogisticRegression())


In [None]:
from xgboost import XGBClassifier

cross_val(XGBClassifier(n_estimators=5))

In [None]:
# splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [None]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.metrics import accuracy_score

clf = DecisionTreeClassifier(random_state=2)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy_score(y_pred, y_test)

In [None]:
df_bikes = pd.read_csv('data/df_bikes_cleaned.csv')
main_cols = df_bikes.columns.difference(['dteday','casual','registered', 'cnt'])
X_bikes = df_bikes[main_cols]
y_bikes = df_bikes['cnt']

reg = DecisionTreeRegressor(random_state=2)
scores = cross_val_score(reg, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=5)
rmse = np.sqrt(-scores)

print('RMSE mean: %0.2f' % (rmse.mean()))


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_train)

reg_mse = mean_squared_error(y_train, y_pred)
reg_rmse = np.sqrt(reg_mse)
reg_rmse

In [None]:
from sklearn.model_selection import GridSearchCV 

params = {'max_depth':[None,2,3,4,6,8,10,20]}

reg = DecisionTreeRegressor(random_state=2)

grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=2)

grid_reg.fit(X_train, y_train)

best_params = grid_reg.best_params_

print("Best params:", best_params)

best_score = np.sqrt(-grid_reg.best_score_)
print("Training score: {:.3f}".format(best_score))

best_model = grid_reg.best_estimator_

y_pred = best_model.predict(X_test)

rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))

print('Test score: {:.3f}'.format(rmse_test))

In [None]:
def grid_search(params, reg=DecisionTreeRegressor(random_state=2)):

    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

    grid_reg.fit(X_train, y_train)

    best_params = grid_reg.best_params_
    print("Best params:", best_params)
    best_score = np.sqrt(-grid_reg.best_score_)
    print("Training score: {:.3f}".format(best_score))

    y_pred = grid_reg.predict(X_test)
    rmse_test = mean_squared_error(y_test, y_pred)**0.5

    print('Test score: {:.3f}'.format(rmse_test))

In [None]:
grid_search(params={'min_samples_leaf':[1, 2, 4, 6, 8, 10, 20, 30]})

In [None]:
grid_search(params={'max_depth':[None,2,3,4,6,8,10,20],'min_samples_leaf':[1,2,4,6,8,10,20,30]})

In [None]:
grid_search(params={'max_depth':[6,7,8,9,10],'min_samples_leaf':[3,5,7,9]})

## Heart disease dataset

In [None]:
df_heart = pd.read_csv('data/heart_disease.csv')
df_heart.head()


In [None]:
main_cols = df_heart.columns.difference(['target'])
X = df_heart[main_cols]
y = df_heart['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

model = DecisionTreeClassifier(random_state=2)
scores = cross_val_score(model, X, y, cv=5)

print('Accuracy:', np.round(scores, 2))
print('Accuracy mean: %0.2f' % (scores.mean()))


### RandomizedSearch CLF

In [None]:
from sklearn.model_selection import RandomizedSearchCV

def randomized_search_clf(params, runs=20, clf=DecisionTreeClassifier(random_state=2)):
    rand_clf = RandomizedSearchCV(clf, params, n_iter=runs, cv=5, n_jobs=3, random_state=2)
    rand_clf.fit(X_train, y_train)
    best_model = rand_clf.best_estimator_
    best_score = rand_clf.best_score_
    print("Training score: {:.3f}".format(best_score))
    
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('Test score: {:.3f}'.format(accuracy))

    return best_model

params = {'criterion':['entropy', 'gini'], 'splitter': ['random', 'best'],
          'min_weight_fraction_leaf': [0.0, 0.0025, 0.005, 0.0075, 0.01],
          'min_samples_split':[2, 3, 4, 5, 6, 8, 10], 'min_samples_leaf':[1, 0.01, 0.02, 0.03, 0.04],
          'min_impurity_decrease':[0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
          'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None],
          'max_features':['auto', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],
          'max_depth':[None, 2,4,6,8],
          'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]}    

randomized_search_clf(params=params);

In [None]:
params = {'max_depth':[None, 6, 7],'max_features':['auto', 0.78],
          'max_leaf_nodes':[45, None], 'min_samples_leaf':[1, 0.035, 0.04, 0.045, 0.05],
          'min_samples_split':[2, 9, 10],'min_weight_fraction_leaf': [0.0, 0.05, 0.06, 0.07],}
model_ = randomized_search_clf(params= params, runs=100)

In [None]:
model = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                               max_features=0.78, max_leaf_nodes=45, min_impurity_decrease=0.0,
                               min_impurity_split=None, min_samples_leaf=0.045, min_samples_split=9,
                               min_weight_fraction_leaf=0.06, random_state=2, splitter='best')

scores = cross_val_score(model_, X, y, cv=10)

print('Accuracy:', np.round(scores, 2))

print('Accuracy mean: %0.2f' % (scores.mean()))

### Feature importance

In [None]:
best_clf = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,max_features=0.8,
                                  max_leaf_nodes=47,min_impurity_decrease=0.0, min_impurity_split=None,
                                  min_samples_leaf=1, min_samples_split=8,min_weight_fraction_leaf=0.05,
                                  random_state=2, splitter='best')

best_clf.fit(X, y)

best_clf.feature_importances_

In [None]:
import operator

feature_dict = dict(zip(X.columns, best_clf.feature_importances_))

sorted(feature_dict.items(), key=operator.itemgetter(1), reverse=True)


## Bagging with Random Forests

### Bootstrap aggreagation

In [None]:
df_census = pd.read_csv('data/census_cleaned.csv')

df_census.head()

In [None]:
main_cols_census = df_census.columns.difference(['income_ >50K'])
X_census = df_census[main_cols_census]
y_census = df_census['income_ >50K']

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=2, n_jobs=-1)
scores = cross_val_score(rf, X_census, y_census, cv=5)

print('Accuracy:', np.round(scores, 3))
print('Accuracy mean: %0.3f' % (scores.mean()))

In [None]:
df_bikes.head()

main_cols_bikes = df_bikes.columns.difference(['dteday','casual','registered', 'cnt'])
X_bikes = df_bikes[main_cols_bikes]
y_bikes = df_bikes['cnt']

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=10, random_state=2, n_jobs=-1)
scores = cross_val_score(rf, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=10)

rmse = np.sqrt(-scores)

print('RMSE:', np.round(rmse, 3))
print('RMSE mean: %0.3f' % (rmse.mean()))

In [None]:
### oob_Score 
rf = RandomForestClassifier(oob_score=True, n_estimators=10, random_state=2, n_jobs=-1)
rf.fit(X_census, y_census)

rf.oob_score_

In [None]:
rf = RandomForestClassifier(oob_score=True, n_estimators=50, random_state=2, n_jobs=-1)
rf.fit(X_census, y_census)

rf.oob_score_

In [None]:
rf = RandomForestClassifier(oob_score=True, n_estimators=100, random_state=2, n_jobs=-1)
rf.fit(X_census, y_census)

rf.oob_score_

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns

sns.set()

oob_scores = []
rf = RandomForestClassifier(n_estimators=50, warm_start=True, oob_score=True, n_jobs=-1, random_state=2)
rf.fit(X_census, y_census)
oob_scores.append(rf.oob_score_)

est = 50
estimators = [est]
for i in range(9):
    est += 50
    estimators.append(est)
    rf.set_params(n_estimators=est)
    rf.fit(X_census, y_census)
    oob_scores.append(rf.oob_score_)

plt.figure(figsize=(15,7))
plt.plot(estimators, oob_scores)
plt.xlabel('Number of Trees')
plt.ylabel('oob_score_')
plt.title('Random Forest Warm Start', fontsize=15)
plt.savefig('Random_Forest_Warm_Start', dpi=325)
plt.show()

In [None]:
rf = RandomForestRegressor(n_estimators=50, warm_start=True, n_jobs=-1, random_state=2)
scores = cross_val_score(rf, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=10)
rmse = np.sqrt(-scores)

print('RMSE:', np.round(rmse, 3))
print('RMSE mean: %0.3f' % (rmse.mean()))

In [None]:
# Fine-tunning hyperparameters

X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)

def randomized_search_Reg(params, runs=16, reg=RandomForestRegressor(random_state=2, n_jobs=-1)):
    rand_reg = RandomizedSearchCV(reg, params, n_iter=runs, scoring='neg_mean_squared_error',
                                  cv=10, n_jobs=-1, random_state=2)
    rand_reg.fit(X_train, y_train)
    best_model = rand_reg.best_estimator_
    best_params = rand_reg.best_params_
    best_score = np.sqrt(-rand_reg.best_score_)
    print("Best params:", best_params)
    print("Training score: {:.3f}".format(best_score))
    
    y_pred = best_model.predict(X_test)
    rmse_test = mean_squared_error(y_test, y_pred)**0.5
    print('Test set score: {:.3f}'.format(rmse_test))

    return best_model
    
    
params={'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05],
        'min_samples_split':[2, 0.01, 0.02, 0.03, 0.04, 0.06, 0.08, 0.1],
        'min_samples_leaf':[1,2,4,6,8,10,20,30],
        'min_impurity_decrease':[0.0, 0.01, 0.05, 0.10, 0.15, 0.2],
        'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None],
        'max_features':['auto', 0.8, 0.7, 0.6, 0.5, 0.4],
        'max_depth':[None,2,4,6,8,10,20]}

randomized_search_Reg(params=params)

In [None]:
params={'min_samples_leaf': [1,2,4,6,8,10,20,30],
        'min_impurity_decrease':[0.0, 0.01, 0.05, 0.10, 0.15, 0.2],
        'max_features':['auto', 0.8, 0.7, 0.6, 0.5, 0.4],
        'max_depth':[None,2,4,6,8,10,20]}
randomized_search_Reg(params=params)

In [None]:
params={'min_samples_leaf':[1,2,4,6,8,10,20,30],
        'min_impurity_decrease':[0.0, 0.01, 0.05, 0.10, 0.15, 0.2],
        'max_features':['auto', 0.8, 0.7, 0.6, 0.5, 0.4],
        'max_depth':[None,4,6,8,10,12,15,20]}
randomized_search_Reg(params=params, runs=20)

In [None]:
params={'min_samples_leaf':[1,2,3,4,5,6],
        'min_impurity_decrease':[0.0, 0.01, 0.05, 0.08, 0.10, 0.12, 0.15],
        'max_features':['auto', 0.8, 0.7, 0.6, 0.5, 0.4],
        'max_depth':[None,8,10,12,14,16,18,20]}

randomized_search_Reg(params=params)

In [None]:
params={'min_samples_leaf':[1,2,4,6,8,10,20,30],
        'min_impurity_decrease':[0.0, 0.01, 0.05, 0.10, 0.15, 0.2],
        'max_features':['auto', 0.8, 0.7, 0.6, 0.5, 0.4],
        'max_depth':[None,4,6,8,10,12,15,20],'n_estimators':[100]}
randomized_search_Reg(params=params, runs=20)

In [None]:
rf = RandomForestRegressor(n_estimators=100,
                           min_impurity_decrease=0.1,
                           max_features=0.6, max_depth=12,
                           warm_start=True, n_jobs=-1,
                           random_state=2)

scores = cross_val_score(rf, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=10)

rmse = np.sqrt(-scores)

print('RMSE:', np.round(rmse, 3))

print('RMSE mean: %0.3f' % (rmse.mean()))

In [None]:
# shuffling data 
from sklearn.utils import shuffle
df_shuffle_bikes = shuffle(df_bikes, random_state=2)

X_shuffle_bikes = df_shuffle_bikes[main_cols_bikes]
y_shuffle_bikes = df_shuffle_bikes['cnt']

rf = RandomForestRegressor(n_estimators=100,
                           min_impurity_decrease=0.1,
                           max_features=0.6, max_depth=12,
                           warm_start=True, n_jobs=-1,
                           random_state=2)

scores = cross_val_score(rf, X_shuffle_bikes, y_shuffle_bikes, scoring='neg_mean_squared_error', cv=10)

rmse = np.sqrt(-scores)

print('RMSE:', np.round(rmse, 3))

print('RMSE mean: %0.3f' % (rmse.mean()))

## From Gradient Boosting to XGBoost

### Buiding Gradient Boost From Scratch 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)


tree_1 = DecisionTreeRegressor(max_depth=2, random_state=2)
tree_1.fit(X_train, y_train)
y_train_pred = tree_1.predict(X_train)  # predict with trainset instead of testset.
y2_train = y_train - y_train_pred # get residual 

tree_2 = DecisionTreeRegressor(max_depth=2, random_state=2)
tree_2.fit(X_train, y2_train) # use residual as target for next training
y2_train_pred = tree_2.predict(X_train)
y3_train = y2_train -y2_train_pred

tree_3 = DecisionTreeRegressor(max_depth=2, random_state=2)
tree_3.fit(X_train, y3_train) # repeat as done earlier 

y1_pred = tree_1.predict(X_test) # predit test variable 
y2_pred = tree_2.predict(X_test)
y3_pred = tree_3.predict(X_test)

y_pred = y1_pred + y2_pred + y3_pred # summing up predictions 
mean_squared_error(y_test, y_pred)**0.5


### Gradient boosting model using sklearn

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(max_depth=2, n_estimators=3, random_state=2,
                                learning_rate=1.0)

gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
mean_squared_error(y_test, y_pred)**0.5

In [None]:
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=30, random_state=2,
                                learning_rate=1.0)

gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
mean_squared_error(y_test, y_pred)**0.5;

In [None]:
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2)

gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
mean_squared_error(y_test, y_pred)**0.5

In [None]:
learning_rate_values = [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1.0]

for value in learning_rate_values:
    gbr = GradientBoostingRegressor(max_depth=2,   n_estimators=300,
                                    random_state=2, learning_rate=value)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)**0.5
    print('Learning Rate:', value, ', Score:', rmse)

In [None]:
# Base Learner 
depths = [None, 1, 2, 3, 4]
for depth in depths:
    gbr = GradientBoostingRegressor(max_depth=depth,   n_estimators=300,
                                    random_state=2)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)**0.5
    print("Max Depth:", depth, ", Score:", rmse);

In [None]:
# subsample 
samples = [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.2]
for sample in samples:
    gbr = GradientBoostingRegressor(max_depth=3,   n_estimators=300, subsample=sample, random_state=2)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)**0.5
    print("Sample:", sample, ", Score:", rmse);
    

In [None]:
params = {'subsample': [0.5, 1.0, 0.0, 0.65, 0.70, 0.75, 0.02, 0.01],
          'n_estimators': [300, 500, 1000, 1500, 1800, 2000],
          'learning_rate': [0.05, 0.075, 0.1]}

gbr = GradientBoostingRegressor(max_depth=3, random_state=2)

rand_Reg = RandomizedSearchCV(gbr, params, n_iter=10,
                              scoring='neg_mean_squared_error', cv=5,
                              n_jobs=-1, random_state=2)
rand_Reg.fit(X_train, y_train)
best_model = rand_Reg.best_estimator_
best_params = rand_Reg.best_params_
best_score = np.sqrt(-rand_Reg.best_score_)

print("Best params:", best_params)
print("Training score: {:.3f}".format(best_score))

y_pred = best_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('Test set score: {:.3f}'.format(rmse_test))



In [None]:
gbr = GradientBoostingRegressor(max_depth=3, n_estimators=1600, subsample=0.75, learning_rate=0.02, random_state=2)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
mean_squared_error(y_test, y_pred)**0.5

In [None]:
# Xgboost
xg_reg = XGBRegressor(max_depth=3, n_estimators=1600, eta=0.02, subsample=0.75, random_state=2)
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)
mean_squared_error(y_test, y_pred)**0.5

## Big Data; XGBOOST

In [None]:
df = pd.read_csv('data/exoplanets.csv')
df.head();

In [None]:
df.info()

# checking for null values 
df.isnull().sum().sum()

In [None]:
main_cols = df.columns.difference(['LABEL'])

X = df[main_cols]
y = df['LABEL']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [None]:
sns.countplot(y)
plt.title('Target Distribution', fontdict={'size':14})

y.value_counts()

In [None]:
# grad boost classifer 
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import time

In [None]:
%%timeit
df.info()

In [None]:
%%time
gbr = GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=2)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
score = accuracy_score(y_pred, y_test)
print('Score: ' + str(score))

## XGBoost Unveiling 

### Building XGBoost Models


In [None]:
# loading Iris dataset 
from sklearn import datasets

iris = datasets.load_iris()

df = pd.DataFrame(data= np.c_[iris['data'], iris['target']], columns=iris['feature_names'] + ['target'])  # np.c_ is concatenating two numpy arrays 
df.head()


# preparing datasets for ml. 
X_train, X_test, y_train, y_test = train_test_split(iris['data'], iris['target'], random_state=2)

xgb = XGBClassifier(booster='gbtree', objective='multi:softprob',
                    max_depth=6, learning_rate=0.1,
                    n_estimators=100, random_state=2,
                    n_jobs=-1)

xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)

accuracy_score(y_pred, y_test)


In [None]:
# Diabetes dataset 
X, y = datasets.load_diabetes(return_X_y=True)
xgb = XGBRegressor(booster="gbtree", objective="reg:squarederror",
                   max_depth=6, learning_rate=0.1,
                   n_estimators=100, random_state=2,
                   n_jobs=-1)

scores = cross_val_score(xgb, X, y, scoring='neg_mean_squared_error', cv=10)

rmse = np.sqrt(-scores)
print('RMSE:', np.round(rmse, 3))
print('RMSE mean: %0.3f' % (rmse.mean()))

pd.DataFrame(y).describe()
# xgboost regressor template (cross-validation)

### Higgs Challenge

In [None]:
df = pd.read_csv('data/atlas-higgs-challenge-2014-v2.csv.gz', nrows=250000,
                 compression='gzip')

df.head()

In [None]:
del df['Weight']
del df['KaggleSet']

df = df.rename(columns={'KaggleWeight': 'Weight'})
df.head()

In [None]:
label_col = df['Label']
del df['Label']
df['Label'] = label_col

df.head()

In [None]:
df.info()

In [None]:
# replacing target variable with 0 and 1
df['Label'].replace(('s','b'), (1, 0), inplace=True)


In [None]:
main_cols = df.columns.difference(['EventId', 'Label', 'Weight'])
target_cols = ['Label']

X = df[main_cols].squeeze()
y = df[target_cols].squeeze();

# X = df.iloc[:,1:31]
# y = df.iloc[:,-1];

In [None]:
type(df.iloc[:,1:31]), type( df[main_cols].squeeze())
type(df[target_cols].squeeze()), type(df.iloc[:,-1])

In [None]:
# The weights should first be scaled to match the test data since the test data provides the expected number of signal 
# and background events generated by the test set. The test data has 550,000 rows, more than twice the 250,000 rows (len(y))
# provided by the training data. Scaling weights to match the test data can be achieved by multiplying the weight column by
# the percentage of increase, as follows:

df['test_Weight'] = df['Weight'] * 550000 / len(y)

Next, XGBoost provides a hyperparameter, scale_pos_weight, which takes the scaling factor into account. The scaling factor is the sum of the weights of the background noises divided by the sum of the weight of the signal. The scaling factor can be computed using pandas conditional notation, as follows:

In [None]:
s = np.sum(df[df['Label'] == 1]['test_Weight'])
b = np.sum(df[df['Label'] == 0]['test_Weight']);

scale_pos_weight = b/s


In [None]:
from typing import Tuple
import xgboost as xgb

def f1_eval(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    y = dtrain.get_label()

    # convert the predicted values from {predt E R | 0<predt<1} to {0, 1} with a threshold of 0.5
    # all values less than 0.5 would be converted to 0 (False) and
    # all values equal or greater than 0.5 would be converted to 1 (True)
    predt_binary = np.where(predt > 0.5, 1, 0)
    return "F1_score", f1_score(y_true=y, y_pred=predt_binary)

In [None]:
import xgboost as xgb

xgb_clf = xgb.DMatrix(X, y, missing=-999.0, weight=df['test_Weight'])

param = {'objective':'binary:logitraw',
         'scale_pos_weight': b/s,
         'eta': 0.1,
         'max_depth': 6,
         'eval_metric': 'auc'}

# list of params that include the preceeding items, along with the evaluation metric (auc) and ams@0.15
plst = list(param.items()) + [('eval_metric', 'ams@0.15')]
# create watchlist that include the intializaition  classifier and 'train' so that you can veiw scores as tree 
# cont. to boost 
watchlist = [(xgb_clf, 'train')]

# num of boosting rounds 
num_round = 120

print('loading data end, start to boost tree')
bst = xgb.train(plst, xgb_clf, num_round, watchlist, feval=f1_eval)

# save model 
bst.save_model('higgs.model')
print('finish training');


## XGBoody Hyperparameters

In [None]:
import pandas as pd
import numpy as np 
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV, GridSearchCV


from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split

In [None]:
# heart disease dataset
df = pd.read_csv('data/heart_disease.csv')
df.head()
df.info()

In [None]:
main_cols_heart = df.columns.difference(['target'])
X = df[main_cols_heart].squeeze()
y = df['target'].squeeze()

model = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=2, use_label_encoder =False)
scores = cross_val_score(model, X, y, cv=5)

print('Accuracy:', np.round(scores, 2))
print('Accuracy mean: %0.2f' % (scores.mean()))


### StratifiedKfold

In [None]:
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

# baseline model 
scores = cross_val_score(model, X, y, cv=kfold)
print('Accuracy:', np.round(scores, 2))
print('Accuracy mean: %0.2f' % (scores.mean()))


In [None]:
# combining gridsearch cv and randomizedsearch cv 

def grid_search(params: dict, random: bool =False) -> dict:
    xgb = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=2, use_label_encoder =False)
    
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2) 
    if random:
        grid = RandomizedSearchCV(xgb, params, cv=kfold, n_iter=20, n_jobs=-1, random_state=2)
        
    else:
        grid = GridSearchCV(xgb, params, cv=kfold, n_jobs=-1)
    
    grid.fit(X, y)
    best_params = grid.best_params_
    print("Best params:", best_params)
    best_score = grid.best_score_
    print("Best score: {:.3f}".format(best_score))
    return best_params

In [None]:
grid_search(params={'n_estimators':[100, 200, 400, 800]})

grid_search(params={'learning_rate':[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]})

grid_search(params={'max_depth':[2, 3, 5, 6, 8]})

grid_search(params={'gamma':[0, 0.01, 0.1, 0.5, 1, 2]})

grid_search(params={'min_child_weight':[1, 2, 3, 4, 5]})

grid_search(params={'subsample':[0.5, 0.7, 0.8, 0.9, 1]})

grid_search(params={'colsample_bytree':[0.5, 0.7, 0.8, 0.9, 1]})


### Early stopping

Early stopping is a general method to limit the number of training rounds in iterative machine learning algorithms. In this section, we look at eval_set, eval_metric, and early_stopping_rounds to apply early stopping.

Early stopping provides a limit to the number of rounds that iterative machine learning algorithms train on. Instead of predefining the number of training rounds, early stopping allows training to continue until n consecutive rounds fail to produce any gains, where n is a number decided by the user.


#### eval_set and eval_metric

early_stopping_rounds is not a hyperparameter, but a strategy for optimizing the n_estimators hyperparameter.
Normally when choosing hyperparameters, a test score is given after all boosting rounds are complete. To use early stopping, we need a test score after each round.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

model = XGBClassifier(booster='gbtree', objective='binary:logistic',
                      radom_state=2, verbosity = 0,  use_label_encoder =False)

eval_set = [(X_test, y_test)]

eval_metric = 'error'

model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# early stopping rounds
model = XGBClassifier(booster='gbtree', objective='binary:logistic',
                      random_state=2, use_label_encoder =False)
eval_set = [(X_test, y_test)]
eval_metric = 'error'

model.fit(X_train, y_train, eval_metric='error', eval_set=eval_set,
          early_stopping_rounds=10, verbose=True)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


In [None]:
model = XGBClassifier(random_state=2, n_estimators=5000, use_label_encoder=False)

eval_set = [(X_test, y_test)]

eval_metric="error"

model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, early_stopping_rounds=100)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
grid_search(params={'n_estimators':[2, 25, 50, 75, 100]});

In [None]:
params = {'max_depth':[1, 2, 3, 4, 5, 6, 7, 8], 'n_estimators': [25]}
grid_search(params=params);

In [None]:
params = {'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],'max_depth':[1], 'n_estimators': [25]}
grid_search(params=params);

In [None]:
params = {'min_child_weight':[1],'learning_rate': [0.5],'max_depth':[1], 'n_estimators': [25]}
grid_search(params=params);

In [None]:
params = {'subsample':[0.5, 0.6, 0.7, 0.8, 0.9, 1], 'min_child_weight':[1],
          'learning_rate': [0.5],'max_depth':[1], 'n_estimators': [25]}
grid_search(params=params);

In [None]:
params_ =  {'learning_rate': 0.5, 'max_depth': 1, 'min_child_weight': 1,
            'n_estimators': 25, 'subsample': 0.9}

In [None]:
params={'subsample':[0.5, 0.6, 0.7, 0.8, 0.9, 1],
                    'min_child_weight':[1, 2, 3, 4, 5],
                    'learning_rate':[0.1, 0.2, 0.3, 0.4, 0.5],
                    'max_depth':[1, 2, 3, 4, 5],
                    'n_estimators':[2]}
grid_search(params);

In [None]:
# Hyperparameter adjustments

params = {'subsample':[0.5, 0.6, 0.7, 0.8, 0.9, 1],
                    'min_child_weight':[1, 2, 3, 4, 5],
                    'learning_rate':[0.1, 0.2, 0.3, 0.4, 0.5],
                    'max_depth':[1, 2, 3, 4, 5, None],
                    'n_estimators':[2, 25, 50, 75, 100]}
grid_search(params=params, random=True)

In [None]:
# Colsample 
params={'colsample_bytree':[0.5, 0.6, 0.7, 0.8, 0.9, 1], 'max_depth':[1], 'n_estimators':[50]}
grid_search(params=params)

In [None]:
# colsample_bylevel
params={'colsample_bylevel':[0.5, 0.6, 0.7, 0.8, 0.9, 1],'max_depth':[1], 'n_estimators':[50]}
grid_search(params=params)

In [None]:
# gamma
params={'gamma':[0, 0.01, 0.05, 0.1, 0.5, 1, 2, 3], 
        'colsample_bylevel':[0.9], 'colsample_bytree':[0.8], 
        'colsample_bynode':[0.5], 'max_depth':[1], 'n_estimators':[25]}
grid_search(params=params)

## Discovering Expplanets with XGBoost

In [None]:
df = pd.read_csv('data/exoplanets.csv', nrows=400)
df.head()

df['LABEL'].value_counts()


In [None]:
target_col = 'LABEL'
main_cols_expo = df.columns.difference([target_col])

X = df[main_cols_expo].squeeze()
y = df[target_col].squeeze()

In [None]:
def light_plot(index):
    y_vals = X.iloc[index]
    x_vals = np.arange(len(y_vals))
    plt.figure(figsize=(15,8))
    plt.xlabel('Number of Observations')
    plt.ylabel('Light Flux')
    plt.title('Light Plot ' + str(index), size=15)
    plt.plot(x_vals, y_vals)
    plt.show()

light_plot(0)
light_plot(1)
light_plot(37)

In [None]:
# data prep. 
df.info()
df.isnull().sum().sum()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [None]:
model = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=2, use_label_encoder=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
score = accuracy_score(y_pred, y_test)
print('Score: ' + str(score))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, recall_score

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))
recall_score(y_test, y_pred, pos_label=2)

### Resampling imbalanced data

In [None]:
def xgb_clf(model, nrows):
    df = pd.read_csv('data/exoplanets.csv', nrows=nrows)
    X = df.iloc[:, 1:]
    y = df.iloc[:, 0]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = recall_score(y_test, y_pred, pos_label=2)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    return score

xgb_clf(XGBClassifier(random_state=2), nrows=1000)


In [None]:
xgb_clf(XGBClassifier(random_state=2), nrows=200)

xgb_clf(XGBClassifier(random_state=2), nrows=74)

In [None]:
## oversampling
df_train = pd.merge(y_train, X_train, left_index=True, right_index=True)
df_train.head()

In [None]:
new_df=pd.DataFrame(np.repeat(df_train[df_train['LABEL'] == 2].values, 9, axis=0))
new_df.columns = df_train.columns

df_train_resample = pd.concat([df_train, new_df])
df_train_resample['LABEL'].value_counts()

In [None]:
X_train_resample = df_train_resample.iloc[:,1:]
y_train_resample = df_train_resample.iloc[:,0]

model = XGBClassifier(random_state=2)
model.fit(X_train_resample, y_train_resample)

y_pred = model.predict(X_test)
score = recall_score(y_test, y_pred, pos_label=2)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(score)

In [None]:
### Tuning and scaling XGBClassifier

### Adjusting weights

df['LABEL'] = df['LABEL'].replace(1, 0)  # re-assign 1 to 0
df['LABEL'] = df['LABEL'].replace(2, 1)  # re-assign 2 to 1

df['LABEL'].value_counts()


In [None]:
# scale_pos_weight
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

model = XGBClassifier(scale_pos_weight=10, random_state=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = recall_score(y_test, y_pred)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(score)

The oversampling method that we implemented from scratch gives the same predictions as XGBClassifier with scale_pos_weight.


### Tuning XGBClassifier

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_val_score

kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=2)
model = XGBClassifier(scale_pos_weight=10, random_state=2)

scores = cross_val_score(model, X, y, cv=kfold, scoring='recall')
print('Recall: ', scores)
print('Recall mean: ', scores.mean())


In [None]:
def grid_search_recall(params: dict, random: bool =False, X=X, y=y, 
                       model=XGBClassifier(booster='gbtree',
                                           objective='binary:logistic', 
                                           random_state=2, use_label_encoder =False),
                       kfold=kfold) -> dict:
    xgb = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=2, use_label_encoder =False)
    
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2) 
    if random:
        grid = RandomizedSearchCV(xgb, params, cv=kfold, n_iter=20, n_jobs=-1, random_state=2, scoring='recall')
        
    else:
        grid = GridSearchCV(xgb, params, cv=kfold, n_jobs=-1, scoring='recall')
    
    grid.fit(X, y)
    best_params = grid.best_params_
    print("Best params:", best_params)
    best_score = grid.best_score_
    print("Best score: {:.3f}".format(best_score))
    return best_params

In [None]:
grid_search_recall(params={'n_estimators':[50, 200, 400, 800]})
grid_search_recall(params={'learning_rate':[0.01, 0.05, 0.2, 0.3]})
grid_search_recall(params={'max_depth':[1, 2, 4, 8]})
grid_search_recall(params={'subsample':[0.3, 0.5, 0.7, 0.9]})
grid_search_recall(params={'gamma':[0.05, 0.1, 0.5, 1]})

grid_search_recall(params={'learning_rate':[0.001, 0.01, 0.03], 'max_depth':[1, 2], 'gamma':[0.025, 0.05, 0.5]})

In [None]:
grid_search_recall(params={'max_delta_step':[1, 3, 5, 7]})

In [None]:
grid_search_recall(params={'subsample':[0.3, 0.5, 0.7, 0.9, 1],
'colsample_bylevel':[0.3, 0.5, 0.7, 0.9, 1],
'colsample_bynode':[0.3, 0.5, 0.7, 0.9, 1],
'colsample_bytree':[0.3, 0.5, 0.7, 0.9, 1]}, random=True);

### Fine-tuning all data

In [None]:
df_all = pd.read_csv('data/exoplanets.csv')
df_all.head()

In [None]:
# replacing 1s with 0s and 2s with 1s
df_all['LABEL'] = df_all['LABEL'].replace({1:0, 2:1})


In [None]:
X_all = df_all.iloc[:, 1:]
y_all = df_all.iloc[:, 0]

df_all['LABEL'].value_counts()

In [None]:
weight = int(df_all['LABEL'].value_counts()[0]/df_all['LABEL'].value_counts()[1])

model = XGBClassifier(scale_pos_weight=weight, random_state=2, use_label_encoder=False)
scores = cross_val_score(model, X_all, y_all, cv=kfold, scoring='recall')

print('Recall:', scores)
print('Recall mean:', scores.mean())

In [None]:
grid_search_recall(params={'learning_rate': [0.001, 0.01]},
                   X=X_all, y=y_all,
                   model=XGBClassifier(scale_pos_weight=weight, random_state=2))


In [None]:
grid_search_recall(params={'max_depth':[1, 2],'learning_rate': [0.001]},
                   X=X_all, y=y_all,
                   model=XGBClassifier(scale_pos_weight=weight, random_state=2))


In [None]:
def final_model(X, y, model):
    model.fit(X, y)
    y_pred = model.predict(X_all)
    score = recall_score(y_all, y_pred,)
    print(score)
    print(confusion_matrix(y_all, y_pred,))
    print(classification_report(y_all, y_pred))
    

In [None]:
# final_model(X_short, y_short, XGBClassifier(max_depth=2, colsample_by_node=0.5, random_state=2))
final_model(X, y, XGBClassifier(max_depth=2, colsample_bynode=0.5, scale_pos_weight=10, random_state=2))

final_model(X_all, y_all, XGBClassifier(max_depth=2, colsample_bynode=0.5, scale_pos_weight=weight, random_state=2))

### Advanced XGBoost

In [None]:
import pandas as pd
import numpy as np 
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn import datasets

from xgboost import XGBRegressor, XGBClassifier, XGBRFRegressor, XGBRFClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

#### Applying gblinear via Diabetes datasets

In [None]:
X, y = datasets.load_diabetes(return_X_y=True)

kfold = KFold(n_splits=5, shuffle=True, random_state=2)

In [None]:
def regression_model(model, X=X, y=y, kfold=kfold):
    scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kfold)
    rmse = (-scores)**0.5
    print(f"{model.__class__.__name__} has RMSE: {rmse.mean()} ")
    return rmse.mean()



In [None]:
regression_model(XGBRegressor(booster='gblinear'))
regression_model(LinearRegression())
regression_model(Lasso())
regression_model(Ridge())
regression_model(XGBRegressor(booster='gbtree'))


In [None]:
# gblinear hpyerparameters

def grid_search_gb(params, X=X, y=y, reg=XGBRegressor(booster='gblinear'),kfold=kfold):
    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=kfold)
    grid_reg.fit(X, y)
    best_params = grid_reg.best_params_
    best_score = np.sqrt(-grid_reg.best_score_)
    print("Best params:", best_params)
    print("Best score:", best_score)

In [None]:
grid_search_gb(params={'reg_alpha':[0.001, 0.01, 0.1, 0.5, 1, 5]})
grid_search_gb(params={'reg_lambda':[0.001, 0.01, 0.1, 0.5, 1, 5]})

In [None]:
grid_search_gb(params={'feature_selector':['shuffle']})
grid_search_gb(params={'feature_selector':['random', 'greedy', 'thrifty'], 'updater':['coord_descent'] })
grid_search_gb(params={'feature_selector':['greedy', 'thrifty'], 'updater':['coord_descent'], 'top_k':[3, 5, 7, 9]})

In [None]:
# Linear datasets 
X = np.arange(1,100)
np.random.seed(2) 
y = []
for i in X:
    y.append(i*np.random.uniform(-0.2, 0.2))
y = np.array(y)
X = X.reshape(X.shape[0], 1)
y = y.reshape(y.shape[0], 1)

In [None]:
regression_model(XGBRegressor(booster='gblinear', objective='reg:squarederror'))

regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror'))

regression_model(LinearRegression())

In [None]:
X, y = datasets.load_diabetes(return_X_y=True)
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror'))

In [None]:
# trying dart on bigger dataset 
df_census = pd.read_csv('data/census_cleaned.csv')

X_census = df_census.iloc[:, :-1]
y_census = df_census.iloc[:, -1]


In [None]:
def classification_model(model, X=X, y=y,kfold=kfold):
    scores = cross_val_score(model, X_census, y_census, scoring='accuracy', cv=kfold)
    print(f"{model.__class__.__name__} has RMSE: {scores.mean()} ")
    return scores.mean()

In [None]:
classification_model(XGBClassifier(booster='gbtree', use_label_encoder=False))

In [None]:
classification_model(XGBClassifier(booster='dart', use_label_encoder=False))

In [None]:
classification_model(XGBClassifier(booster='gblinear', use_label_encoder=False))

In [None]:
# DART hyperparameters
classification_model(XGBClassifier(booster='dart', one_drop=1, use_label_encoder=False))

In [None]:
classification_model(XGBClassifier(booster='dart', objective='reg:squarederror',
                                   sample_type='weighted', use_label_encoder=False))

In [None]:
classification_model(XGBClassifier(booster='dart', objective='reg:squarederror',
                                   normalize_type='forest', use_label_encoder=False))

In [None]:
classification_model(XGBClassifier(booster='dart', objective='reg:squarederror',
                                   one_drop=1, use_label_encoder=False))

In [None]:
grid_search_gb(params={'rate_drop':[0.01, 0.1, 0.2, 0.4]},
               reg=XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=1))

grid_search_gb(params={'skip_drop':[0.01, 0.1, 0.2, 0.4]},
               reg=XGBRegressor(booster='dart', objective='reg:squarederror'))

In [None]:
# XGBoost random forest 
# There are two strategies to implement random forests within XGBoost. The first is to use random 
# forests as the base learner, the second is to use XGBoost's original random forests, XGBRFRegressor and XGBRFClassifier.
# We start with our original theme, random forests as alternative base learners.

# RF as base learners. Option I
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=25))
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=5))



In [None]:
# RF as xgboost models. Option II
regression_model(XGBRFRegressor(objective='reg:squarederror'))
regression_model(RandomForestRegressor())

In [None]:
classification_model(XGBRFClassifier(use_label_encoder=False))
classification_model(RandomForestClassifier())

### Kaggle Masters

Here is a general approach for validating and testing machine learning models on your own:

* Split data into a training set and a hold-out set: Keep the hold-out set away and resist the temptation to look at it.

* Split the training set into a training and test set or use cross-validation: Fit new models on the training set and validate the model, going back and forth to improve scores.

* After obtaining a final model, test it on the hold-out set: This is the real test of the model. If the score is below expectations, return to step 2 and repeat. Do not—and this is important—use the hold-out set as the new validation set, going back and forth adjusting hyperparameters. When this happens, the model is adjusting itself to match the hold-out set, which defeats the purpose of a hold-out set in the first place.


#### Engineering new columns

Feature engineering is the process of developing new columns of data from the original columns. The question is not whether you should implement feature engineering, but how much feature engineering you should implement.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
import datetime as dt
from category_encoders import target_encoder


import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('data/cab_rides.csv', nrows=10000)
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   distance          10000 non-null  float64
 1   cab_type          10000 non-null  object 
 2   time_stamp        10000 non-null  int64  
 3   destination       10000 non-null  object 
 4   source            10000 non-null  object 
 5   price             9227 non-null   float64
 6   surge_multiplier  10000 non-null  float64
 7   id                10000 non-null  object 
 8   product_id        10000 non-null  object 
 9   name              10000 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 781.4+ KB


In [4]:
df[df.isna().any(axis=1)]

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name
18,1.11,Uber,1543673584211,West End,North End,,1.0,fa5fb705-03a0-4eb9-82d9-7fe80872f754,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
31,2.48,Uber,1543794776318,South Station,Beacon Hill,,1.0,eee70d94-6706-4b95-a8ce-0e34f0fa8f37,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
40,2.94,Uber,1543523885298,Fenway,North Station,,1.0,7f47ff53-7cf2-4a6a-8049-83c90e042593,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
60,1.16,Uber,1544731816318,West End,North End,,1.0,43abdbe4-ab9e-4f39-afdc-31cfa375dc25,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
69,2.67,Uber,1543583283653,Beacon Hill,North End,,1.0,80db1c49-9d51-4575-a4f4-1ec23b4d3e31,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
...,...,...,...,...,...,...,...,...,...,...
9949,1.08,Uber,1543272429665,North End,North Station,,1.0,74fffcba-da67-42d1-b585-13d546a125be,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
9953,2.46,Uber,1545045010035,Beacon Hill,Fenway,,1.0,18c2e91d-d594-4a22-9be7-0a5829efa4bf,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
9965,2.58,Uber,1544815809335,Beacon Hill,South Station,,1.0,77adadfb-4ac7-4cdf-aeab-6c4cfe8f7b26,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
9985,1.89,Uber,1544695512211,Beacon Hill,North End,,1.0,f2dfa974-f9d1-4e90-a0e6-77f7eea16956,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi


In [5]:
df.dropna(inplace=True)

df.isna().sum().sum()

0

In [6]:
df['date'] = pd.to_datetime(df['time_stamp']*(10**6))

df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,2018-12-16 09:30:07.890
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,2018-11-27 02:00:23.677
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,2018-11-28 01:00:22.198
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,2018-11-30 04:53:02.749
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,2018-11-29 03:49:20.223


In [7]:
df['month'] = df['date'].dt.month
df['hour'] = df['date'].dt.hour
df['dayofweek'] = df['date'].dt.dayofweek

df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date,month,hour,dayofweek
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,2018-12-16 09:30:07.890,12,9,6
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,2018-11-27 02:00:23.677,11,2,1
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,2018-11-28 01:00:22.198,11,1,2
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,2018-11-30 04:53:02.749,11,4,4
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,2018-11-29 03:49:20.223,11,3,3


In [8]:
def weekend(row):
    if row['dayofweek'] in [5, 6]:
        return 1
    return 0

def rush_hour(row):
    if (row['hour'] in [6,7,8,9,15,16,17,18]) & (row['weekend'] == 0):
        return 1
    return 0


df['weekend'] = df.apply(weekend, axis=1)
df['rush_hour'] = df.apply(rush_hour, axis=1)
df.tail()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date,month,hour,dayofweek,weekend
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,2018-12-16 09:30:07.890,12,9,6,1
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,2018-11-27 02:00:23.677,11,2,1,0
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,2018-11-28 01:00:22.198,11,1,2,0
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,2018-11-30 04:53:02.749,11,4,4,0
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,2018-11-29 03:49:20.223,11,3,3,0


In [10]:
df.cab_type.value_counts()

Uber    4654
Lyft    4573
Name: cab_type, dtype: int64

In [11]:
# grouping by cab_type 
df['cab_freq'] = df.groupby('cab_type')['cab_type'].transform('count')
df['cab_freq'] = df['cab_freq']/len(df)

encoder = target_encoder.TargetEncoder()
df['cab_type_mean'] = encoder.fit_transform(df['cab_type'], df['price'])
df.tail()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date,month,hour,dayofweek,weekend,rush_hour,cab_freq
9995,3.05,Uber,1543504379037,Fenway,North Station,11.5,1.0,934d2fbe-f978-4495-9786-da7b4dd21107,997acbb5-e102-41e1-b155-9df7de0a73f2,UberPool,2018-11-29 15:12:59.037,11,15,3,0,1,0.504389
9996,3.05,Uber,1543800477997,Fenway,North Station,26.0,1.0,af8fd57c-fe7c-4584-bd1f-beef1a53ad42,6c84fd89-3f11-4782-9b50-97c468b19529,Black,2018-12-03 01:27:57.997,12,1,0,0,0,0.504389
9997,3.05,Uber,1543407083241,Fenway,North Station,19.5,1.0,b3c5db97-554b-47bf-908b-3ac880e86103,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL,2018-11-28 12:11:23.241,11,12,2,0,0,0.504389
9998,3.05,Uber,1544896813623,Fenway,North Station,36.5,1.0,fcb35184-9047-43f7-8909-f62a7b17b6cf,6d318bcc-22a3-4af6-bddd-b409bfce1546,Black SUV,2018-12-15 18:00:13.623,12,18,5,1,0,0.504389
9999,2.03,Lyft,1543812781166,Theatre District,Northeastern University,7.0,1.0,7f0e8caf-e057-41eb-bdef-27eb14c88122,lyft_line,Shared,2018-12-03 04:53:01.166,12,4,0,0,0,0.495611


In [23]:
# Range of models
from sklearn.datasets import load_breast_cancer

def classification_model(model):
    scores = cross_val_score(model, X, y, cv=kfold)
    return scores.mean()

X, y = load_breast_cancer(return_X_y=True)
kfold = StratifiedKFold(n_splits=5)

classification_model(XGBClassifier(verbose=0, silent=True))
classification_model(XGBClassifier(booster='gblinear', silent=True))
classification_model(XGBClassifier(booster='dart', one_drop=True, silent=True))
classification_model(RandomForestClassifier(random_state=2))
classification_model(LogisticRegression(max_iter=10000))
classification_model(XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1, silent=True))

Parameters: { "silent", "verbose" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent", "verbose" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent", "verbose" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent", "verbose" } might not be used.

  This may not be accurate due to some parameters are only 

0.9771619313771154

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

0.9631268436578171

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

0.9683744760130415

0.9666356155876418

0.9507995652848935

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

0.9701133364384411

In [25]:
# Correlation in ML enembles

def y_pred(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_pred, y_test)
    print(score)
    return y_pred

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
y_pred_gbtree = y_pred(XGBClassifier())

0.951048951048951


In [28]:
y_pred_dart = y_pred(XGBClassifier(booster='dart', one_drop=True))
y_pred_forest = y_pred(RandomForestClassifier())
y_pred_logistic = y_pred(LogisticRegression(max_iter=10000))
y_pred_xgb = y_pred(XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1))

0.951048951048951
0.9370629370629371
0.9370629370629371
0.965034965034965


In [29]:
df_pred = pd.DataFrame(data= np.c_[y_pred_gbtree, y_pred_dart, y_pred_forest, y_pred_logistic, y_pred_xgb],
                       columns=['gbtree', 'dart','forest', 'logistic', 'xgb'])
df_pred.corr()

Unnamed: 0,gbtree,dart,forest,logistic,xgb
gbtree,1.0,0.971146,0.913384,0.914111,0.971146
dart,0.971146,1.0,0.942396,0.914111,0.971146
forest,0.913384,0.942396,1.0,0.941715,0.913384
logistic,0.914111,0.914111,0.941715,1.0,0.914111
xgb,0.971146,0.971146,0.913384,0.914111,1.0


In [31]:
# The VotingClassifier ensemble

estimators = []

logistic_model = LogisticRegression(max_iter=10000)
estimators.append(('logistic', logistic_model))

xgb_model = XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1)
estimators.append(('xgb', xgb_model))

rf_model = RandomForestClassifier(random_state=2)
estimators.append(('rf', rf_model))

ensemble = VotingClassifier(estimators)

scores = cross_val_score(ensemble, X, y, cv=kfold)
print(scores.mean())

0.9771619313771154


In [32]:
# stacking using sklearn
base_models = []

base_models.append(('lr', LogisticRegression()))
base_models.append(('xgb', XGBClassifier()))
base_models.append(('rf', RandomForestClassifier(random_state=2)))

meta_model = LogisticRegression()

clf = StackingClassifier(estimators=base_models, final_estimator=meta_model)
scores = cross_val_score(clf, X, y, cv=kfold)
print(scores.mean())

0.9789318428815401


### XGBoost Model Deployment

In [30]:
import pandas as pd 
import warnings
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error as MSE
from xgboost import XGBRegressor

from scipy.sparse import csr_matrix
from scipy.sparse import hstack

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data/student-por.csv', sep=';')
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,,18.0,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15.0,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15.0,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16.0,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [3]:
df.info()
df.isnull().sum().sum()

df[df.isna().any(axis=1)]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   school      649 non-null    object 
 1   sex         648 non-null    object 
 2   age         648 non-null    float64
 3   address     649 non-null    object 
 4   famsize     649 non-null    object 
 5   Pstatus     649 non-null    object 
 6   Medu        649 non-null    int64  
 7   Fedu        649 non-null    int64  
 8   Mjob        649 non-null    object 
 9   Fjob        649 non-null    object 
 10  reason      649 non-null    object 
 11  guardian    648 non-null    object 
 12  traveltime  649 non-null    int64  
 13  studytime   649 non-null    int64  
 14  failures    649 non-null    int64  
 15  schoolsup   649 non-null    object 
 16  famsup      649 non-null    object 
 17  paid        649 non-null    object 
 18  activities  649 non-null    object 
 19  nursery     649 non-null    o

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,,18.0,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11


In [4]:
# filling na's
df['age'].fillna(-999.0)
df['sex'] = df['sex'].fillna(df['sex'].mode())
df['guardian'] = df['guardian'].fillna(df['guardian'].mode())

df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18.0,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15.0,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15.0,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16.0,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [5]:
# one hot encoding 
cat_cols = df.columns[df.dtypes == object].tolist()
num_cols = df.columns.difference(cat_cols).tolist()

ohe = OneHotEncoder()
hot = ohe.fit_transform(df[cat_cols])
hot_df = pd.DataFrame(hot.toarray())

hot_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0


In [6]:
cold_df = df.select_dtypes(exclude=['object'])
cold_df.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,18.0,4,4,2,2,0,4,3,4,1,1,3,4,0,11,11
1,,1,1,1,2,0,5,3,3,1,1,3,2,9,11,11
2,15.0,1,1,1,2,0,4,3,2,2,3,3,6,12,13,12
3,15.0,4,2,1,3,0,3,2,2,1,1,5,0,14,14,14
4,16.0,3,3,1,2,0,4,3,2,1,2,5,0,11,13,13


In [7]:
cold_df[num_cols].head()

Unnamed: 0,Dalc,Fedu,G1,G2,G3,Medu,Walc,absences,age,failures,famrel,freetime,goout,health,studytime,traveltime
0,1,4,0,11,11,4,1,4,18.0,0,4,3,4,3,2,2
1,1,1,9,11,11,1,1,2,,0,5,3,3,3,2,1
2,2,1,12,13,12,1,3,6,15.0,0,4,3,2,3,2,1
3,1,2,14,14,14,4,1,0,15.0,0,3,2,2,5,3,1
4,1,3,11,13,13,3,2,0,16.0,0,4,3,2,5,2,1


In [8]:
cold = csr_matrix(cold_df)
final_sparse_matrix = hstack((hot, cold))

final_df = pd.DataFrame(final_sparse_matrix.toarray())
final_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,4.0,3.0,4.0,1.0,1.0,3.0,4.0,0.0,11.0,11.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,5.0,3.0,3.0,1.0,1.0,3.0,2.0,9.0,11.0,11.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,4.0,3.0,2.0,2.0,3.0,3.0,6.0,12.0,13.0,12.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,3.0,2.0,2.0,1.0,1.0,5.0,0.0,14.0,14.0,14.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,4.0,3.0,2.0,1.0,2.0,5.0,0.0,11.0,13.0,13.0


In [9]:
# custom transformers 
class NullValueImputer(TransformerMixin):
    def __init__(self) -> None:
        None
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        for column in X.columns.tolist():
            if column in X.columns[X.dtypes==object].tolist():
                X[column] = X[column].fillna(X[column].mode())
            else:
                X[column]=X[column].fillna(-999.0)
        return X

class SparseMatrix(TransformerMixin):
    def __init__(self) -> None:
        None
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        cat_cols = X.columns[X.dtypes==object].tolist()
        hot = OneHotEncoder().fit_transform(X[cat_cols])
        cold = csr_matrix(X.select_dtypes(exclude=["object"]))
        final_sparse_matrix = hstack((hot, cold))
        final_csr_matrix = final_sparse_matrix.tocsr()
        final_df = pd.DataFrame(final_csr_matrix.toarray())
        return final_df


In [10]:
df = pd.read_csv('data/student-por.csv', sep=';')
nvi = NullValueImputer().fit_transform(df)
nvi.head()

sm = SparseMatrix().fit_transform(df)
sm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,4.0,3.0,4.0,1.0,1.0,3.0,4.0,0.0,11.0,11.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,5.0,3.0,3.0,1.0,1.0,3.0,2.0,9.0,11.0,11.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,4.0,3.0,2.0,2.0,3.0,3.0,6.0,12.0,13.0,12.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,3.0,2.0,2.0,1.0,1.0,5.0,0.0,14.0,14.0,14.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,4.0,3.0,2.0,1.0,2.0,5.0,0.0,11.0,13.0,13.0


In [11]:
df = pd.read_csv('data/student-por.csv', sep=';')

y = df.iloc[:, -1]
X = df.iloc[:, :-3]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

data_pipeline = Pipeline([('null_imputer', NullValueImputer()), ('sparse', SparseMatrix())])
X_train_transformed = data_pipeline.fit_transform(X_train)

In [12]:
y_train.value_counts()

11    82
10    75
13    58
12    53
14    42
15    36
9     29
16    27
8     26
17    24
18    14
0     10
7      7
5      1
6      1
19     1
Name: G3, dtype: int64

In [13]:
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

def cross_val(model):
    scores = cross_val_score(model, X_train_transformed, y_train, scoring='neg_root_mean_squared_error', cv=kfold)
    rmse = (-scores.mean())
    return rmse

cross_val(XGBRegressor(missing=-999.0))

2.9004041754792746

In [17]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train_transformed, y_train, random_state=2)
def n_estimators(model):
    eval_set = [(X_test_2, y_test_2)]
    eval_metric="rmse"
    model.fit(X_train_2, y_train_2, eval_metric=eval_metric, eval_set=eval_set, early_stopping_rounds=100)
    y_pred = model.predict(X_test_2)
    rmse = MSE(y_test_2, y_pred)**0.5
    return rmse

n_estimators(XGBRegressor(n_estimators=5000, missing=-999.0))

[0]	validation_0-rmse:8.49176
[1]	validation_0-rmse:6.31389
[2]	validation_0-rmse:4.97965
[3]	validation_0-rmse:4.16109
[4]	validation_0-rmse:3.67782
[5]	validation_0-rmse:3.42779
[6]	validation_0-rmse:3.30579
[7]	validation_0-rmse:3.25238
[8]	validation_0-rmse:3.22878
[9]	validation_0-rmse:3.20020
[10]	validation_0-rmse:3.17934
[11]	validation_0-rmse:3.16766
[12]	validation_0-rmse:3.15062
[13]	validation_0-rmse:3.13508
[14]	validation_0-rmse:3.14204
[15]	validation_0-rmse:3.13769
[16]	validation_0-rmse:3.15551
[17]	validation_0-rmse:3.15064
[18]	validation_0-rmse:3.14732
[19]	validation_0-rmse:3.14887
[20]	validation_0-rmse:3.14607
[21]	validation_0-rmse:3.14591
[22]	validation_0-rmse:3.14349
[23]	validation_0-rmse:3.14303
[24]	validation_0-rmse:3.14024
[25]	validation_0-rmse:3.14376
[26]	validation_0-rmse:3.14765
[27]	validation_0-rmse:3.14520
[28]	validation_0-rmse:3.13969
[29]	validation_0-rmse:3.14365
[30]	validation_0-rmse:3.13755
[31]	validation_0-rmse:3.14122
[32]	validation_0-

3.125373597402936

In [19]:
def grid_search(params, reg=XGBRegressor(missing=-999.0)):
    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=kfold)
    grid_reg.fit(X_train_transformed, y_train)
    best_params = grid_reg.best_params_
    print("Best params:", best_params)
    best_score = (-grid_reg.best_score_) ** 0.5
    print("Best score:", best_score)

grid_search(params={'max_depth':[1, 2, 3, 4, 6, 7, 8],
                    'n_estimators':[31]})

Best params: {'max_depth': 1, 'n_estimators': 31}
Best score: 2.6632597695120817


In [20]:
grid_search(params={'max_depth':[1, 2, 3],
                    'min_child_weight':[1,2,3,4,5],
                    'n_estimators':[31]})

Best params: {'max_depth': 1, 'min_child_weight': 1, 'n_estimators': 31}
Best score: 2.6632597695120817


In [21]:
grid_search(params={'max_depth':[2],
                    'min_child_weight':[2,3],
                    'subsample':[0.5, 0.6, 0.7, 0.8, 0.9],
                    'n_estimators':[31, 50]})

Best params: {'max_depth': 2, 'min_child_weight': 3, 'n_estimators': 31, 'subsample': 0.8}
Best score: 2.680514895351618


In [22]:
grid_search(params={'max_depth':[1],
                    'min_child_weight':[3],
                    'subsample':[.8],
                    'colsample_bytree':[0.9],
                    'colsample_bylevel':[0.6, 0.7, 0.8, 0.9, 1],
                    'colsample_bynode':[0.6, 0.7, 0.8, 0.9, 1],
                    'n_estimators':[50]})

Best params: {'colsample_bylevel': 0.7, 'colsample_bynode': 0.7, 'colsample_bytree': 0.9, 'max_depth': 1, 'min_child_weight': 3, 'n_estimators': 50, 'subsample': 0.8}
Best score: 2.6475633466646373


In [23]:
X_test_transformed = data_pipeline.fit_transform(X_test)
model = XGBRegressor(max_depth=2, min_child_weight=3, subsample=0.9, colsample_bytree=0.8, gamma=2, missing=-999.0)
model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_test_transformed)
rmse = MSE(y_pred, y_test)**0.5
rmse


2.907554963248978

In [24]:
model = XGBRegressor(max_depth=1,
                       min_child_weight=5,
                       subsample=0.6,
                       colsample_bytree=0.9,
                       colsample_bylevel=0.9,
                       colsample_bynode=0.8,
                       n_estimators=50,
                       missing=-999.0)

model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_test_transformed)
rmse = MSE(y_pred, y_test)**0.5
rmse

2.7911458213401037

In [26]:
full_pipeline = Pipeline([('null_imputer', NullValueImputer()),  ('sparse', SparseMatrix()),
                          ('xgb', XGBRegressor(max_depth=1, min_child_weight=5, subsample=0.6,
                                               colsample_bytree=0.9, colsample_bylevel=0.9, colsample_bynode=0.8,
                                               missing=-999.0))])
full_pipeline.fit(X, y)

Pipeline(steps=[('null_imputer',
                 <__main__.NullValueImputer object at 0x7fd7a6877af0>),
                ('sparse', <__main__.SparseMatrix object at 0x7fd7a68775b0>),
                ('xgb',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=0.9, colsample_bynode=0.8,
                              colsample_bytree=0.9, gamma=0, gpu_id=-1,
                              importance_type='gain',
                              interaction_constraints='',
                              learning_rate=0.300000012, max_delta_step=0,
                              max_depth=1, min_child_weight=5, missing=-999.0,
                              monotone_constraints='()', n_estimators=100,
                              n_jobs=4, num_parallel_tree=1, random_state=0,
                              reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                              subsample=0.6, tree_method='exact',
                     

In [31]:
new_data = X_test
full_pipeline.predict(new_data)
np.round(full_pipeline.predict(new_data))

array([13.,  8., 11., 14., 13., 12., 14., 12., 11., 14., 13.,  9., 13.,
       13., 14.,  8., 10., 10., 15., 10., 12., 13.,  7., 13.,  7.,  9.,
       10., 13., 14., 12., 11., 12., 15.,  9., 11., 13., 12., 11.,  8.,
       13., 11., 11., 12., 13., 13., 15., 13., 13., 13., 12., 14.,  6.,
        6., 12., 15.,  9., 13.,  9., 14., 12., 13.,  7.,  9., 12., 14.,
       11., 14., 14., 12., 11., 12., 13., 13.,  7., 14., 12., 13., 15.,
       13., 10., 13.,  7., 11., 12., 13., 10.,  9., 13., 15., 15., 11.,
       10., 14., 12., 13., 14., 12.,  8., 13., 15., 13.,  9., 12., 12.,
       13., 14., 13., 10., 10., 14.,  7., 11., 13., 11., 14., 11., 12.,
       11., 12., 12., 14.,  8., 13., 11., 14., 12., 15., 15., 12., 14.,
       10., 14.,  9.,  9., 12., 13., 10., 12., 14., 13., 10., 13., 13.,
       13., 13., 11., 12., 13., 14., 12.,  8., 10., 12.,  8.,  8., 13.,
       14., 13., 13., 11., 12., 13.,  9.], dtype=float32)

In [34]:
new_df = pd.read_csv('data/student-por.csv')
new_X = df.iloc[:, :-3]
new_y = df.iloc[:, -1]
new_model = full_pipeline.fit(new_X, new_y)

more_new_data = X_test[:25]
np.round(new_model.predict(more_new_data))

array([13.,  8., 11., 14., 13., 12., 14., 12., 11., 14., 13.,  9., 13.,
       13., 14.,  8., 10., 10., 15., 10., 12., 13.,  7., 13.,  7.],
      dtype=float32)

In [35]:
single_row = X_test[:1]
single_row_plus = pd.concat([single_row, X_test[:25]])
print(np.round(new_model.predict(single_row_plus))[:1])

[13.]


In [None]:
from typing import Tuple
import xgboost as xgb

def f1_eval(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    y = dtrain.get_label()

    # convert the predicted values from {predt E R | 0<predt<1} to {0, 1} with a threshold of 0.5
    # all values less than 0.5 would be converted to 0 (False) and
    # all values equal or greater than 0.5 would be converted to 1 (True)
    predt_binary = np.where(predt > 0.5, 1, 0)
    return "F1_score", f1_score(y_true=y, y_pred=predt_binary)


dtrain = xgb.DMatrix(X_train_os_smote, label=y_train_os_smote)
dtest = xgb.DMatrix(X_test_os_smote, label=y_test_os_smote)


def objective(trial):
    # parameters for hypertunning 
    params = {"booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
              'n_estimators': trial.suggest_int('n_estimators',400, 600),
              'max_depth': trial.suggest_int('max_depth', 10, 20),
              "eval_metric": "auc",
              'learning_rate': trial.suggest_uniform('learning_rate', 0.01, .1),
              'subsample' : trial.suggest_uniform('subsample', 0.50, 1),
              'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.50, 1),
              'gamma': trial.suggest_int('gamma', 0, 10),
              "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
              "sample_type": trial.suggest_categorical("sample_type", ["uniform", "weighted"]),
              'objective': 'binary:logistic',}
              #'tree_method': 'gpu_hist',}
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    best = xgb.train(params, dtrain, evals=[(dtest, "validation")], callbacks=[pruning_callback], feval=f1_eval)
    best.save_model('xgb_zindi.model')
    preds = np.rint(best.predict(dtest))
    f1_score_ = f1_score(y_test_os_smote, preds)
    return f1_score_


study = optuna.create_study()
study.optimize(objective,n_trials=100)

print(study.best_trial.params)

best_params = study.best_trial.params
best_params['objective'] = 'binary:logistic'
best_params['missing'] = -999
best_params['random_state'] = 2020
#best_params['tree_method'] = 'gpu_hist'

In [None]:
dtrain = lgb.Dataset(X_train_os_smote, label=y_train_os_smote)

def f1_eval_(predt: np.ndarray, dtrain: lgb.Dataset) -> Tuple[str, float]:
    y = dtrain.get_label()

    # convert the predicted values from {predt E R | 0<predt<1} to {0, 1} with a threshold of 0.5
    # all values less than 0.5 would be converted to 0 (False) and
    # all values equal or greater than 0.5 would be converted to 1 (True)
    predt_binary = np.where(predt > 0.5, 1, 0)
    return "F1_score", f1_score(y_true=y, y_pred=predt_binary)



def objective(trial):

    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "feature_pre_filter": False,
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    gbm = lgb.train(param, dtrain, feval=f1_eval_,)
    preds = np.rint(gbm.predict(X_test_os_smote))
    accuracy = accuracy_score(y_test_os_smote, preds,)
    f1_score_ = f1_score(y_test_os_smote, preds)
    f1_score(y_test_os_smote, preds)
    return f1_score_

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

best_params_ = study.best_trial.params