In [1]:
# Standard library
from time import time

# Basic libraries
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint

#Regression Techniques
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

#Classfication Techniques
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

#Ensemble Techniques
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

import xgboost as xgb

# Model Evaluation
from sklearn import preprocessing
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import recall_score
from sklearn.metrics import balanced_accuracy_score #average aaccuracy of each class - imbalanced
from sklearn.metrics import roc_auc_score

from sklearn.decomposition import PCA

#Over Sampling
from imblearn.over_sampling import SMOTE

#UnderSampling

from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import RUSBoostClassifier

# Model Selection
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
import statsmodels.api as sm
from statsmodels.api import add_constant

#Data Preprocessing

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import binarize

# Data Visualisation

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
## Simple Linear Regression, Ridge and Lasso
#
# Fits the three linear models on the same train split and prints each
# model's score on the test set and then on the train set, in the order
# linear -> ridge -> lasso. The coefficients are printed afterwards so the
# effect of the two penalties can be compared side by side.
# LinearRegression comes from the notebook's import cell.

lr = LinearRegression()
ridge = Ridge(alpha=1)
lasso = Lasso(alpha=1)

# One loop instead of three copy-pasted stanzas; the printed output is the
# same sequence of 'Test' / 'Train' lines as before.
for regressor in (lr, ridge, lasso):
    regressor.fit(x_train, y_train)
    print('Test', regressor.score(x_test, y_test))
    print('Train', regressor.score(x_train, y_train))

print('Linear Regression Co-efficients:\n', lr.coef_, '\n\n')
print('Ridge Regression Co-efficients:\n', ridge.coef_, '\n\n')
print('Lasso Regression Co-efficients:\n', lasso.coef_, '\n\n')

In [None]:
# Polynomial features
#
# Expand the scaled features with degree-2 interaction terms only
# (interaction_only=True), then hold out 30% of the rows for testing.

from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(interaction_only=True, degree=2)
X_poly = poly.fit_transform(X_scaled)

split_parts = train_test_split(X_poly, y, random_state=1, test_size=0.30)
X_train, X_test, y_train, y_test = split_parts

In [None]:
#Confusion Matrix
#
# Plots the binary confusion matrix for the test predictions and then prints
# the usual derived metrics, each computed by hand and (where sklearn offers
# one) cross-checked against the library function.

cm = metrics.confusion_matrix(y_test, y_pred_class)

# The cell labels below treat cm as [[TN, FP], [FN, TP]], so flattening the
# 2x2 matrix row by row yields the four counts in exactly that order.
# These names were previously used without ever being defined.
TN, FP, FN, TP = cm.ravel()
total = TP + TN + FP + FN

plt.clf()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['Non_diabetic', 'Diabetic']
plt.title('Confusion Matrix - Test Data')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]

# Annotate every cell with its role and its count (x = column, y = row).
for i in range(2):
    for j in range(2):
        plt.text(j, i, str(s[i][j]) + " = " + str(cm[i][j]))
plt.show()

#Accuracy:
print((TP + TN) / total)
print(metrics.accuracy_score(y_test, y_pred_class))

#Error
print((FP + FN) / total)
print(1 - metrics.accuracy_score(y_test, y_pred_class))

#Recall
print(TP / (TP + FN))
print(metrics.recall_score(y_test, y_pred_class))

#Specificity
print(TN / (TN + FP))

#FPR:
print(FP / (TN + FP))

#Precision:
print(TP / (TP + FP))
print(metrics.precision_score(y_test, y_pred_class))

In [None]:
# Changing the decision threshold: predict diabetes when the predicted
# probability is greater than 0.3.
#
# y_pred_prob is wrapped in a list to give binarize() a 2-D input, and [0]
# unwraps the single resulting row again, so the new class labels cover every
# record in y_pred_prob. `threshold` is passed by keyword because current
# scikit-learn releases no longer accept it positionally.
# binarize comes from the notebook's import cell.

y_pred_class = binarize([y_pred_prob], threshold=0.3)[0]

In [None]:
# IMPORTANT: ROC_AUC
#
# Trace the ROC curve of the classifier from the predicted probabilities:
# false positive rate on the x-axis, true positive rate on the y-axis,
# both clipped to [0, 1].

import matplotlib.pyplot as plt

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)

plt.plot(fpr, tpr)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('ROC curve for diabetes classifier')
plt.grid(True)

In [None]:
# Metric
#
# Cross-validated AUC: score `logreg` with ROC-AUC on 10 folds and show the
# mean of the per-fold scores as the cell output.

auc_per_fold = cross_val_score(logreg, X, y, scoring='roc_auc', cv=10)
auc_per_fold.mean()

In [None]:
# Modelling - single (non-ensemble) classifiers
#
# Run after dimensionality reduction (p-value based selection, ...).
# Each classifier is fitted on the train split and its score is printed for
# the train, validation and test splits.

lr = LogisticRegression(random_state=1)   # Logistic Regression
dt = DecisionTreeClassifier(random_state=1)   # Decision Tree
nb = GaussianNB()   # Naive Bayes

# The heading strings carry their own leading newline, so the printed layout
# matches a hand-written block per model.
for heading, estimator in (
    ('Logistic Regression:', lr),
    ('\nDecision Tree:', dt),
    ('\nNaive Bayes:', nb),
):
    print(heading)
    estimator.fit(x_train, y_train)
    print('Training:\t', estimator.score(x_train, y_train))
    print('Validation:\t', estimator.score(x_val, y_val))
    print('Testing:\t', estimator.score(x_test, y_test))

In [None]:
# Bootstrap estimate of the accuracy of `lr` with a confidence interval.
#
# `values` is treated as a 2-D array whose last column is the target and
# whose remaining columns are the features (see the slicing below).
# resample and clone come from the notebook's import cell.

# configure bootstrap
values = np.array(values)
n_iterations = 50                   # Number of bootstrap samples to create
n_size = int(len(values) * 0.50)    # picking only 50 % of the given data in every bootstrap sample
bootstrap_rng = np.random.RandomState(1)   # seeded so the run is reproducible

# run bootstrap
stats = []
for iteration in range(n_iterations):
    # prepare train and test sets
    train = resample(values, n_samples=n_size, random_state=bootstrap_rng)  # Sampling with replacement

    # Test set = rows whose values do not occur anywhere in the sample.
    # The sampled rows are hashed once per iteration instead of rebuilding
    # train.tolist() for every candidate row.
    sampled_rows = set(map(tuple, train.tolist()))
    unseen = np.array([tuple(row) not in sampled_rows for row in values.tolist()], dtype=bool)
    test = values[unseen]

    # fit model on a fresh copy so the notebook-level `lr` is not refitted
    # on a bootstrap sample as a side effect of this cell
    model = clone(lr)
    model.fit(train[:, :-1], train[:, -1])

    # evaluate model
    score = model.score(test[:, :-1], test[:, -1])   # caution, overall accuracy score can mislead when classes are imbalanced
    print(score)
    stats.append(score)

# plot scores
plt.hist(stats)
plt.show()

# confidence intervals
alpha = 0.95                                   # for 95% confidence
lower_pct = ((1.0 - alpha) / 2.0) * 100        # 2.5% left in the lower tail
upper_pct = (alpha + ((1.0 - alpha) / 2.0)) * 100   # 2.5% left in the upper tail
lower = max(0.0, np.percentile(stats, lower_pct))
upper = min(1.0, np.percentile(stats, upper_pct))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

In [None]:
#Modelling - Ensemble
#
# Fits the selected depth-limited decision tree followed by the ensemble
# classifiers on the train split and prints each one's score on the train,
# validation and test splits.

dt = DecisionTreeClassifier(random_state=1, max_depth=5)       # Model that we selected
rf = RandomForestClassifier(random_state=1, n_estimators=50)
bc = BaggingClassifier(random_state=1, n_estimators=50)
ab = AdaBoostClassifier(random_state=1, n_estimators=50)
gb = GradientBoostingClassifier(random_state=1, n_estimators=50)
# XGBClassifier rather than XGBRegressor: every other model here is a
# classifier, and a regressor's .score() is not comparable with theirs.
xg = xgb.XGBClassifier(random_state=1)

# The heading strings carry their own leading newline to separate the blocks.
for heading, estimator in (
    ('\nDecision Tree:', dt),
    ('\nRandom Forest:', rf),
    ('\nBagging Classifier:', bc),
    ('\nAdaBoost Classifier:', ab),
    ('\nGradientBoosting Classifier:', gb),
    ('\nXGBoost:', xg),
):
    print(heading)
    estimator.fit(x_train, y_train)
    print('Training:\t', estimator.score(x_train, y_train))
    print('Validation:\t', estimator.score(x_val, y_val))
    print('Testing:\t', estimator.score(x_test, y_test))

In [None]:
#Model Comparison
#
# Cross-validates each already-configured model with the same 5-fold split,
# prints "name: mean (std)" of the fold accuracies, and draws one box per
# model so the score distributions can be compared.
# KFold comes from the notebook's import cell.

models = [
    ('Logistic Regression', lr),
    ('DecisionTree', dt),
    ('RandomForest', rf),
    ('AdaBoost', ab),
    ('Gradient', gb),
]

results = []
names = []

scoring = 'accuracy'

# shuffle=True is required for random_state to have any effect; current
# scikit-learn raises if random_state is given without it. The splitter is
# built once so every model is scored on identical folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

for name, model in models:
    # NOTE(review): the target is spelled `y` here to match the other cells
    # of this notebook (this cell previously used `Y`) - confirm.
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
#Column Transformer
#
# Scales the two numeric columns and one-hot encodes the four categorical
# columns in a single transformer.
#
# Dense output is requested on the ColumnTransformer itself
# (sparse_threshold=0) instead of through OneHotEncoder(sparse=False): the
# encoder's `sparse` keyword was renamed in newer scikit-learn releases,
# while sparse_threshold is accepted by old and new versions alike.

ct = ColumnTransformer(
    [
        ("scaling", StandardScaler(), ['age', 'hours-per-week']),
        ("OneHotCoding", OneHotEncoder(), ['workclass', 'education', 'gender', 'occupation']),
    ],
    sparse_threshold=0,
)

In [None]:
# Bin discretizer
#
# Learn 10 bins per feature with the 'uniform' strategy, show the learned
# edges, then encode X and compare the first encoded row with the first raw
# row.

from sklearn.preprocessing import KBinsDiscretizer

kb = KBinsDiscretizer(strategy='uniform', n_bins=10)
kb.fit(X)
print("bin edges \n", kb.bin_edges_)

# With the edges learned, transform() maps every data point of X to its bin.
X_binned = kb.transform(X)

first_row_encoded = X_binned.toarray()[0, :]
print(first_row_encoded)
# In the author's run the first data point (-0.752759) landed in bin 4.
print(pd.DataFrame(X).head(1))

In [None]:
# Same binning as before, but with encode='onehot-dense' so the one-hot
# result comes back as a dense matrix rather than a sparse one. Each input
# value belongs to exactly one bin, which is why one-hot coding fits here.

kb = KBinsDiscretizer(encode='onehot-dense', strategy='uniform', n_bins=10)
X_binned = kb.fit(X).transform(X)

In [None]:
#Random SearchCV
#
# Samples `samples` random hyper-parameter combinations for a random forest
# and prints the best one found. sp_randint (scipy.stats.randint) comes from
# the notebook's import cell. Both the forest and the search are seeded so
# the cell gives the same answer on every run.

# build a classifier
clf = RandomForestClassifier(n_estimators=50, random_state=1)

# specify parameters and distributions to sample from
# sp_randint(low, high) draws integers from low up to high - 1.
# NOTE(review): max_features can be drawn as high as 10 - confirm X has at
# least that many columns.
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
samples = 10  # number of random samples
# cv is left at the library default (the number of folds depends on the
# installed scikit-learn version).
randomCV = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=samples, random_state=1)

randomCV.fit(X, y)

print(randomCV.best_params_)

In [None]:
# run grid search
#
# Exhaustive counterpart of the randomized search: every combination in
# param_grid is cross-validated for the `clf` built in the randomized-search
# cell. `time` comes from the notebook's import cell.

# GridSearchCV needs explicit value lists (not distributions).
# NOTE(review): this grid was not defined anywhere in the notebook; the
# values below are a small starting point - adjust to taste.
param_grid = {"max_depth": [3, None],
              "min_samples_split": [2, 5, 10],
              "min_samples_leaf": [1, 5, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
print(grid_search.best_params_)
grid_search.cv_results_['mean_test_score']

In [None]:
# XGBoost's native API consumes its own matrix type for both training and
# testing, so wrap each split (features plus labels) in a DMatrix.

DM_train = xgb.DMatrix(X_train, label=y_train)
DM_test = xgb.DMatrix(X_test, label=y_test)

In [None]:
# XGBoost regressor tuned with a grid search, then evaluated on the test set.
# setting the hyper parameters ... Ref https://xgboost.readthedocs.io/en/latest/python/python_api.html
# mean_squared_error comes from the notebook's import cell.

gbm_param_grid = {
     'colsample_bytree': np.linspace(0.5, 0.9, 5),  # 5 evenly spaced values from .5 to .9
     'n_estimators': [10, 200],
     'max_depth': [10, 15, 20, 25]
}

gbm = xgb.XGBRegressor()
grid_mse = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1)
grid_mse.fit(X_train, y_train)
print("Best parameters found: ", grid_mse.best_params_)
# best_score_ is a negated MSE under this scoring, hence abs() before sqrt.
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

pred = grid_mse.predict(X_test)
print("Root mean square error for test dataset: {}".format(np.round(np.sqrt(mean_squared_error(y_test, pred)), 2)))

# np.ravel accepts arrays and pandas objects alike, unlike the .flatten()
# method, which only exists on numpy arrays.
observed = np.ravel(y_test)
predicted = np.ravel(pred)
test = pd.DataFrame({"prediction": predicted, "observed": observed})

# lowess(endog, exog) returns the sorted exog values in column 0 and the
# smoothed endog values in column 1. Smoothing observed-vs-predicted makes
# column 0 / column 1 line up with the scatter's x (prediction) and
# y (observed) axes; the arguments used to be the other way round, which
# drew the trend line with its axes swapped.
lowess = sm.nonparametric.lowess
z = lowess(observed, predicted)

ax = test.plot(figsize=[14, 8],
               x="prediction", y="observed", kind="scatter", color='darkred')
ax.set_title("Extreme Gradient Boosting: Prediction Vs Test Data", fontsize=18, color="darkgreen")
ax.set_xlabel("Predicted Power Output", fontsize=18)
ax.set_ylabel("Observed Power Output", fontsize=18)
ax.plot(z[:, 0], z[:, 1], color="blue", lw=3)
plt.show()

In [None]:
#For each model
#
# Bootstrap estimate of a decision tree's accuracy. `values` is treated as a
# 2-D array whose last column is the target and whose remaining columns are
# the features (see the slicing below). resample and accuracy_score come
# from the notebook's import cell.

# configure bootstrap
values = np.asarray(values)
n_iterations = 100                  # Number of bootstrap samples to create
# Sized from `values`, the array that is actually being resampled.
n_size = int(len(values) * 0.50)    # picking only 50 % of the given data in every bootstrap sample
bootstrap_rng = np.random.RandomState(1)   # seeded so the run is reproducible

# run bootstrap
stats = []
for iteration in range(n_iterations):
    # prepare train and test sets
    train = resample(values, n_samples=n_size, random_state=bootstrap_rng)  # Sampling with replacement

    # Test set = rows whose values do not occur anywhere in the sample.
    # The sampled rows are hashed once per iteration instead of rebuilding
    # train.tolist() for every candidate row.
    sampled_rows = set(map(tuple, train.tolist()))
    unseen = np.array([tuple(row) not in sampled_rows for row in values.tolist()], dtype=bool)
    test = values[unseen]

    # fit model
    model = DecisionTreeClassifier(random_state=1)
    model.fit(train[:, :-1], train[:, -1])

    # evaluate model
    predictions = model.predict(test[:, :-1])
    score = accuracy_score(test[:, -1], predictions)    # caution, overall accuracy score can mislead when classes are imbalanced
    print(score)
    stats.append(score)


In [None]:
# Histogram of the bootstrap scores collected in `stats`, followed by the
# percentile-based confidence interval of those scores.

# plot scores
# Uses the `plt` alias from the import cell; the bare name `pyplot` is never
# imported in this notebook.
plt.hist(stats)
plt.show()

# confidence intervals
alpha = 0.95                                   # for 95% confidence
lower_pct = ((1.0 - alpha) / 2.0) * 100        # 2.5% left in the lower tail
upper_pct = (alpha + ((1.0 - alpha) / 2.0)) * 100   # 2.5% left in the upper tail
# Clamp to [0, 1] so the reported bounds stay within that range.
lower = max(0.0, np.percentile(stats, lower_pct))
upper = min(1.0, np.percentile(stats, upper_pct))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))