## Import Modules, Packages and Third Party Libraries

In [None]:
import sklearn

# Report which scikit-learn release this notebook is running against.
version_message = 'The scikit-learn version is {}.'.format(sklearn.__version__)
print(version_message)

In [None]:
# Magic statements.
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Import graph libraries.
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, AutoMinorLocator

# Import main modules, packages, and third party libraries.
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
import sklearn

from pprint import pprint

# Import scikit-learn classes: datasets.
from sklearn.datasets import load_iris

# Import scikit-learn classes: preprocessing step utility functions.
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA # Unsupervised Machine Learning tasks: feature reduction, dimensionality reduction
from sklearn.decomposition import KernelPCA
from sklearn.mixture import GaussianMixture # Unsupervised Machine Learning tasks: clustering
from sklearn.manifold import Isomap # Unsupervised Machine Learning tasks: feature reduction, dimensionality reduction

from sklearn.utils import shuffle

# Import scikit-learn classes: models (Estimators).
from sklearn.naive_bayes import GaussianNB           # Non-parametric Generative Model
from sklearn.naive_bayes import MultinomialNB        # Non-parametric Generative Model
from sklearn.linear_model import LinearRegression    # Parametric Linear Discriminative Model
from sklearn.linear_model import LogisticRegression  # Parametric Linear Discriminative Model
from sklearn.linear_model import Ridge, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC                          # Parametric Linear Discriminative "Support Vector Classifier"
from sklearn.tree import DecisionTreeClassifier      # Non-parametric Model
from sklearn.ensemble import BaggingClassifier       # Non-parametric Model (Meta-Estimator, that is, an Ensemble Method)
from sklearn.ensemble import RandomForestClassifier  # Non-parametric Model (Meta-Estimator, that is, an Ensemble Method)

# Import scikit-learn classes: Pipeline utility functions.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Import scikit-learn classes: Hyperparameters Validation utility functions.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve

# Import scikit-learn classes: model's evaluation step utility functions.
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
# from sklearn.metrics import plot_roc_curve

In [None]:
from utils.load_pittburgh_dataset import load_brdiges_dataset

# === UTILS IMPORTS (Done by myself) ==== #
from utils.display_utils import display_heatmap
from utils.display_utils import show_frequency_distribution_predictors
from utils.display_utils import show_categorical_predictor_values
from utils.display_utils import  show_cum_variance_vs_components

from utils.preprocessing_utils import preprocess_categorical_variables
from utils.preprocessing_utils import  preprocessing_data_rescaling

from utils.training_utils import sgd_classifier_grid_search
from utils.training_utils import naive_bayes_classifier_grid_search
from utils.training_utils import svm_linear_classifier_grid_search
from utils.training_utils import decision_tree_classifier_grid_search
from utils.training_utils import random_forest_classifier_grid_search
from utils.training_utils import plot_roc_crossval
from utils.training_utils import fit_by_n_components

from utils.training_utils import kfold_cross_validation
from utils.training_utils import loo_cross_validation
from utils.training_utils import fit
from utils.training_utils import grid_search_estimator

In [None]:
# Notebook-wide configuration.
seed = 42               # seeds the NumPy random generator created below
random_state = 1        # handed to train_test_split in the modelling cells
target_col = 'T-OR-D'   # label column; the alternative is 'CLEAR-G'

# Legacy-style NumPy random generator, seeded for reproducibility.
rng = np.random.RandomState(seed)

## Data Exploration & Investigation Step

In [None]:
# === READ INPUT DATASET ==== #
# Load the bridges data through the project helper imported from utils.load_pittburgh_dataset.
# NOTE(review): later cells call `.corr()`, `.columns` and `.loc` on the result, so it is
# presumably a pandas DataFrame — confirm against the helper.
dataset = load_brdiges_dataset()

#### Statistics & Heatmap

In [None]:
# sns.pairplot(dataset, hue=target_col, size=1.5)

In [None]:
# Column names meant to be skipped by the (currently disabled) frequency-distribution plot below.
columns_2_avoid = ['ERECTED', 'LENGTH', 'LOCATION']
# show_frequency_distribution_predictors(dataset, columns_2_avoid)

In [None]:
# Pairwise correlation between the dataset's columns; only consumed by the disabled heatmap call below.
# NOTE(review): with pandas >= 2.0, `DataFrame.corr()` raises on non-numeric columns unless
# `numeric_only=True` is passed — confirm every column of `dataset` is numeric.
corr_result = dataset.corr()
# display_heatmap(corr_result)

## Preprocessing Step

In [None]:
# Keep a handle on the column index of the loaded dataset.
columns = dataset.columns
# target_col = 'T-OR-D'

# Binary target: rows whose label equals 1 become class 0, every other label becomes class 1.
y = np.array(
    [0 if label == 1 else 1 for label in dataset[target_col].values],
    dtype=int,
)
print(dataset[target_col].value_counts())

# Feature matrix: every column except the target.
is_feature = dataset.columns != target_col
X = dataset.loc[:, is_feature]

## Models

In [None]:
# Hold-out split reused by the cells below. No test_size is passed, so train_test_split's
# default applies; `random_state` is fixed so reruns produce the same partition.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=random_state)

### Gaussian naive Bayes

In [None]:
# Train a Gaussian naive Bayes classifier on the training split (features and labels).
model = GaussianNB()
model.fit(Xtrain, ytrain)

# Predict the held-out split and report the accuracy, raw and as a percentage.
y_model = model.predict(Xtest)
nb_accuracy = accuracy_score(ytest, y_model)
print('Gaussian naive Bayes accuracy score:', nb_accuracy)
print(f"Gaussian naive Bayes accuracy score (percentage): {nb_accuracy*100:.2f}%")

In [None]:
# Confusion matrix of the latest predictions, drawn as an annotated heatmap.
mat = confusion_matrix(ytest, y_model)

heatmap_options = dict(square=True, annot=True, cbar=False)
sns.heatmap(mat, **heatmap_options)
plt.xlabel('predicted value')
plt.ylabel('true value')

In [None]:
# Leave-one-out cross-validation of Gaussian naive Bayes on the training split.
loo = LeaveOneOut()
clf = GaussianNB()

scores = cross_val_score(clf, Xtrain, ytrain, cv=loo)
mean_accuracy, spread = scores.mean(), scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, spread))

In [None]:
clf = GaussianNB()
loo = LeaveOneOut()  # created here but not passed to cross_val_score in this cell

# Compare k-fold cross-validation accuracy for several fold counts.
fold_counts = (3, 4, 5, 10)
for cv in fold_counts:
    scores = cross_val_score(clf, Xtrain, ytrain, cv=cv)
    summary = (cv, scores.mean(), scores.std() * 2)
    print("CV=%d | Accuracy: %0.2f (+/- %0.2f)" % summary)

In [None]:
# Refit Gaussian naive Bayes on the training split and predict the test split.
clf = GaussianNB()
clf.fit(Xtrain, ytrain)
y_model = clf.predict(Xtest)

test_accuracy = accuracy_score(ytest, y_model)
print('Gaussian naive Bayes accuracy score:', test_accuracy)
print(f"Gaussian naive Bayes accuracy score (percentage): {test_accuracy*100:.2f}%")

# Annotated heatmap of the confusion matrix for those predictions.
mat = confusion_matrix(ytest, y_model)
sns.heatmap(mat, square=True, annot=True, cbar=False)
plt.xlabel('predicted value')
plt.ylabel('true value')

In [None]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=random_state)

# Run both cross-validation helpers, each on a freshly created Gaussian naive Bayes estimator.
for validate in (kfold_cross_validation, loo_cross_validation):
    clf = GaussianNB()
    validate(clf, Xtrain, ytrain)

# Final train/test evaluation, again with a new estimator.
clf = GaussianNB()
fit(clf, Xtrain, ytrain, Xtest, ytest)

#### Principal Component Analysis (PCA) for Unsupervised Machine Learning Tasks: Dimensionality reduction

In [None]:
# Project the full feature matrix onto two principal components (no labels are passed).
model = PCA(n_components=2)
X_2D = model.fit(X).transform(X)

In [None]:
# Attach the two principal components to the dataset so seaborn can colour the points by target.
dataset['PCA1'] = X_2D[:, 0]
dataset['PCA2'] = X_2D[:, 1]

# x and y are passed by keyword: positional x/y were deprecated in seaborn 0.11 and
# are rejected by seaborn >= 0.12, where only `data` may be positional.
sns.lmplot(x="PCA1", y="PCA2", hue=target_col, data=dataset, fit_reg=False)

In [None]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=random_state)

# Learn a 2-component projection from the training split only, then apply it to both splits.
pca = PCA(n_components=2).fit(Xtrain)
Xtrain_transformed, Xtest_transformed = pca.transform(Xtrain), pca.transform(Xtest)

clf = GaussianNB()
loo = LeaveOneOut()  # NOTE(review): unused here — scoring below runs with cv=3; confirm which was intended

# 3-fold cross-validation on the projected training data.
scores = cross_val_score(clf, Xtrain_transformed, ytrain, cv=3)
mean_accuracy, spread = scores.mean(), scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, spread))

In [None]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=random_state)

# 2-component PCA fitted on the training split and applied to both splits.
pca = PCA(n_components=2).fit(Xtrain)
Xtrain_transformed = pca.transform(Xtrain)
Xtest_transformed = pca.transform(Xtest)

clf = GaussianNB()

# k-fold accuracy on the projected training data for several fold counts.
fold_counts = (3, 4, 5, 10)
for cv in fold_counts:
    scores = cross_val_score(clf, Xtrain_transformed, ytrain, cv=cv)
    summary = (cv, scores.mean(), scores.std() * 2)
    print("CV=%d | Accuracy: %0.2f (+/- %0.2f)" % summary)

In [None]:
clf = GaussianNB()

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=random_state)

# 2-component PCA fitted on the training split, then applied to both splits.
pca = PCA(n_components=2).fit(Xtrain)
Xtrain_transformed = pca.transform(Xtrain)
Xtest_transformed = pca.transform(Xtest)

# Train on the projected training data and predict the projected test data.
clf.fit(Xtrain_transformed, ytrain)
y_model = clf.predict(Xtest_transformed)

test_accuracy = accuracy_score(ytest, y_model)
print('Gaussian naive Bayes accuracy score:', test_accuracy)
print(f"Gaussian naive Bayes accuracy score (percentage): {test_accuracy*100:.2f}%")

# Annotated heatmap of the confusion matrix for those predictions.
mat = confusion_matrix(ytest, y_model)
sns.heatmap(mat, square=True, annot=True, cbar=False)
plt.xlabel('predicted value')
plt.ylabel('true value')

In [None]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=random_state)

# Fit the 2-component PCA on the training split only, then project both splits with it.
pca = PCA(n_components=2)
pca.fit(Xtrain)

Xtrain_transformed = pca.transform(Xtrain)
Xtest_transformed = pca.transform(Xtest)

# Evaluate on the PCA-projected features (not the raw splits), so that the projection
# fitted above is what the helpers actually score.
clf = GaussianNB()
kfold_cross_validation(clf, Xtrain_transformed, ytrain)

clf = GaussianNB()
loo_cross_validation(clf, Xtrain_transformed, ytrain)

clf = GaussianNB()
fit(clf, Xtrain_transformed, ytrain, Xtest_transformed, ytest)

In [None]:
# Learn the RBF kernel PCA projection from the training rows only; fitting it on the
# full X would let the test rows shape the features they are later scored on.
Xtrain_raw = train_test_split(X, y, random_state=random_state)[0]

model = KernelPCA(n_components=2, kernel='rbf')
model.fit(Xtrain_raw)
X_2D = model.transform(X)

# Same random_state and same number of rows as the split above, so this reproduces
# the partition whose training part the projection was fitted on.
Xtrain, Xtest, ytrain, ytest = train_test_split(X_2D, y, random_state=random_state)

clf = GaussianNB()
kfold_cross_validation(clf, Xtrain, ytrain)

clf = GaussianNB()
loo_cross_validation(clf, Xtrain, ytrain)

clf = GaussianNB()
fit(clf, Xtrain, ytrain, Xtest, ytest)

In [None]:
# Gaussian naive Bayes through the project's fit_by_n_components helper
# (n_components=12, plots enabled).
fit_by_n_components(
    estimator=GaussianNB(),
    X=X,
    y=y,
    n_components=12,
    clf_type='GaussianNB',
    show_plots=True,
)

In [None]:
# Logistic regression through the project's fit_by_n_components helper
# (n_components=12, plots enabled).
fit_by_n_components(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    n_components=12,
    clf_type='LogisticRegression',
    show_plots=True,
)

In [None]:
# k-nearest neighbours through the project's fit_by_n_components helper
# (n_components=12, plots enabled).
fit_by_n_components(
    estimator=KNeighborsClassifier(),
    X=X,
    y=y,
    n_components=12,
    clf_type='KNeighborsClassifier',
    show_plots=True,
)

In [None]:
# Support vector classifier through the project's fit_by_n_components helper
# (n_components=12, plots enabled).
fit_by_n_components(
    estimator=SVC(),
    X=X,
    y=y,
    n_components=12,
    clf_type='SVC',
    show_plots=True,
)

In [None]:
# Decision tree through the project's fit_by_n_components helper
# (n_components=12, plots enabled).
fit_by_n_components(
    estimator=DecisionTreeClassifier(),
    X=X,
    y=y,
    n_components=12,
    clf_type='DecisionTreeClassifier',
    show_plots=True,
)

In [None]:
# Random forest through the project's fit_by_n_components helper
# (n_components=12, plots enabled).
fit_by_n_components(
    estimator=RandomForestClassifier(),
    X=X,
    y=y,
    n_components=12,
    clf_type='RandomForestClassifier',
    show_plots=True,
)

In [None]:
# Single-point grid for the random forest; the trailing comments list the wider
# candidate values that can be swapped in for a broader search.
param_grid = dict(
    n_estimators=[50],      # [50, 100, 200, 300]
    criterion=['gini'],     # ['gini', 'entropy']
    max_depth=[None],       # [None, 2, 5, 10]
    n_jobs=[3],
    max_features=[None],    # [int, float, None, 'sqrt', 'log2']
    bootstrap=[True],       # [True, False]
)
estimator = RandomForestClassifier()

# Hand the estimator and grid to the project's grid-search helper.
grid_search_estimator(
    estimator=estimator,
    param_grid=param_grid,
    X=X,
    y=y,
    n_components=7,
    clf_type='RandomForestClassifier',
    random_state=0,
    show_plots=False,
    show_errors=True,
)

## References

#### Scikit-Learn Examples

- (Feature transformations with ensembles of trees) https://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html#sphx-glr-auto-examples-ensemble-plot-feature-transformation-py
- (Receiver Operating Characteristic (ROC) with cross validation) https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py
- (Model selection with Probabilistic PCA and Factor Analysis (FA)) https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_fa_model_selection.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-fa-model-selection-py
- (ROC Curve with Visualization API) https://scikit-learn.org/stable/auto_examples/plot_roc_curve_visualization_api.html#sphx-glr-auto-examples-plot-roc-curve-visualization-api-py