Import dependencies.

In [None]:
# Standard library: warning control (used below to silence notebook warnings)
import warnings

# Standard library: wall-clock timing of model fitting
import time

# Data handling and machine-learning models
import pandas as pd
from sklearn import (
    model_selection,
    ensemble,
    tree,
    linear_model,
    svm,
)

# Visualization
# Statistical plots
import seaborn as sns

# Figure-level helpers (titles)
import matplotlib.pyplot as plt

# Project-local helper functions
from sources import (
    check_is_na,
    get_dataframe_scaled,
)

# Resampling tools for classification with imbalanced classes
from imblearn import (
    over_sampling,
)

# Ignore every warning for the rest of the session
warnings.filterwarnings("ignore")

# Getting data, observations
Get dataset

In [None]:
# Load the observations from the comma-separated data file
data = pd.read_csv("../data/data.csv", sep=",")

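The dataset contains a second target — the time of verification. I don't see any sense in using the time as a separate target for classification. Note that the column is not actually dropped here: it is kept and used later to build the categorical `duration` feature.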

In [None]:
# The verification-time column is kept on purpose: it is used further down
# to build the categorical 'duration' feature.

# Replace the original column names with short labels (positional, so order matters)
data.columns = [
    "b1",
    "b2",
    "b3",
    "b4",
    "price",
    "product",
    "winner",
    "result",
    "time",
]

In [None]:
# Print a concise summary of the frame: columns, dtypes and non-null counts
data.info()

In [None]:
# Show descriptive statistics of the dataset columns
data.describe()

In [None]:
# Check for missing values with the project helper imported from `sources`
check_is_na(data)

Visualize correlation of data features.

In [None]:
# Pair plot of the two targets, 'result' and 'time' ('price' is a feature,
# not a target, so it does not belong in this comparison)
sns.pairplot(data=data[["result", "time"]]);

Yes, a pair plot is not a good way to assess correlation between categorical features, but for time and result it partially shows that there is some relation. Let's explore it with a more appropriate tool.

Make categorical feature from continuous feature 'time'

In [None]:
# Threshold: the average verification time over the whole dataset
mean_time = data["time"].mean()

# Label each observation "big" when strictly above the mean, "small" otherwise
data["duration"] = data["time"].gt(mean_time).map({True: "big", False: "small"})

Count how many occurrences there are of each combination

In [None]:
# Cross-tabulate verification result (rows) against duration class (columns)
contingency_table = pd.crosstab(index=data["result"], columns=data["duration"])

# Display the resulting table
contingency_table

In [None]:
# Render the contingency table as a heat map
sns.heatmap(data=contingency_table);

It looks like there is a correlation. It is worth taking it into account before classification.

# Feature engineering
Make a new feature by combining correlated targets

In [None]:
# Join the duration class and the result into one label of the form "<duration>_<result>"
data["combined_target"] = data["duration"].str.cat(data["result"].astype(str), sep="_")

# Summary statistics of the new target
data["combined_target"].describe()

In [None]:
# Count plot of the combined target classes
sns.catplot(x="combined_target", kind="count", data=data);

Dataset looks imbalanced by target. Let's balance it.

In [None]:
# Remove the columns that are already encoded in 'combined_target'
data = data.drop(columns=["result", "time", "duration"])

In [None]:
# Scale the dataset with the project helper, leaving the target column out of the scaling
scaled_df = get_dataframe_scaled(
    dataset=data,
    omit_feature_names=["combined_target"],
)

In [None]:
# Feature subset: built from the scaled frame so that the scaling step is
# actually used (scale-sensitive estimators such as LogisticRegression and SVC
# are fitted later in the notebook).
# NOTE(review): assumes get_dataframe_scaled returns a DataFrame that keeps the
# row order of `data` - confirm in `sources`. errors="ignore" covers both the
# case where the omitted target column is kept in the result and where it is not.
X = scaled_df.drop(columns=["combined_target"], errors="ignore")

# Target subset
y = data.combined_target

In [None]:
# Split BEFORE oversampling: if the data is oversampled first, copies of the
# same minority observation end up in both the training and the test subset,
# which leaks information and inflates the test scores.
X_train_raw, X_test, y_train_raw, y_test = model_selection.train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Initialize random over sampler
random_over_sampler = over_sampling.RandomOverSampler(
    random_state=0,
    sampling_strategy="not majority",
)

# Resample the training subset only; the test subset keeps its real class distribution
X_resampled, y_resampled = random_over_sampler.fit_resample(X_train_raw, y_train_raw)

# Balanced training data used by the classifiers
X_train, y_train = X_resampled, y_resampled

In [None]:
# Count plot of the target classes after resampling
sns.catplot(x="combined_target", kind="count", data=y_resampled.to_frame());

# Classification with default hyperparameters

## BaggingClassifier:

Fits base classifiers, each on a random subset of the original dataset, and then aggregates their individual predictions (either by voting or by averaging) to form a final prediction.

In [None]:
# Start the stopwatch; only the fit is timed
start_time = time.time()

# Fit a bagging classifier with default hyperparameters
# (decision-tree base estimator, 10 estimators)
bagging_classifier = ensemble.BaggingClassifier(
    random_state=0,
).fit(X_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

# Mean accuracy on the test subset
bagging_score = bagging_classifier.score(X_test, y_test)

# Message wording aligned with the boosting and stacking cells
print(f"Bagging score is {bagging_score:.2f}")

In [None]:
# Predict class labels for the test subset with the fitted bagging classifier
y_predicted_bagging = bagging_classifier.predict(X_test)

In [None]:
# Put the true and the predicted labels side by side
data_to_compare = pd.DataFrame(
    {"values from test": y_test, "predicted values": y_predicted_bagging}
)

# Count observations per class for both sources. The frame is melted to long
# format (label in "value", source in "variable"). A bar plot of "index" against
# "value" would show the mean row index per class, not the number of observations.
sns.countplot(
    x="value",
    hue="variable",
    data=pd.melt(data_to_compare.reset_index(), id_vars="index"),
)

plt.title("Distribution of observations in test dataset and predicted dataset");

## Histogram-Based Gradient Boosting:

builds an additive model in a forward stage-wise fashion

In [None]:
# Start the stopwatch; only the fit is timed
start_time = time.time()

# Build the histogram-based gradient boosting model, then fit it on the training subset
boosting_classifier = ensemble.HistGradientBoostingClassifier(random_state=0)
boosting_classifier.fit(X_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

# Mean accuracy on the test subset
boosting_score = boosting_classifier.score(X_test, y_test)

print(f"Boosting score is {boosting_score:.2f}")

In [None]:
# Predict class labels for the test subset with the fitted boosting classifier
y_predicted_boosting = boosting_classifier.predict(X_test)

In [None]:
# Put the true and the predicted labels side by side
data_to_compare = pd.DataFrame(
    {"values from test": y_test, "predicted values": y_predicted_boosting}
)

# Count observations per class for both sources. The frame is melted to long
# format (label in "value", source in "variable"). A bar plot of "index" against
# "value" would show the mean row index per class, not the number of observations.
sns.countplot(
    x="value",
    hue="variable",
    data=pd.melt(data_to_compare.reset_index(), id_vars="index"),
)

plt.title("Distribution of observations in test dataset and predicted dataset");

## Stacked generalization:

The predictions of each individual estimator are stacked together and used as input to a final estimator to compute the prediction. This final estimator is trained through cross-validation.

In [None]:
# Start the stopwatch; only the fit is timed
start_time = time.time()

# Single base learner: a small random forest
random_forest = ensemble.RandomForestClassifier(n_estimators=10, random_state=0)
estimators = [("rf", random_forest)]

# Build the stacking model on top of the base learners, then fit it
stacking_classifier = ensemble.StackingClassifier(estimators=estimators)
stacking_classifier.fit(X_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

# Mean accuracy on the test subset
stacking_score = stacking_classifier.score(X_test, y_test)

print(f"Stacking score is {stacking_score:.2f}")

In [None]:
# Predict class labels for the test subset with the fitted stacking classifier
y_predicted_stacking = stacking_classifier.predict(X_test)

In [None]:
# Put the true and the predicted labels side by side
data_to_compare = pd.DataFrame(
    {"values from test": y_test, "predicted values": y_predicted_stacking}
)

# Count observations per class for both sources. The frame is melted to long
# format (label in "value", source in "variable"). A bar plot of "index" against
# "value" would show the mean row index per class, not the number of observations.
sns.countplot(
    x="value",
    hue="variable",
    data=pd.melt(data_to_compare.reset_index(), id_vars="index"),
)

plt.title("Distribution of observations in test dataset and predicted dataset");

# Classification with selection of hyperparameters

## BaggingClassifier:

In [None]:
# The base-estimator parameter of BaggingClassifier is named "estimator" in
# recent scikit-learn releases and "base_estimator" in older ones. Pick the
# name this installation supports so the grid is valid either way.
base_estimator_key = (
    "estimator"
    if "estimator" in bagging_classifier.get_params(deep=False)
    else "base_estimator"
)

# Get parameter grid
parameter_grid = {
    base_estimator_key: [
        tree.DecisionTreeClassifier(),
        linear_model.LogisticRegression(),
    ],
    "n_estimators": [5, 10, 20],
    "max_samples": [0.5, 0.7, 0.9],
    "max_features": [0.5, 0.7, 0.9],
}

# Get grid search (5-fold cross-validation over every grid combination)
greed_search = model_selection.GridSearchCV(
    estimator=bagging_classifier,
    param_grid=parameter_grid,
    cv=5,
)

start_time = time.time()

# Fit grid search
greed_search.fit(X_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

# Get grid search metrics: best_score is the mean cross-validated score of the
# best combination, accuracy is that estimator's score on the test subset
best_parameters = greed_search.best_params_
best_bagging_estimator = greed_search.best_estimator_
accuracy = best_bagging_estimator.score(X_test, y_test)
best_score = greed_search.best_score_

print(f"Best parameters: {best_parameters}")
print(f"Best bagging estimator: {best_bagging_estimator}")
print(f"Best score: {best_score:.2f}")
print(f"Accuracy: {accuracy:.2f}")

Using SVC returns a warning! Something is wrong with bagging-SVC!

## Boosting:

In [None]:
# Get parameter grid
parameter_grid = {
    "early_stopping": [True, False],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_iter": [100, 200, 300],
    "max_depth": [3, 4, 5],
}

# Get grid search (5-fold cross-validation over every grid combination)
greed_search = model_selection.GridSearchCV(
    estimator=boosting_classifier,
    param_grid=parameter_grid,
    cv=5,
)

start_time = time.time()

# Fit grid search
greed_search.fit(X_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

# Get grid search metrics. The best estimator is stored under a boosting-specific
# name so it does not overwrite the best bagging estimator.
best_parameters = greed_search.best_params_
best_boosting_estimator = greed_search.best_estimator_
accuracy = best_boosting_estimator.score(X_test, y_test)
best_score = greed_search.best_score_

print(f"Best parameters: {best_parameters}")
print(f"Best boosting estimator: {best_boosting_estimator}")
print(f"Best score: {best_score:.2f}")
print(f"Accuracy: {accuracy:.2f}")

## Stacked generalization

In [None]:
# Get parameter grid: candidate base-learner combinations, final estimators,
# internal cross-validation folds and stacking methods
parameter_grid = {
    "estimators": [
        [
            ("lr", linear_model.LogisticRegression()),
            ("rf", ensemble.RandomForestClassifier()),
        ],
        [("lr", linear_model.LogisticRegression()), ("svm", svm.SVC(probability=True))],
        [("rf", ensemble.RandomForestClassifier()), ("svm", svm.SVC(probability=True))],
    ],
    "final_estimator": [
        linear_model.LogisticRegression(),
        ensemble.RandomForestClassifier(),
    ],
    "cv": [3, 5],
    "stack_method": ["auto", "predict_proba"],
}

# Get grid search (5-fold cross-validation over every grid combination)
greed_search = model_selection.GridSearchCV(
    estimator=stacking_classifier,
    param_grid=parameter_grid,
    cv=5,
)

start_time = time.time()

# Fit grid search
greed_search.fit(X_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

# Get grid search metrics. The best estimator is stored under a stacking-specific
# name so it does not overwrite the best bagging estimator.
best_parameters = greed_search.best_params_
best_stacking_estimator = greed_search.best_estimator_
accuracy = best_stacking_estimator.score(X_test, y_test)
best_score = greed_search.best_score_

print(f"Best parameters: {best_parameters}")
print(f"Best stacking estimator: {best_stacking_estimator}")
print(f"Best score: {best_score:.2f}")
print(f"Accuracy: {accuracy:.2f}")

## Summary
1. Bagging, boosting and stacking methods were used for classification.
2. The quality of classification is good enough for all methods with default hyperparameters.
3. The bagging method is faster and the boosting method is slower with default hyperparameters.
4. The best combination of the chosen hyperparameters was found for each method.