Import dependencies

In [None]:
import warnings

# For Data science
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pandas.plotting import hist_frame

from sklearn import (
    model_selection,
    svm,
)

from imblearn import (
    over_sampling,
    under_sampling,
    combine,
)

# For modules
from sources import (
    check_is_na,
    get_dataframe_scaled,
    three_sigma_cleared,
    get_kde_comparison,
    get_model_score,
)

warnings.filterwarnings("ignore")

# Getting data, observations
Get dataset

In [None]:
# Column labels applied to the raw file: a row identifier, nine feature
# columns, and the class label
column_names = [
    "id",
    "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe",
    "glass_type",
]

# Read the comma-separated data file, labelling its columns with the list above
data = pd.read_csv("../data/glass.data", sep=",", names=column_names)

Look for missing values

In [None]:
# Check if the raw dataset has missing values.
# check_is_na is a project helper imported from `sources`; what it prints or
# returns is not visible here.
check_is_na(data)

Remove extra feature and show statistics

In [None]:
# Keep every column except the first one ("id", per column_names)
df = data.iloc[:, 1:]

# Show summary statistics for the remaining columns
df.describe()

Show dataset info

In [None]:
# Print the dataset info summary (columns, dtypes, non-null counts)
df.info()

Show size

In [None]:
# Show dataset size as (rows, columns)
df.shape

# Analyse class imbalance
Let's look at the class counts.

In [None]:
# Count the number of rows for each glass_type value
df.groupby("glass_type").size()

In [None]:
# Histogram of the class label in the raw feature frame
hist_frame(df, column="glass_type");

There is an imbalance in the dataset: the number of occurrences differs from class to class.

In [None]:
# Reshape to long format: one row per (glass_type, column) pair, with the
# column name in "Chemical" and its value in "Composition".
# "Si" is dropped before melting so it is not plotted
# (presumably because its scale dominates the other columns — confirm).
melted_df = pd.melt(
    df.drop(columns=["Si"]),
    id_vars=["glass_type"],
    var_name="Chemical",
    value_name="Composition",
)

# Draw on an explicit Axes so the title is attached to this exact figure
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=melted_df,
    x="glass_type",
    y="Composition",
    hue="Chemical",
    log_scale=False,
    ax=ax,
)

# The trailing semicolon suppresses the Text(...) repr in the cell output,
# matching the other plotting cells in this notebook
ax.set_title("Chemical Composition (%) of Different Glass Types");

Let's scale the dataset and remove outliers so that all features share a common order of magnitude.

In [None]:
# Scale the features; the class label is passed as the column to omit
scaled_df = get_dataframe_scaled(
    omit_feature_name="glass_type",
    dataset=df,
)

# Drop rows that deviate by more than 3 sigma in the listed columns.
# NOTE(review): column_names[1:] also contains "glass_type" (the target) —
# confirm that filtering on the label column is intended.
cleared_df = three_sigma_cleared(
    sigmas=3,
    feature_names=column_names[1:],
    dataset=scaled_df,
)

# Summary statistics of the cleaned frame
cleared_df.describe()

In [None]:
# Show class counts after scaling and 3-sigma outlier removal.
# Plot cleared_df (not the raw df) so the figure reflects the cleaned data
# that is actually used for modelling below.
hist_frame(
    data=cleared_df,
    column="glass_type",
);

# Classification of imbalanced dataset

Get feature and target

In [None]:
# Target: the class label column of the cleaned frame
y = cleared_df["glass_type"]
# Features: every remaining column
X = cleared_df.drop(columns="glass_type")

Get dataset split

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

Set classifier

In [None]:
# Create a linear support-vector classifier and train it on the training
# split in one chained call (fit() returns the fitted estimator itself)
classifier = svm.LinearSVC(dual="auto").fit(X_train, y_train)

Get result score

In [None]:
# Cross-validation results for the classifier on the training split
# (get_model_score is a project helper from `sources`)
get_model_score(modeler=classifier, x_data=X_train, y_data=y_train)

Test scores are similar but look poor.

In [None]:
# Density plot for the held-out test split, produced by the project helper
# get_kde_comparison with the fitted classifier
get_kde_comparison(modeler=classifier, x_data=X_test, y_data=y_test)

Minority classes are under-predicted.

# Balance classes

1. Random over-sampling

In [None]:
# Random over-sampler configured with the "not majority" strategy and a fixed seed
random_over_sampler = over_sampling.RandomOverSampler(
    sampling_strategy="not majority",
    random_state=0,
)

# Resample the training split only; the test split stays untouched
X_resampled, y_resampled = random_over_sampler.fit_resample(
    X_train,
    y_train,
)

In [None]:
# Distribution of the original (non-resampled) training target, with a KDE overlay
sns.displot(pd.DataFrame(y_train), kde=True);

In [None]:
# Distribution of the over-sampled training target, with a KDE overlay
sns.displot(pd.DataFrame(y_resampled), kde=True);

The target distribution is now balanced, although class 4 has a zero count. The resampled set has more samples than the original dataset. Let's evaluate the model on it.

In [None]:
# Cross-validation results for the classifier on the over-sampled data
# (get_model_score is a project helper from `sources`)
get_model_score(modeler=classifier, x_data=X_resampled, y_data=y_resampled)

In [None]:
# Density plot for the over-sampled data.
# NOTE(review): this passes the resampled training data rather than the
# held-out test split — confirm that is the intended comparison.
get_kde_comparison(modeler=classifier, x_data=X_resampled, y_data=y_resampled)

Looks better.

2. Under-sampling

In [None]:
# Initialize the cluster-centroids under-sampler with a fixed seed
claster_centroid_sampler = under_sampling.ClusterCentroids(
    sampling_strategy="auto",
    random_state=0,
)

# Resample the training split only
X_resampled, y_resampled = claster_centroid_sampler.fit_resample(
    X_train,
    y_train,
)

# Distribution of the under-sampled target, with a KDE overlay
sns.displot(pd.DataFrame(y_resampled), kde=True);

The target distribution is now balanced, although class 4 has a zero count. The resampled set has fewer samples than the original dataset. Let's evaluate the model on it.

In [None]:
# Cross-validation results for the classifier on the under-sampled data
# (get_model_score is a project helper from `sources`)
get_model_score(modeler=classifier, x_data=X_resampled, y_data=y_resampled)

In [None]:
# Density plot for the under-sampled data.
# NOTE(review): this passes the resampled training data rather than the
# held-out test split — confirm that is the intended comparison.
get_kde_comparison(modeler=classifier, x_data=X_resampled, y_data=y_resampled)

3. Combining: Over-sampling using SMOTE and cleaning using Tomek links.

In [None]:
# Build the two stages of the combined sampler explicitly.
# A user-supplied SMOTE instance is used by SMOTETomek as given, so the seed
# set on SMOTETomek does not reach it; seed SMOTE directly to keep the
# synthetic samples reproducible across runs.
smote_over_sampler = over_sampling.SMOTE(k_neighbors=4, random_state=0)
tomek_under_sampler = under_sampling.TomekLinks()

# Combine SMOTE over-sampling with Tomek-links cleaning
smote_tomek_sampler = combine.SMOTETomek(
    random_state=0, smote=smote_over_sampler, tomek=tomek_under_sampler
)

# Resample the training split only
X_resampled, y_resampled = smote_tomek_sampler.fit_resample(X_train, y_train)

# Get distribution plot for resampled target
sns.displot(data=pd.DataFrame(y_resampled), kde=True);

In [None]:
# Cross-validation results for the classifier on the SMOTE + Tomek resampled data
# (get_model_score is a project helper from `sources`)
get_model_score(modeler=classifier, x_data=X_resampled, y_data=y_resampled)

In [None]:
# Density plot for the SMOTE + Tomek resampled data.
# NOTE(review): this passes the resampled training data rather than the
# held-out test split — confirm that is the intended comparison.
get_kde_comparison(modeler=classifier, x_data=X_resampled, y_data=y_resampled)

# Summary

1. Target category in given dataset has imbalanced classes.
2. Class imbalance leads to poor model performance.
3. Balancing helps to improve performance.
4. The under-sampler gave the most improvement.