# Part A > Time Series Regression

In [None]:
# Data Manipulation Dependencies
import numpy as np
import pandas as pd

# Graphing Dependencies
import matplotlib.pyplot as plt
import seaborn as sns

# Clustering Dependencies
from sklearn.cluster import KMeans

# Miscellaneous Dependencies
from typing import Union, List, Tuple, Dict, Callable
import warnings

In [None]:
# Hide all warnings
# NOTE: no category is given, so every warning category is silenced from this
# point on (deprecation and convergence warnings included).
warnings.filterwarnings(action='ignore')

In [None]:
# Time Series Dependencies
import statsmodels
import statsmodels.tsa as time_series_analysis
from pmdarima.arima import arima
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose

## Import Data

In [None]:
# Load the training data, parse its 'Date' column and use it as a
# chronologically sorted index.
df = (
    pd.read_csv('./data/train.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index(keys='Date')
    .sort_index()
)
df

In [None]:
# Keep only the rows whose 'Value' is strictly positive.
df = df[df['Value'] > 0]
df

In [None]:
# Count the non-null entries of every column per (year, quarter) of the index.
df.groupby(by=[df.index.year, df.index.quarter]).count()

In [None]:
def groupby_date(df: pd.DataFrame, freq: str = 'D'):
    """Resample a datetime-indexed frame to ``freq`` and average each bin.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose index supports ``resample`` (e.g. a ``DatetimeIndex``).
    freq : str
        Pandas offset alias used as the resampling rule (default: daily).

    Returns
    -------
    pd.DataFrame
        A new frame with one row per ``freq`` bin holding the mean of every
        numeric column. Non-numeric columns (such as a ``'Gas'`` label) are
        left out; bins without observations hold ``NaN``.
    """
    # ``resample`` does not mutate its input, so no defensive copy is needed.
    # ``numeric_only=True`` makes the exclusion of string columns explicit;
    # without it pandas 2.x raises a TypeError when asked to average them.
    return df.resample(rule=freq).mean(numeric_only=True)

In [None]:
# def drop_unreasonable_values(df: pd.DataFrame, mask: np.ndarray):
#     print(df[mask].shape, df[~mask].shape)
#     return df[mask]

In [None]:
def drop_sparse_data(df, threshold: int = 20, freq: str = 'days'):
    """Drop rows that are not followed by ``threshold`` consecutive periods.

    A row survives only when each of the next ``threshold`` timestamps (its
    own index value plus 1..``threshold`` units of ``freq``) is also present
    in the index of ``df``.

    Parameters
    ----------
    df : frame whose index values can be added to a ``datetime.timedelta``.
    threshold : number of following periods that must all be present.
    freq : ``datetime.timedelta`` keyword naming the period unit.

    Returns
    -------
    A copy of ``df`` without the rows that fail the check.
    """
    from datetime import timedelta

    known_stamps = df.index
    look_ahead = [timedelta(**{freq: step}) for step in range(1, threshold + 1)]
    # ``any`` stops at the first missing follower, like an early ``break``.
    isolated = [
        stamp
        for stamp in known_stamps
        if any((stamp + offset) not in known_stamps for offset in look_ahead)
    ]
    return df.drop(isolated, axis=0)

# Report how many CO rows the 14-day sparsity filter removes and show the
# tail of what is left.
CO = df[df['Gas'] == 'CO']
# The filter walks the index row by row, so run it once and reuse the result
# for both the dropped-row count and the preview.
CO_dense = drop_sparse_data(CO, threshold=14)
print(f'''Dropped {CO.shape[0] - CO_dense.shape[0]} rows
Last 5 Rows of CO:
{CO_dense.iloc[-5:]}''')

In [None]:
def impute_missing_values(df: pd.DataFrame, method: Union[str, int] = 'time'):
    """Smooth or interpolate ``df`` depending on the type of ``method``.

    Parameters
    ----------
    df : frame to process.
    method : an ``int`` is used as the window of a rolling mean; any other
        value is forwarded to ``DataFrame.interpolate`` as its ``method``.

    Returns
    -------
    The rolling-mean frame (``int`` method) or the interpolated frame.
    """
    # Exact type check (not isinstance): only a plain ``int`` selects the
    # rolling-mean branch.
    if type(method) is not int:
        return df.interpolate(method=method)
    windowed = df.rolling(window=method)
    return windowed.mean()

In [None]:
def get_df_partitioned_by_gas(df: pd.DataFrame, resample_freq: str = 'D', sparse_threshold: int = 3):
    """Split ``df`` by gas into regularly spaced, gap-filled frames.

    For every distinct value of ``df['Gas']`` the matching rows are passed
    through ``drop_sparse_data``, averaged per ``resample_freq`` bin with
    ``groupby_date``, aligned to a gap-free date range and finally filled with
    ``impute_missing_values``.

    Parameters
    ----------
    df : datetime-indexed frame with a ``'Gas'`` column.
    resample_freq : pandas offset alias used for resampling and for the
        gap-free date range.
    sparse_threshold : ``threshold`` handed to ``drop_sparse_data``.

    Returns
    -------
    dict mapping each gas to its processed frame; every frame carries a
    constant ``'Gas'`` column holding that gas.
    """
    gas_dict = {}
    for gas in df['Gas'].unique():  # type: ignore
        # The constant label cannot be averaged, so it is removed before
        # resampling and re-attached at the end.
        readings = df[df['Gas'] == gas].drop(columns='Gas')
        readings = drop_sparse_data(readings, threshold=sparse_threshold)
        gbo = groupby_date(df=readings, freq=resample_freq)
        # print(gbo['Value'].mean(), gbo['Value'].std())
        full_range = pd.date_range(start=gbo.index[0], end=gbo.index[-1], freq=resample_freq)
        # Reindexing guarantees one row per period (missing periods become
        # NaN) without building and merging a throw-away placeholder frame.
        gbo = impute_missing_values(gbo.reindex(full_range))
        gbo['Gas'] = gas
        gas_dict[gas] = gbo
    return gas_dict

# Build the per-gas frames used by the rest of Part A and preview the CO one.
df_partitioned = get_df_partitioned_by_gas(df)
df_partitioned['CO']

In [None]:
def get_general_trends(df_dict: Dict[str, pd.DataFrame], ma_windows: Union[List[int], None] = None):
    """Plot moving averages of ``'Value'`` for every gas on a two-column grid.

    Parameters
    ----------
    df_dict : mapping of gas name to a frame with a ``'Value'`` column.
    ma_windows : rolling-window sizes to draw; ``None`` means ``[1, 7, 30]``.

    Returns
    -------
    None. The figure is drawn through matplotlib.
    """
    # A list default would be shared between calls, hence the None sentinel.
    if ma_windows is None:
        ma_windows = [1, 7, 30]
    n_cols = 2
    # Enough rows for any number of gases, not only for exactly four.
    n_rows = max(1, (len(df_dict) + n_cols - 1) // n_cols)
    # squeeze=False keeps ``ax`` two-dimensional even for a single row.
    fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 8), squeeze=False)
    for i, (gas, frame) in enumerate(df_dict.items()):
        panel = ax[i // n_cols, i % n_cols]
        for w in ma_windows:
            # Only 'Value' is smoothed; a rolling mean over the whole frame
            # would also be applied to non-numeric columns.
            smoothed = frame['Value'].rolling(window=w).mean()
            panel.plot(smoothed.index, smoothed, label=f'{w}-day ma')
        panel.set_title(gas)
    # Hide grid cells that have no gas assigned to them.
    for spare in ax.flat[len(df_dict):]:
        spare.set_visible(False)
    ax[0, 0].legend()

# Plot only the 30-period moving average of each gas.
get_general_trends(df_dict=df_partitioned, ma_windows=[30])

In [None]:
def get_plots_by_gas(df_dict: Dict[str, pd.DataFrame]):
    """Plot ``T``, ``RH`` and ``Value`` over time, one panel per gas.

    ``T`` and ``RH`` share the left axis; ``Value`` is drawn in green on a
    twin right axis of the same panel.

    Parameters
    ----------
    df_dict : mapping of gas name to a frame with ``'T'``, ``'RH'`` and
        ``'Value'`` columns.

    Returns
    -------
    None. The figure is drawn through matplotlib.
    """
    def get_plot(df: pd.DataFrame, ax):
        ax.plot(df['T'], label='T')
        ax.plot(df['RH'], label='RH')
        # Empty proxy line so the left-axis legend also lists 'Value', which
        # is actually drawn on the twin axis below.
        ax.plot([], [], color='g', label='Value')

        ax2 = ax.twinx()
        ax2.plot(df['Value'], color='green')

    n_cols = 2
    # At least the 2x2 grid, plus extra rows when there are more than 4 gases.
    n_rows = max(2, (len(df_dict) + n_cols - 1) // n_cols)
    fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 8))

    for i, (gas, df_tmp) in enumerate(df_dict.items()):
        current_ax = ax[i // n_cols, i % n_cols]
        get_plot(df_tmp, ax=current_ax)
        current_ax.set_title(gas)

    ax[0, 0].legend()
    fig.tight_layout()

# Draw the T / RH / Value panels for every gas.
get_plots_by_gas(df_dict=df_partitioned)

In [None]:
def get_gas_distributions(df: pd.DataFrame):
    """Draw a bar chart with one observation count per gas.

    The count of a gas is the median, across columns, of that gas's non-null
    counts, truncated to ``int``.
    """
    per_column_counts = df.groupby(by='Gas').count()
    # One number per gas: the median of its per-column counts.
    typical_count = per_column_counts.median(axis=1)
    counts = typical_count.astype(int)
    sns.barplot(x=counts.index, y=counts, palette='deep')

# Show how many observations each gas has.
get_gas_distributions(df)

In [None]:
def get_gas_means_and_medians(df):
    """Draw grouped bars comparing the mean and median ``'Value'`` per gas.

    Parameters
    ----------
    df : frame with a ``'Gas'`` column and a numeric ``'Value'`` column.

    Returns
    -------
    None. The bar chart is drawn through seaborn.
    """
    gb = df.groupby(by='Gas', as_index=False)
    # One summary frame per statistic, tagged so seaborn can use it as hue.
    # numeric_only=True keeps any non-numeric column out of the aggregation.
    summaries = [
        gb.mean(numeric_only=True).assign(Type='Mean'),
        gb.median(numeric_only=True).assign(Type='Median'),
    ]
    comb_df = pd.concat(objs=summaries, axis=0)
    sns.barplot(data=comb_df, x='Gas', y='Value', hue='Type', palette='rainbow')
# Compare mean and median 'Value' per gas.
get_gas_means_and_medians(df)

In [None]:
from statsmodels.tsa.stattools import pacf, acf
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf

def cf_summary(df_dict: Dict[str, pd.DataFrame], kind: str, resample: str = 'M', threshold: float = 0.01, plot: bool = False):
    """Find the significant (partial) autocorrelation lags of every gas.

    Parameters
    ----------
    df_dict : mapping of gas name to a frame with a ``'Value'`` column.
    kind : ``'pacf'`` selects the partial autocorrelation; any other value
        selects the plain autocorrelation.
    resample : unused; kept so existing calls keep working.
    threshold : ``alpha`` handed to the correlation and plotting functions.
    plot : when true, also draw the correlation plot of every gas.

    Returns
    -------
    tuple ``(t_dict, summary)`` where ``t_dict`` maps each gas to the array
    of its significant lags and ``summary`` is a frame indexed by ``'Gas'``
    whose ``'t'`` column holds the significant lag with the largest absolute
    correlation (0 when no lag is significant).
    """
    use_pacf = kind == 'pacf'
    corr_fn = pacf if use_pacf else acf
    plot_fn = plot_pacf if use_pacf else plot_acf
    t_dict = {}
    best_ts = []
    for gas, frame in df_dict.items():
        series = frame['Value']
        # Count the analysed column by label; positional ``count()[0]`` on a
        # label-indexed Series is deprecated in pandas.
        lag_space = int(min(series.count() // 2 - 1, 50))
        corr_scores, conf_intvs = corr_fn(series, nlags=lag_space, alpha=threshold)
        # Re-centre the confidence interval around zero so a score outside it
        # counts as significantly different from zero.
        lower_conf_bound = conf_intvs[:, 0] - corr_scores
        upper_conf_bound = conf_intvs[:, 1] - corr_scores
        flagged = np.where((corr_scores < lower_conf_bound) | (corr_scores > upper_conf_bound))[0]
        # Lag 0 is excluded: it cannot serve as a model order.
        t_values = flagged[flagged > 0]
        best_t = t_values[np.argmax(np.abs(corr_scores[t_values]))] if len(t_values) > 0 else 0
        t_dict[gas] = t_values
        best_ts.append(best_t)
        if plot:
            plot_fn(series, lags=lag_space, alpha=threshold).get_axes()[0].set_title(f'{gas}: possible t-values: {tuple(t_values)} Best t: {best_t}')
    return t_dict, pd.DataFrame(data={
        'Gas': list(t_dict),
        't': best_ts
    }).set_index(keys='Gas')

In [None]:
def cf_summary_ind(df: pd.Series, kind: str, resample: str = 'M', threshold: float = 0.01, plot: bool = False):
    """Find the significant (partial) autocorrelation lags of one series.

    Parameters
    ----------
    df : the series to analyse (the callers in this notebook pass a
        one-dimensional series, not a frame).
    kind : ``'pacf'`` selects the partial autocorrelation; any other value
        selects the plain autocorrelation.
    resample : unused; kept so existing calls keep working.
    threshold : ``alpha`` handed to the correlation and plotting functions.
    plot : when true, also draw the correlation plot.

    Returns
    -------
    tuple ``(t_values, best_t)``: the array of significant lags and the one
    with the largest absolute correlation (0 when no lag is significant).
    """
    use_pacf = kind == 'pacf'
    corr_fn = pacf if use_pacf else acf
    plot_fn = plot_pacf if use_pacf else plot_acf
    # At most 12 lags, fewer for short series.
    lag_space = min(12, df.count() // 2 - 1)
    corr_scores, conf_intvs = corr_fn(df, nlags=lag_space, alpha=threshold)
    # Re-centre the confidence interval around zero so a score outside it
    # counts as significantly different from zero.
    lower_conf_bound = conf_intvs[:, 0] - corr_scores
    upper_conf_bound = conf_intvs[:, 1] - corr_scores
    flagged = np.where((corr_scores < lower_conf_bound) | (corr_scores > upper_conf_bound))[0]
    # Lag 0 is excluded: it cannot serve as a model order.
    t_values = flagged[flagged > 0]
    best_t = t_values[np.argmax(np.abs(corr_scores[t_values]))] if len(t_values) > 0 else 0
    if plot:
        plot_fn(df, lags=lag_space, alpha=threshold).get_axes()[0].set_title(f'possible t-values: {tuple(t_values)} Best t: {best_t}')
    return t_values, best_t

In [None]:
def get_stationarity_summary(df_dict: Dict[str, pd.DataFrame], plot: bool = False):
    """Summarise stationarity and candidate lags of every gas.

    For each gas the ``'Value'`` column is tested with ``adfuller`` and
    decomposed with ``seasonal_decompose``; the seasonal component is then
    analysed with ``cf_summary_ind`` for both ``'pacf'`` and ``'acf'``.

    Parameters
    ----------
    df_dict : mapping of gas name to a frame with a ``'Value'`` column.
    plot : when true, draw each decomposition and title it with the gas.

    Returns
    -------
    dict mapping each gas to a dict with the keys ``'stationary'``,
    ``'stationarity p'``, ``'p_values'``, ``'best_p'``, ``'q_values'`` and
    ``'best_q'``.
    """
    significance_level = 0.05
    summary = {}
    for gas, frame in df_dict.items():
        values = frame['Value']
        # Second element of the adfuller result, rounded to five decimals
        # before it is compared with the significance level.
        adf_p = round(adfuller(values)[1], 5)
        decomposition = seasonal_decompose(values)
        seasonal_part = decomposition.seasonal
        ar_lags, best_ar = cf_summary_ind(seasonal_part, kind='pacf')
        ma_lags, best_ma = cf_summary_ind(seasonal_part, kind='acf')
        if plot:
            first_axis = decomposition.plot().get_axes()[0]
            first_axis.set_title(gas)
        summary[gas] = {
            'stationary': adf_p < significance_level,
            'stationarity p': adf_p,
            'p_values': ar_lags,
            'best_p': best_ar,
            'q_values': ma_lags,
            'best_q': best_ma,
        }
    return summary

# Compute and display the per-gas stationarity summary.
stats = get_stationarity_summary(df_partitioned)
stats

In [None]:
# Baseline prediction: every test row receives the overall training mean of
# its gas (the author's rationale: the series are all stationary).
test_set = pd.read_csv(filepath_or_buffer='./data/test.csv', sep=',')
gas_means = df.groupby(by='Gas').mean()
baseline = pd.merge(left=test_set, right=gas_means, left_on='Gas', right_index=True, how='inner')
baseline[['id', 'Value']].set_index(keys='id').to_csv('./out/using_mean.csv', sep=',')

In [None]:
def get_long_term_trend(df_dict: Dict[str, pd.DataFrame]):
    """Derive lag candidates from the 30-period moving average of every gas.

    Parameters
    ----------
    df_dict : mapping of gas name to a frame with a ``'Value'`` column.

    Returns
    -------
    dict mapping each gas to ``{'p': ..., 'q': ..., 'stationary': ...}``,
    where ``p`` / ``q`` are the best ``'pacf'`` / ``'acf'`` lags reported by
    ``cf_summary_ind`` (``threshold=0.05``) and ``stationary`` tells whether
    the ``adfuller`` p-value of the smoothed series is below 0.05.
    """
    value_dict = {}
    for gas, frame in df_dict.items():
        # Smooth only 'Value': a rolling mean over the whole frame would also
        # be applied to the non-numeric 'Gas' column.
        smoothed = frame['Value'].rolling(window=30).mean().dropna()
        stationary = adfuller(smoothed)[1] < 0.05
        _, best_p = cf_summary_ind(smoothed, kind='pacf', threshold=0.05)
        _, best_q = cf_summary_ind(smoothed, kind='acf', threshold=0.05)
        value_dict[gas] = {
            'p': best_p,
            'q': best_q,
            'stationary': stationary
        }
    return value_dict

# Display the lag candidates derived from the 30-period moving averages.
get_long_term_trend(df_partitioned)

In [None]:
def combine(df_dict: Dict[str, pd.DataFrame]):
    """Assemble the order tuples later passed to ``SARIMAX`` for every gas.

    The non-seasonal ``(p, d, q)`` comes from ``get_stationarity_summary``
    and the seasonal ``(P, D, Q, m)`` from ``get_long_term_trend``; ``d`` and
    ``D`` are 0 when the matching summary reports stationarity and 1
    otherwise. ``m`` is fixed at 12.

    Returns
    -------
    dict mapping each gas to ``((p, d, q), (P, D, Q, m))``.
    """
    short_term = get_stationarity_summary(df_dict)
    long_term = get_long_term_trend(df_dict)
    seasonal_period = 12
    orders = {}
    for gas in df_dict.keys():
        st, lt = short_term[gas], long_term[gas]
        order = (st['best_p'], 0 if st['stationary'] else 1, st['best_q'])
        seasonal_order = (lt['p'], 0 if lt['stationary'] else 1, lt['q'], seasonal_period)
        orders[gas] = (order, seasonal_order)
    return orders

# Display the (order, seasonal_order) pair chosen for every gas.
combine(df_dict=df_partitioned)

In [None]:
from sklearn.metrics import mean_squared_error

def finale(df_dict: Dict[str, pd.DataFrame], test_size: float = 0.75):
    """Fit one ``SARIMAX`` model per gas and score it on a chronological split.

    Parameters
    ----------
    df_dict : mapping of gas name to a frame with a ``'Value'`` column.
    test_size : despite its name, the fraction of each series (taken from the
        start) used for TRAINING; the remainder is the hold-out set.

    Returns
    -------
    dict mapping each gas to a dict with the keys ``'train_true'``,
    ``'train_pred'``, ``'train_rmse'``, ``'test_true'``, ``'test_pred'`` and
    ``'test_rmse'``.
    """
    orders = combine(df_dict=df_dict)
    models = {}
    for gas, frame in df_dict.items():
        data = frame['Value']
        partition_index = int(data.shape[0] * test_size)

        train = data[:partition_index]
        test = data[partition_index:]

        order, seasonal_order = orders[gas]
        model = SARIMAX(train, order=order, seasonal_order=seasonal_order).fit()

        train_pred = model.predict(start=0, end=partition_index - 1)
        test_pred = model.forecast(steps=data.shape[0] - partition_index)

        # Take the square root explicitly: the ``squared=False`` shortcut of
        # ``mean_squared_error`` is not available in every scikit-learn
        # release.
        train_err = np.sqrt(mean_squared_error(train, train_pred))
        test_err = np.sqrt(mean_squared_error(test, test_pred))

        models[gas] = {
            'train_true': train,
            'train_pred': train_pred,
            'train_rmse': train_err,
            'test_true': test,
            'test_pred': test_pred,
            'test_rmse': test_err
        }
    return models

# Fit and score one model per gas; the per-gas results are kept in `gg`.
gg = finale(df_partitioned)

In [None]:
def get_test_df(file_path: str = './data/test.csv'):
    """Read the test CSV and return it indexed by date in ascending order.

    Parameters
    ----------
    file_path : path of a comma-separated file with a header row and a
        ``'Date'`` column formatted as ``day/month/year``.

    Returns
    -------
    The file's rows with ``'Date'`` parsed and moved into the index, sorted
    ascending by that index.
    """
    raw = pd.read_csv(filepath_or_buffer=file_path, sep=',', header=0)
    parsed_dates = pd.to_datetime(raw['Date'], format='%d/%m/%Y')
    indexed = raw.assign(Date=parsed_dates).set_index(keys='Date')
    return indexed.sort_index(ascending=True)

In [None]:
def kaggle_submission(df_dict: Dict[str, pd.DataFrame], test_file: str, submission_file: str):
    """Forecast every gas with ``SARIMAX`` and collect the test-set predictions.

    Parameters
    ----------
    df_dict : mapping of gas name to a frame with a ``'Value'`` column.
    test_file : path of the test CSV, read through ``get_test_df``.
    submission_file : where to write the predictions as CSV; an empty string
        skips writing.

    Returns
    -------
    Frame indexed by ``'id'`` with one ``'Value'`` column, sorted by id.
    """
    from statsmodels.tools.sm_exceptions import ConvergenceWarning

    orders = combine(df_dict)
    # Reuse the shared loader so the date parsing cannot drift apart.
    test_df = get_test_df(test_file)
    result_set = {}
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ConvergenceWarning)
        for gas, frame in df_dict.items():
            history = frame['Value']
            # Boolean indexing returns a new frame, so nothing is assigned
            # onto a slice of ``test_df``.
            target = test_df[test_df['Gas'] == gas]
            if target.empty:
                # No test rows for this gas: there is nothing to forecast.
                continue
            order, seasonal_order = orders[gas]
            model = SARIMAX(history, order=order, seasonal_order=seasonal_order).fit()
            # Forecast up to the last requested date, then keep only the
            # dates that actually appear in the test rows.
            pred = model.forecast(target.index[-1]).rename('Value')
            result = pd.merge(left=pred, right=target, left_index=True, right_index=True, how='inner')
            print(result.shape)
            result_set[gas] = result[['id', 'Value']].set_index(keys='id')

    if result_set:
        # Concatenate whatever gases were forecast instead of seeding the
        # result from a hard-coded 'CO' entry.
        pred_df = pd.concat(objs=list(result_set.values()), axis=0).sort_index(ascending=True)
    else:
        pred_df = pd.DataFrame(columns=['Value'])

    if submission_file:
        pred_df.to_csv(submission_file, sep=',')
    return pred_df

# Build the submission frame; the third argument (output path) is left empty.
pred_df = kaggle_submission(df_partitioned, './data/test.csv', '')

In [None]:
# pred_df.to_csv('./out/base-ii.csv', sep=',')

In [None]:
from sklearn.metrics import mean_squared_error
def evaluate_predictions(pred_df: pd.DataFrame, gas: str = 'CO'):
    """Plot the predictions of one gas together with its historical values.

    Parameters
    ----------
    pred_df : frame indexed by test ``id`` with a ``'Value'`` column.
    gas : which gas to plot (default ``'CO'``).

    Returns
    -------
    None. Both lines are drawn on the current matplotlib axes.

    Notes
    -----
    Reads the module-level ``df_partitioned`` for the historical series and
    loads the test rows from the default path of ``get_test_df``.
    """
    target = get_test_df()
    gas_ids = target[target['Gas'] == gas]['id']
    # The join puts each prediction back on the date index of its test row.
    comb = pd.merge(left=gas_ids, right=pred_df, left_on='id', right_index=True, how='inner')
    plt.plot(comb['Value'])
    plt.plot(df_partitioned[gas]['Value'])

# Plot the submitted predictions next to the historical series.
evaluate_predictions(pred_df)

In [None]:
#differencing
# from statsmodels.tsa.statespace.tools import diff

# def get_diff(df):
#     return df['Value'], diff(df['Value'])

# first, second = get_diff(df_partitioned['CO'])

# plt.plot(first.index, first.values)
# plt.plot(second.index, second.values)

In [None]:
# Significant partial-autocorrelation lags per gas (alpha = 0.05).
pacf_dict, pacf_summary = cf_summary(df_partitioned, kind='pacf', threshold=0.05)
print(pacf_dict, pacf_summary)

# Significant autocorrelation lags per gas (alpha = 0.05).
acf_dict, acf_summary = cf_summary(df_partitioned, kind='acf', threshold=0.05)
print(acf_dict, acf_summary)

In [None]:
# Loop through gases, thresholds and frequencies to build a results dataframe (something like a grid search, but for time series)

In [None]:
# def arima(df: pd.DataFrame, order: Tuple[int, int, int], print_summary: bool = False):
#     model = time_series_analysis.arima.model.ARIMA(df['Value'], order=order)
#     res = model.fit()
#     if print_summary:
#         print(res.summary())
#     return res

# arima(df=df_partitioned['CO'], order=(1, 0, 1)).aic

In [None]:
# def grid_search(
#     df: pd.Series,
#     p_space: List[int] = [1],
#     d: int = 0,
#     q_space: List[int] = [1],
#     P_space: List[int] = None,
#     D: int = None,
#     Q_space: List[int] = None,
#     m_space: List[int] = [12],
#     seasonal: bool = False,
#     use_analogue: bool = True
# ):
#     from itertools import product
#     if seasonal:
#         if use_analogue:
#             P_space = p_space
#             D = d
#             Q_space = q_space
#         else:
#             P_space = p_space if P_space is None else P_space
#             D = d if D is None else D
#             Q_space = q_space if Q_space is None else Q_space
#         for m in m_space:
#             for order in product(p_space, [d], q_space):
#                 for seasonal_order in product(P_space, [D], Q_space):
#                     p, d, q = order
#                     P, D, Q = seasonal_order
#                     print((p, d, q), (P, D, Q, m))
#                     model = SARIMAX(endog=df, order=(p, d, q), seasonal_order=(P, D, Q, m)).fit()
#                     print(model.aic)

# grid_search(df_partitioned['CO']['Value'],
#     p_space=[32, 38],
#     q_space=[3, 5, 7],
#     m_space=[8],
#     P_space=[],
#     seasonal=True)

In [None]:
# Plot the train/test split with the in-sample fit and the out-of-sample
# forecast of `final`, then evaluate the train and test RMSE as a tuple.
# NOTE(review): `train`, `test`, `target` and `history_length` are only
# assigned in a cell further down, and the only assignment of `final` in this
# notebook is commented out. Presumably this cell relies on state from an
# earlier session; confirm before re-running.
# NOTE(review): `target.shape[1]` needs a two-dimensional object, while the
# `target` assigned further down is a single column selection. Verify.
plt.plot(train)
plt.plot(test)
plt.plot(final.predict(start=0, end=int(history_length * 0.75)))
plt.plot(final.forecast(steps=target.shape[1] - int(history_length * 0.75) - 1))
mean_squared_error(train, final.predict(start=1, end=int(history_length * 0.75)), squared=False), mean_squared_error(test, final.forecast(steps=target.shape[1] - int(history_length * 0.75) - 1), squared=False)

In [None]:
# Display `final`.
# NOTE(review): no live assignment of `final` is visible in this notebook.
final

In [None]:
import pickle

# Chronological 75 % / 25 % split of the CO series.
target = df_partitioned['CO']['Value']
history_length = target.shape[0]
split_at = int(history_length * 0.75)
train = target.iloc[:split_at]
test = target.iloc[split_at:]

# The context manager closes the file even if pickling fails; a bare
# ``open(...)`` passed straight to ``pickle.dump`` is never closed.
# NOTE(review): the pickled object is the literal 1. Presumably a placeholder
# for the fitted model; confirm.
with open('./tmp/stuff.p', 'wb') as stuff_file:
    pickle.dump(obj=1, file=stuff_file)
# final = SARIMAX(train, order=(1, 0, 1), seasonal_order=(5, 1, 0, 8)).fit()
# NOTE(review): with the fit above commented out, `final` must already exist
# in the session for the next line to work.
print(final.summary())

In [None]:
# def test_facility(df_dict: Dict[str, pd.DataFrame], gas: str, order=Tuple[int, int, int]):
#     global pacf_dict, acf_dict
#     df = df_dict[gas]
#     fig, ax = plt.subplots(nrows=len(pacf_dict[gas]) * len(acf_dict[gas]), figsize=(len(pacf_dict[gas]) * len(acf_dict[gas]) * 5, 8))
#     i = 0
#     for p in pacf_dict[gas]:
#         for q in acf_dict[gas]:
#             model = arima(df=df, order=(p, order[1], q))
#             first_date = model.predict(start=0, end=0).index[0]
#             y_hat = model.predict(start=1, end=300)
#             y_hat.index = pd.date_range(start=first_date, periods=300, freq='D')
#             df_tmp = pd.DataFrame(data={
#                 'Value': y_hat,
#                 'Color': np.isin(y_hat.index, df.index)
#             })
#             hv = df_tmp[df_tmp['Color']]
#             donthv = df_tmp[~df_tmp['Color']]
#             # bridge = df_tmp[hv.index[-1]:donthv.index[0]]
#             sns.lineplot(x=hv.index, y=hv['Value'], color='blue', ax=ax[i])
#             # sns.lineplot(x=bridge.index, y=bridge['Value'], color='green', ax=ax)
#             sns.lineplot(x=donthv.index, y=donthv['Value'], color='green', ax=ax[i])
#             sns.lineplot(x=hv.index, y=df['Value'], color='orange', ax=ax[i])
#             aic = model.aic
#             print(p, q, aic)
#             i += 1

#     # print(r2_score(df[hv.index], hv['Value']))

# test_facility(df_partitioned, 'NMHC', (0, 0, 0))

In [None]:
# from pmdarima.arima import auto_arima

# def auto(df: pd.DataFrame):
#     arima_model = auto_arima(df['Value'], seasonal=True, m=8)
#     print(arima_model.summary())

# auto(df_partitioned['CO'])

In [None]:
# from statsmodels.tsa.holtwinters import Holt
# from sklearn.metrics import mean_squared_error

# model = Holt(df_partitioned['CO']['Value']).fit()
# y_hat_b4 = model.forecast(steps=100)

# y_true = df_partitioned['CO']['Value']
# model = ExponentialSmoothing(endog=df_partitioned['CO']['Value'], seasonal='mul', seasonal_periods=2).fit()
# y_hat = model.predict(start=0, end=391)
# y_pred = model.forecast(steps=50)
# plt.plot(y_true)
# plt.plot(y_hat)
# plt.plot(y_pred)
# model.aic, mean_squared_error(y_true, y_hat, squared=False)

In [None]:
# TODO: use supervised learning as a cross-check
# TODO: use a VARIMA model to check the correlation between T and RH

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

# Principal components of the standardised weather features.
weather_features = df[['T', 'RH']]
# PCA cannot extract more components than there are features, so the count
# follows the selected columns instead of a hard-coded 3 (which fails for
# this two-column input).
pca = PCA(n_components=weather_features.shape[1]).fit(StandardScaler().fit_transform(weather_features))
pca.components_
# df[['T', 'RH', 'Value']].corr()

# Part B > Clustering

## Import Exclusive Dependencies

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Import Data

In [None]:
# Load the mall-customer data (first CSV column as index) and expose the
# 'Genre' column under the name 'Gender'.
df2 = pd.read_csv('./data/Mall_Customers.csv', index_col=0).rename(columns={'Genre': 'Gender'})
df2.head()

In [None]:
# Preview the column header of the PCA results table. The lists are joined
# with ``+`` because ``list.extend`` returns None, which would leave the cell
# with nothing to display.
['Eigenvalue', 'Explained Variance', 'Cumulative Explained Variance'] + df2.drop(columns='Gender').columns.tolist()

In [None]:
from typing import Union, List

def get_pca_results(df: pd.DataFrame, ignore_cols: Union[str, List[str]]):
    """Run a full PCA on the standardised columns of ``df``.

    Parameters
    ----------
    df : input frame.
    ignore_cols : column(s) dropped before scaling and fitting.

    Returns
    -------
    tuple ``(pca_results, df_transformed)``. ``pca_results`` has one row per
    component (``'PC 1'``, ``'PC 2'``, ...) holding its eigenvalue, explained
    variance ratio, cumulative explained variance ratio and one loading per
    input column. ``df_transformed`` holds the projected data, indexed like
    ``df``.
    """
    features = df.drop(columns=ignore_cols)
    df_scaled = StandardScaler().fit_transform(X=features)
    n_components = df_scaled.shape[1]
    pc_labels = [f'PC {k}' for k in range(1, n_components + 1)]

    pca = PCA(n_components=n_components).fit(X=df_scaled)
    variance_table = pd.DataFrame(
        data={
            'Eigenvalue': pca.explained_variance_,
            'Explained Variance': pca.explained_variance_ratio_,
            'Cumulative Explained Variance': pca.explained_variance_ratio_.cumsum(),
        },
        index=pc_labels,
    )
    # One row per component, one column per original feature.
    loadings = pd.DataFrame(data=pca.components_, index=pc_labels, columns=features.columns.tolist())
    pca_results = pd.concat(objs=(variance_table, loadings), axis=1)

    df_transformed = pd.DataFrame(data=pca.transform(df_scaled), index=df.index, columns=pc_labels)

    return pca_results, df_transformed

# PCA over every column except 'Gender'; display the component summary.
pca_results, df2_transformed = get_pca_results(df=df2, ignore_cols='Gender')
pca_results

In [None]:
def scree_plot(df: pd.DataFrame, pca: pd.DataFrame):
    """Draw the eigenvalue of every principal component as a point plot.

    Parameters
    ----------
    df : not used by the plot.
    pca : frame indexed by component label with an ``'Eigenvalue'`` column.

    Returns
    -------
    The matplotlib axes holding the plot.
    """
    with sns.axes_style(style='darkgrid'):
        ax = sns.pointplot(data=pca, x=pca.index, y=pca['Eigenvalue'])
        ax.set_title('Scree Plot for PCA (df2)')
        ax.set_ylim(0, 1.4)
        # The annotation text and its position are fixed.
        ax.annotate(
            text='As there is no elbow,\nno PC should be discarded',
            xy=(1.75, 1.2),
            ha='center',
        )
    return ax

# Scree plot of the eigenvalues computed above.
scree_plot(df2, pca_results)

In [None]:
def get_dist_score(df: pd.DataFrame, clusters: int):
    """Measure how evenly tight the K-Means clusters of ``df`` are.

    K-Means is fitted on the ``'Annual Income (k$)'`` and
    ``'Spending Score (1-100)'`` columns. For every cluster the mean
    Euclidean distance of its members to its centre is computed; the score is
    the standard deviation of those per-cluster means.

    Parameters
    ----------
    df : frame containing the two columns named above.
    clusters : number of clusters to fit.

    Returns
    -------
    float: standard deviation of the per-cluster mean distances. No
    ``random_state`` is set, so repeated calls may differ.
    """
    feature_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
    points = df[feature_cols]
    model = KMeans(n_clusters=clusters).fit(X=points)
    y_hat = model.predict(points)
    # Mean member-to-centre distance, one entry per cluster.
    mean_distances = [
        np.linalg.norm(points[y_hat == label] - center, axis=1).mean()
        for label, center in enumerate(model.cluster_centers_)
    ]
    return np.std(mean_distances)

# Distance-spread score for a 5-cluster solution.
get_dist_score(df2, 5)

In [None]:
# type: ignore
from sklearn.metrics import silhouette_score
from more_itertools import powerset

def get_silhouette_score_plot(df: pd.DataFrame, ignore_cols: Union[str, List[str]] = None):
    """Plot silhouette scores for 2-9 clusters over every column combination.

    One subplot is drawn per combination of at least two of the columns left
    after dropping ``ignore_cols``. Each subplot shows the silhouette score of
    a K-Means fit per cluster count and, in green on a twin axis, the value of
    ``get_dist_score`` for the same cluster count.

    Parameters
    ----------
    df : input frame.
    ignore_cols : column(s) excluded from the combinations; ``None`` keeps all.

    Returns
    -------
    None. The figure is drawn through matplotlib / seaborn.
    """
    ignore_cols = [] if ignore_cols is None else ignore_cols
    candidate_cols = df.drop(columns=ignore_cols).columns
    col_combs = [subset for subset in powerset(candidate_cols) if len(subset) > 1]
    fig, ax = plt.subplots(nrows=len(col_combs), figsize=(8, len(col_combs) * 5))
    clusters = list(range(2, 10))
    for jj, col_comb in enumerate(col_combs):
        # ``plt.subplots`` returns a bare axes object when there is one row.
        current_axis = ax[jj] if len(col_combs) > 1 else ax
        col_list = list(col_comb)
        features = df[col_list]
        dists = []
        scores = []
        for k in clusters:
            model = KMeans(n_clusters=k).fit(X=features)
            y_hat = model.predict(X=features)
            # NOTE(review): get_dist_score is given the whole frame, not this
            # column combination. Confirm that is intended.
            dists.append(get_dist_score(df, k))
            scores.append(silhouette_score(X=features, labels=y_hat))
        sil = pd.DataFrame(data={'Num': clusters, ', '.join(col_list): scores})
        sns.lineplot(data=sil.melt(id_vars='Num'), x='Num', y='value', hue='variable', ax=current_axis)
        sns.lineplot(x=clusters, y=dists, ax=current_axis.twinx(), color='tab:green')

# Silhouette curves for every combination of the non-'Gender' columns.
get_silhouette_score_plot(df=df2, ignore_cols='Gender')

In [None]:
# Side-by-side K-Means scatter plots for 4 and 5 clusters on income vs.
# spending score, printing the silhouette score of each solution.
cluster_params = [4, 5]
income_spend = df2[['Annual Income (k$)', 'Spending Score (1-100)']]
fig, ax = plt.subplots(ncols=2, figsize=(10, 8))
for i, cl in enumerate(cluster_params):
    model = KMeans(n_clusters=cl).fit(X=income_spend)
    y_hat = model.predict(income_spend)
    sns.scatterplot(data=df2, x='Annual Income (k$)', y='Spending Score (1-100)', hue=y_hat, ax=ax[i])
    score = silhouette_score(X=income_spend, labels=model.labels_, metric='euclidean')
    print(f'Silhouette Score ({cl}):', score)

In [None]:
import plotly.express as px
from sklearn.cluster import DBSCAN, OPTICS, AgglomerativeClustering, AffinityPropagation

# 3-D scatter plot per clustering model; each title carries the model's class
# name and its silhouette score rounded to two decimals.
features_3d = df2[['Annual Income (k$)', 'Age', 'Spending Score (1-100)']]
for mo in [KMeans(n_clusters=6), DBSCAN(eps=15, min_samples=15), OPTICS(max_eps=18)]:
    colrs = mo.fit_predict(X=features_3d)
    score = round(silhouette_score(X=features_3d, labels=colrs), 2)
    plot_title = type(mo).__name__ + ' ' + str(score)
    fig = px.scatter_3d(
        data_frame=df2,
        x='Annual Income (k$)',
        y='Age',
        z='Spending Score (1-100)',
        color=colrs,
        title=plot_title,
        color_continuous_scale=px.colors.sequential.Rainbow,
    )
    fig.show()