# Part A > Time Series Regression

In [None]:
# Data Manipulation Dependencies
import numpy as np
import pandas as pd

# Graphing Dependencies
import matplotlib.pyplot as plt
import seaborn as sns

# Time Series Dependency
import statsmodels.tsa as time_series_analysis
from pmdarima.arima import arima

# Clustering Dependency
from sklearn.cluster import KMeans

from typing import Union, List, Tuple
import warnings

In [None]:
# Ignore every warning raised from here on. This keeps the notebook output
# clean, but it also hides deprecation notices from the libraries used below.
warnings.filterwarnings(action='ignore')

In [None]:
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

## Import Data

In [None]:
# Load the training data, parse its 'Date' column, and use it as a
# chronologically sorted index.
df = pd.read_csv('./data/train.csv')
df = (
    df.assign(Date=pd.to_datetime(df['Date']))
    .set_index(keys='Date')
    .sort_index()
)
df

In [None]:
# One sub-frame per distinct 'Gas' label, in the order the labels first appear.
# NOTE(review): the four names are bound purely by that order - confirm that it
# matches the labels actually present in the data.
CO, NMHC, NOx, O3 = (df.loc[df['Gas'] == label] for label in df['Gas'].unique())

# Non-null entries of every column of the CO frame per (year, quarter).
co_dates = pd.to_datetime(CO.index)
CO.groupby(by=[co_dates.year, co_dates.quarter]).count()

In [None]:
def get_data_grouped_by_date(df: pd.DataFrame, freq: Union[str, List[str]]):
    """Plot the resampled means of 'T', 'RH' and 'Value', one subplot per frequency.

    'T' and 'RH' are drawn on the left axis and 'Value' (green) on a twin
    right axis of the same subplot.

    Args:
        df: Frame containing the columns 'T', 'RH' and 'Value'. It is passed
            to ``DataFrame.resample``, so its index must be datetime-like.
        freq: A single resampling rule or a list of rules; each rule gets its
            own subplot row.

    Raises:
        ValueError: If ``freq`` is an empty list.
    """
    def get_plot(df: pd.DataFrame, fr: str, ax):
        # Average only the plotted columns: taking the mean of the whole frame
        # would include non-numeric columns (e.g. 'Gas'), which raises a
        # TypeError on pandas >= 2.0.
        gbo = df[['T', 'RH', 'Value']].resample(rule=fr).mean()

        ax.plot(gbo.index, gbo['T'], label='T')
        ax.plot(gbo.index, gbo['RH'], label='RH')
        # Empty proxy line so that 'Value', which lives on the twin axis,
        # still gets an entry in this axis' legend.
        ax.plot([], [], color='g', label='Value')

        ax2 = ax.twinx()
        ax2.plot(gbo.index, gbo['Value'], color='green')
        return ax

    freqs = [freq] if isinstance(freq, str) else list(freq)
    if not freqs:
        raise ValueError('freq must contain at least one resampling rule')

    if len(freqs) == 1:
        fig, ax = plt.subplots()
        get_plot(df, freqs[0], ax)
    else:
        fig, axes = plt.subplots(nrows=len(freqs), figsize=(8, 5))
        for axis, fr in zip(axes, freqs):
            get_plot(df, fr, axis)
        # A single legend, anchored relative to the last subplot.
        axes[-1].legend(bbox_to_anchor=(1.3, 3.3))

get_data_grouped_by_date(df=NMHC, freq=['M', '2M', 'Q', 'Y'])

In [None]:
def get_gas_distributions(df: pd.DataFrame):
    """Draw a bar chart of how many records each gas has.

    For every 'Gas' group the non-null count of each remaining column is
    taken; the bar height is the median of those per-column counts, cast to int.
    """
    per_column_counts = df.groupby(by='Gas').count()
    counts = per_column_counts.median(axis=1).astype(int)
    sns.barplot(x=counts.index, y=counts, palette='deep')

get_gas_distributions(df)

In [None]:
def plot_(df):
    """Bar-plot the mean and the median 'Value' of every gas side by side.

    Args:
        df: Frame with a 'Gas' column to group on and a numeric 'Value' column.
    """
    gb = df.groupby(by='Gas', as_index=False)
    # Build both aggregates once and tag each with its statistic so seaborn
    # can separate them through ``hue``. numeric_only keeps the aggregation
    # working on pandas >= 2.0 should the frame carry other text columns.
    comb_df = pd.concat(
        objs=[
            gb.mean(numeric_only=True).assign(Type='Mean'),
            gb.median(numeric_only=True).assign(Type='Median'),
        ],
        axis=0,
    )
    sns.barplot(data=comb_df, x='Gas', y='Value', hue='Type', palette='rainbow')
plot_(df)

In [None]:
def get_stationarity_summary(dfs: Tuple[pd.DataFrame, ...], plot: bool = False):
    """Run an ADF test on the monthly mean of every numeric column of each frame.

    Each frame's name and per-column verdict (with the rounded p-value) are
    printed as a side effect.

    Args:
        dfs: Frames to test. Each needs a 'Gas' column, whose first entry is
            used as the frame's name, and an index accepted by ``resample``.
        plot: If True, also draw a seasonal decomposition of every tested
            monthly series.

    Returns:
        A DataFrame with one row per frame and one column per numeric column;
        True means the ADF p-value was below 0.05.
    """
    def get_stationarity_of_columns(df: pd.DataFrame, significance_level: float = 0.05):
        stationarities = []
        numerical_cols = []
        for col in df.columns:
            if df[col].dtype.kind not in 'biufc':
                continue
            numerical_cols.append(col)
            # Resample only this column, and only once. Averaging the whole
            # frame would include the text 'Gas' column (a TypeError on
            # pandas >= 2.0) and repeat the work for every column.
            monthly = df[col].resample(rule='M').mean()
            p_value = round(adfuller(monthly)[1], 5)
            is_stationary = bool(p_value < significance_level)
            stationarities.append(is_stationary)
            verdict = 'Stationary' if is_stationary else 'Non-Stationary'
            print(f'{col}:\t{verdict} ({p_value})'.expandtabs(tabsize=10))
            if plot:
                seasonal_decompose(monthly).plot()
        return numerical_cols, stationarities

    serieses = []
    for gas in dfs:
        # .iloc is explicit about "first row"; a plain [0] on a label-indexed
        # Series is a deprecated positional fallback.
        gas_name = gas['Gas'].iloc[0]
        print(gas_name)
        c, st = get_stationarity_of_columns(gas)
        serieses.append(pd.Series(name=gas_name, index=c, data=st))
        print()
    return pd.concat(objs=serieses, axis=1).T

stats = get_stationarity_summary((CO, NMHC, NOx, O3))

In [None]:
def groupby_date(df: pd.DataFrame, freq: str):
    """Return the per-period mean of the numeric columns of ``df``.

    Args:
        df: Frame whose index is accepted by ``DataFrame.resample``
            (datetime-like). It is not modified.
        freq: Resampling rule handed to ``resample`` (e.g. 'M').

    Returns:
        A new DataFrame indexed by period with the mean of each numeric
        column. Non-numeric columns such as 'Gas' are left out, because
        averaging them raises a TypeError on pandas >= 2.0.
    """
    return df.resample(rule=freq).mean(numeric_only=True)

# Preview: the CO frame resampled with rule 'M', shown as the cell output.
groupby_date(CO, 'M')

In [None]:
from statsmodels.tsa.stattools import pacf, acf
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf

def cf_summary(dfs: Tuple[pd.DataFrame, ...], kind: str, resample: Union[str, None] = 'M', threshold: float = 0.01, plot: bool = False):
    """Find, per frame, the most strongly correlated significant lag of 'Value'.

    Args:
        dfs: Frames with a 'Value' column and a 'Gas' column whose first entry
            names the frame.
        kind: 'pacf' for partial autocorrelation; any other value selects the
            plain autocorrelation ('acf').
        resample: Rule passed to ``groupby_date`` before analysis, or None to
            analyse the frame as given.
        threshold: Passed as ``alpha`` to the statsmodels correlation and plot
            functions for their confidence intervals.
        plot: If True, draw the correlation plot with the candidate lags in
            its title.

    Returns:
        A DataFrame indexed by 'Gas' with a column 't' holding the chosen lag,
        or 0 when no lag other than lag 0 is significant.
    """
    corr_fn = pacf if kind == 'pacf' else acf
    plot_fn = plot_pacf if kind == 'pacf' else plot_acf
    gases = []
    best_ts = []
    for df in dfs:
        # .iloc instead of [0]: integer keys on a label-indexed Series are a
        # deprecated positional fallback.
        current_gas = df['Gas'].iloc[0]
        gases.append(current_gas)
        gbo = groupby_date(df, resample) if resample is not None else df
        # Use just under half of the observations (counted on the first
        # column) as the number of lags.
        lag_space = int(gbo.count().iloc[0] / 2) - 1
        corr_scores, conf_intvs = corr_fn(gbo['Value'], nlags=lag_space, alpha=threshold)
        # The intervals are reported around each estimate; subtracting the
        # estimate re-centres them on zero so scores can be compared to them.
        lower_conf_bound = conf_intvs[:, 0] - corr_scores
        upper_conf_bound = conf_intvs[:, 1] - corr_scores
        significant = np.where((corr_scores < lower_conf_bound) | (corr_scores > upper_conf_bound))[0]
        # Lag 0 is the series against itself, so it is never a candidate.
        t_values = significant[significant > 0]
        best_t = t_values[np.argmax(np.abs(corr_scores[t_values]))] if t_values.size > 0 else 0
        best_ts.append(best_t)
        if plot:
            figure = plot_fn(gbo['Value'], lags=lag_space, alpha=threshold)
            figure.get_axes()[0].set_title(f'{current_gas}: possible t-values: {tuple(t_values)} Best t: {best_t}')
    return pd.DataFrame(data={
        'Gas': gases,
        't': best_ts
    }).set_index(keys='Gas')

In [None]:
# Best significant lag per gas, first from the partial and then from the plain
# autocorrelation, both with alpha = 0.01.
_gas_frames = (CO, NMHC, NOx, O3)

pacf_summary = cf_summary(_gas_frames, kind='pacf', threshold=0.01)
print(pacf_summary)

acf_summary = cf_summary(_gas_frames, kind='acf', threshold=0.01)
print(acf_summary)

In [None]:
from statsmodels.tsa.statespace.tools import diff

def get_diff(df):
    """Return the monthly mean 'Value' series of ``df`` and its differenced version.

    The differencing uses statsmodels' ``diff`` with its default settings.
    """
    monthly_value = groupby_date(df, 'M')['Value']
    return monthly_value, diff(monthly_value)

# `first` is the monthly NOx series, `second` its differenced counterpart.
first, second = get_diff(NOx)

# Overlay both series on the same axes.
plt.plot(first.index, first.values)
plt.plot(second.index, second.values)

In [None]:
# Wrap both NOx series in frames with the 'Value' and 'Gas' columns that
# cf_summary reads; trial1 holds the differenced series, trial2 the original.
trial1 = pd.DataFrame(data={'Value': second, 'Gas': ['NOx'] * second.shape[0]})
trial2 = pd.DataFrame(data={'Value': first, 'Gas': ['NOx'] * first.shape[0]})

# resample=None: both series are analysed as given.
cf_summary((trial1, trial2), kind='pacf', threshold=0.05, resample=None)

In [None]:
# TODO: Loop through gases, thresholds and frequencies to build a summary DataFrame (similar to a grid search, but for time series)

In [None]:
def arima(df: pd.DataFrame, freq: str, order: Tuple[int, int, int], print_summary: bool = False):
    """Fit an ARIMA model on the resampled mean 'Value' series of ``df``.

    Args:
        df: Frame with a 'Value' column; it is resampled through
            ``groupby_date``.
        freq: Resampling rule passed to ``groupby_date``.
        order: The (p, d, q) order handed to the model.
        print_summary: If True, print the fitted model's summary.

    Returns:
        A tuple ``(series, results)``: the resampled 'Value' series the model
        was fitted on, and the fitted results object.
    """
    # Imported here on purpose. `import statsmodels.tsa` does not load the
    # `arima_model` submodule by itself, and the legacy ARIMA class that lived
    # there is no longer available in newer statsmodels releases.
    # NOTE(review): for d > 0 the current ARIMA class predicts on the original
    # scale, which differs from the legacy default - confirm that this is what
    # downstream plots expect.
    from statsmodels.tsa.arima.model import ARIMA

    return_df = groupby_date(df, freq)['Value']
    res = ARIMA(return_df, order=order).fit()
    if print_summary:
        print(res.summary())
    return return_df, res

arima(df=NOx, freq='M', order=(1, 0, 1))[0]

In [None]:
def test_facility():
    """Fit ARIMA(2, 1, 0) on monthly NOx and plot its predictions for steps 2-20.

    Predictions whose timestamp also occurs in ``NOx.index`` are drawn in blue
    and the remaining ones in green, with a green segment joining the two
    groups. The fitted series at the blue timestamps is overlaid in orange.
    """
    og, model = arima(df=NOx, freq='M', order=(2, 1, 0))
    y_hat = model.predict(start=2, end=20)

    # 'Color' flags the predictions whose timestamp exists in the raw NOx index.
    tdf = pd.DataFrame(data={
        'Value': y_hat,
        'Color': np.isin(y_hat.index, NOx.index)
    })
    flagged = tdf['Color']
    hv, donthv = tdf[flagged], tdf[~flagged]
    # Rows from the last flagged timestamp to the first unflagged one.
    bridge = tdf[hv.index[-1]:donthv.index[0]]

    ax = sns.lineplot(x=hv.index, y=hv['Value'], color='blue')
    for segment in (bridge, donthv):
        sns.lineplot(x=segment.index, y=segment['Value'], color='green', ax=ax)
    sns.lineplot(x=hv.index, y=og[hv.index], color='orange', ax=ax)

test_facility()

In [None]:
from pmdarima.arima import auto_arima

def auto(df: pd.DataFrame, freq: str):
    """Let ``auto_arima`` choose a non-seasonal model for 'Value' and print its summary.

    Args:
        df: Frame with a 'Value' column; the column is used as given, without
            resampling.
        freq: Accepted but not used by the current implementation.
    """
    values = df['Value'].copy()
    fitted = auto_arima(values, seasonal=False)
    print(fitted.summary())

auto(O3, 'M')

In [None]:
# TODO: Use supervised learning to cross-check the time series results
# TODO: Use a VARIMA model to check the correlation between T and RH

# Part B > Clustering

## Import Exclusive Dependencies

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Import Data

In [None]:
# Load the mall-customer data (first CSV column as index) and rename the
# 'Genre' column to 'Gender'.
df2 = pd.read_csv('./data/Mall_Customers.csv', index_col=0).rename(columns={'Genre': 'Gender'})
df2.head()

In [None]:
# Preview the column header of the PCA summary table. List concatenation is
# used because list.extend() returns None, so the cell would display nothing.
['Eigenvalue', 'Explained Variance', 'Cumulative Explained Variance'] + df2.drop(columns='Gender').columns.tolist()

In [None]:
from typing import Union, List

def get_pca_results(df: pd.DataFrame, ignore_cols: Union[str, List[str]]):
    """Standardise the kept columns of ``df``, fit a full PCA and tabulate it.

    Args:
        df: Input frame.
        ignore_cols: Column name(s) dropped before scaling and PCA.

    Returns:
        A tuple ``(pca_results, df_transformed)``. ``pca_results`` has one row
        per component ('PC 1', 'PC 2', ...) holding its eigenvalue, explained
        variance ratio, cumulative ratio, and its loading on every kept column.
        ``df_transformed`` is the scaled data projected onto the components,
        indexed like ``df``.
    """
    features = df.drop(columns=ignore_cols)
    feature_names = features.columns.tolist()

    df_scaled = StandardScaler().fit_transform(X=features)
    n_components = df_scaled.shape[1]
    pc_labels = [f'PC {k}' for k in range(1, n_components + 1)]

    pca = PCA(n_components=n_components).fit(X=df_scaled)
    variance_ratio = pca.explained_variance_ratio_

    # Three summary columns followed by the component matrix (one row per PC).
    summary = np.column_stack((
        pca.explained_variance_,
        variance_ratio,
        variance_ratio.cumsum(),
        pca.components_,
    ))
    pca_results = pd.DataFrame(
        data=summary,
        columns=['Eigenvalue', 'Explained Variance', 'Cumulative Explained Variance', *feature_names],
        index=pc_labels,
    )

    df_transformed = pd.DataFrame(
        data=pca.transform(df_scaled),
        index=df.index,
        columns=pc_labels,
    )
    return pca_results, df_transformed

pca_results, df2_transformed = get_pca_results(df=df2, ignore_cols='Gender')
pca_results

In [None]:
def scree_plot(df: pd.DataFrame, pca: pd.DataFrame):
    """Draw a scree plot of the 'Eigenvalue' column of ``pca``.

    Args:
        df: Not used by the current implementation.
        pca: Summary table with an 'Eigenvalue' column; its index supplies the
            x-axis categories.

    Returns:
        The matplotlib axes the plot was drawn on.
    """
    with sns.axes_style(style='darkgrid'):
        ax = sns.pointplot(data=pca, x=pca.index, y=pca['Eigenvalue'])
        ax.set_title('Scree Plot for PCA (df2)')
        ax.set_ylim((0, 1.4))
        ax.annotate(
            text='As there is no elbow,\nno PC should be discarded',
            xy=(1.75, 1.2),
            ha='center',
        )
    return ax

scree_plot(df2, pca_results)

In [None]:
def get_dist_score(df: pd.DataFrame, clusters: int, cols: Union[List[str], None] = None, random_state=None):
    """Measure how unevenly spread the K-Means clusters of ``df`` are.

    K-Means is fitted on the selected columns. For every cluster the mean
    Euclidean distance of its members to the cluster centre is computed, and
    the standard deviation of those per-cluster means is returned.

    Args:
        df: Frame containing the feature columns.
        clusters: Number of clusters to fit.
        cols: Feature columns to cluster on. Defaults to
            'Annual Income (k$)' and 'Spending Score (1-100)'.
        random_state: Forwarded to ``KMeans``; the default None leaves the
            initialisation unseeded.

    Returns:
        The standard deviation of the per-cluster mean member-to-centre
        distances.
    """
    if cols is None:
        cols = ['Annual Income (k$)', 'Spending Score (1-100)']
    features = df[list(cols)]

    model = KMeans(n_clusters=clusters, random_state=random_state).fit(X=features)
    y_hat = model.predict(features)

    # Mean member-to-centre distance of each cluster, in centre order.
    means = [
        np.linalg.norm(features[y_hat == label].to_numpy() - center, axis=1).mean()
        for label, center in enumerate(model.cluster_centers_)
    ]
    return np.std(means)

get_dist_score(df2, 5)

In [None]:
# type: ignore
from sklearn.metrics import silhouette_score
from more_itertools import powerset

def get_silhouette_score_plot(df: pd.DataFrame, ignore_cols: Union[str, List[str]] = None):
    """Plot K-Means silhouette scores for every combination of two or more columns.

    One subplot row is drawn per column combination. For 2 to 9 clusters the
    silhouette score is drawn on the left axis, and the value of
    ``get_dist_score`` is drawn in green on a twin axis.

    Args:
        df: Frame holding the candidate feature columns.
        ignore_cols: Column name(s) excluded from the combinations; None keeps
            every column.
    """
    if ignore_cols is None:
        ignore_cols = []
    candidate_cols = df.drop(columns=ignore_cols).columns
    col_combs = [comb for comb in powerset(candidate_cols) if len(comb) > 1]

    fig, ax = plt.subplots(nrows=len(col_combs), figsize=(8, len(col_combs) * 5))
    clusters = list(range(2, 10))
    for jj, col_comb in enumerate(col_combs):
        # plt.subplots returns a bare Axes (not an array) when there is one row.
        current_axis = ax if len(col_combs) <= 1 else ax[jj]
        col_list = list(col_comb)
        dists = []
        silhoutte_scores = []
        for n_clusters in clusters:
            model = KMeans(n_clusters=n_clusters).fit(X=df[col_list])
            y_hat = model.predict(X=df[col_list])
            # NOTE(review): get_dist_score is not given col_list, so this value
            # does not depend on the column combination - confirm intent.
            dists.append(get_dist_score(df, n_clusters))
            silhoutte_scores.append(silhouette_score(X=df[col_list], labels=y_hat))

        # Long format (Num / variable / value) so seaborn can label the line
        # with the column combination.
        sil = pd.DataFrame(data={'Num': clusters, ', '.join(col_list): silhoutte_scores})
        sns.lineplot(data=sil.melt(id_vars='Num'), x='Num', y='value', hue='variable', ax=current_axis)
        sns.lineplot(x=clusters, y=dists, ax=current_axis.twinx(), color='tab:green')

get_silhouette_score_plot(df=df2, ignore_cols='Gender')

In [None]:
# Compare K-Means with 4 and 5 clusters on income vs. spending score: one
# scatter plot per setting, plus the silhouette score printed for each.
cluster_params = [4, 5]
income_spending = df2[['Annual Income (k$)', 'Spending Score (1-100)']]
fig, ax = plt.subplots(ncols=2, figsize=(10, 8))
for i, cl in enumerate(cluster_params):
    model = KMeans(n_clusters=cl).fit(X=income_spending)
    y_hat = model.predict(income_spending)
    sns.scatterplot(
        data=df2,
        x='Annual Income (k$)',
        y='Spending Score (1-100)',
        hue=y_hat,
        ax=ax[i],
    )
    sil_value = silhouette_score(X=income_spending, labels=model.labels_, metric='euclidean')
    print(f'Silhouette Score ({cl}):', sil_value)

In [None]:
import plotly.express as px
from sklearn.cluster import DBSCAN, OPTICS, AgglomerativeClustering, AffinityPropagation

# Cluster on income, age and spending score with three algorithms and show each
# result as a 3-D scatter plot titled with the algorithm name and its
# silhouette score rounded to two decimals.
axes_3d = ['Annual Income (k$)', 'Age', 'Spending Score (1-100)']
for mo in (KMeans(n_clusters=6), DBSCAN(eps=15, min_samples=15), OPTICS(max_eps=18)):
    colrs = mo.fit_predict(X=df2[axes_3d])
    sil_3d = round(silhouette_score(X=df2[axes_3d], labels=colrs), 2)
    fig = px.scatter_3d(
        data_frame=df2,
        x='Annual Income (k$)',
        y='Age',
        z='Spending Score (1-100)',
        color=colrs,
        title=type(mo).__name__ + ' ' + str(sil_3d),
        color_continuous_scale=px.colors.sequential.Rainbow,
    )
    fig.show()