# Part A > Time Series Regression

In [None]:
# Data Manipulation Dependencies
import numpy as np
import pandas as pd

# Graphing Dependencies
import matplotlib.pyplot as plt
import seaborn as sns

# Time Series Dependency
import statsmodels as sm

# Clustering Dependency
from sklearn.cluster import KMeans

In [None]:
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

## Import Data

In [None]:
# Read the training data and convert the 'Date' column to datetimes.
df = pd.read_csv('./data/train.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
# Display only: sort_values returns a new frame, so `df` itself keeps the
# row order of the CSV file.
df.sort_values(by='Date')

In [None]:
# Entries of column 'T' that are below zero (single .loc lookup instead of
# chained indexing).
df.loc[df['T'] < 0, 'T']

In [None]:
# Average the numeric columns over 2-month bins. numeric_only=True keeps the
# aggregation from failing on non-numeric columns (pandas >= 2.0 raises a
# TypeError instead of silently dropping them).
# NOTE(review): 'Gas' looks like a string column at this point, since it is
# label-encoded in a later cell - confirm.
gbo = df.set_index(keys='Date').resample(rule='2M').mean(numeric_only=True)

# Draw on explicit axes rather than relying on pyplot's "current axes" state.
fig, ax = plt.subplots()
ax.plot(gbo.index, gbo['T'], label='T')
ax.plot(gbo.index, gbo['RH'], label='RH')

# 'Value' is drawn against a second y-axis that shares the same x-axis.
ax2 = ax.twinx()
ax2.plot(gbo.index, gbo['Value'], label='Value', color='green')

# Pin the legends to opposite corners so they do not overlap each other.
ax.legend(loc='upper left')
ax2.legend(loc='upper right')

In [None]:
def plot_(df):
    """Print and bar-plot the per-'Gas' mean and median of 'Value'.

    ``df`` is grouped by its 'Gas' column. The group-wise mean and median
    tables are each tagged with a 'Type' column ('Mean' / 'Median'), stacked
    row-wise, printed, and drawn as a grouped seaborn bar plot with 'Gas' on
    the x-axis, 'Value' on the y-axis and 'Type' as the hue.

    Parameters
    ----------
    df : DataFrame with at least the columns 'Gas' and 'Value'.

    Returns
    -------
    None
    """
    gb = df.groupby(by='Gas', as_index=False)
    # Build both summary tables and concatenate once, instead of growing an
    # initially empty frame inside a loop.
    comb_df = pd.concat(
        objs=[
            gb.mean().assign(Type='Mean'),
            gb.median().assign(Type='Median'),
        ],
        axis=0,
    )
    print(comb_df)
    sns.barplot(data=comb_df, x='Gas', y='Value', hue='Type', palette='rainbow')
plot_(df)

In [None]:
def get_stationarity_of_columns(df: pd.DataFrame, significance_level: float = 0.01):
    """Print an ADF stationarity verdict and plot a decomposition per numeric column.

    For every column of ``df`` other than 'Date' whose dtype kind is one of
    'biufc' (bool, signed/unsigned int, float, complex):

    * the series, indexed by 'Date', is passed to ``adfuller`` and its
      p-value is compared with ``significance_level``;
    * a line ``'<col>: Stationary (<p>)'`` or ``'<col>: Non-Stationary (<p>)'``
      is printed, with the p-value rounded to 5 decimals;
    * the series is resampled to daily sums and the result of
      ``seasonal_decompose`` is plotted.

    Parameters
    ----------
    df : DataFrame containing a 'Date' column plus the columns to test.
    significance_level : p-values strictly below this are reported as
        stationary.

    Returns
    -------
    None
    """
    # Index by date once; set_index also drops 'Date' from the columns, so the
    # loop sees the same columns as df.drop(columns='Date').
    indexed = df.set_index(keys='Date')
    for col in indexed.columns:
        series = indexed[col]
        if series.dtype.kind not in 'biufc':
            continue
        p_value = adfuller(series)[1]
        # Compare the unrounded p-value: rounding first would push values just
        # under the threshold (e.g. 0.009996 -> 0.01) to the wrong side.
        verdict = 'Stationary' if p_value < significance_level else 'Non-Stationary'
        print(f'{col}:\t{verdict} ({round(p_value, 5)})'.expandtabs(tabsize=10))
        seasonal_decompose(series.resample(rule='D').sum()).plot()

get_stationarity_of_columns(df)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives calendar columns from 'Date'.

    ``transform`` adds the columns 'Year', 'Month', 'Day' and 'Quarter' to the
    frame it is given. The frame is modified in place and the same object is
    returned.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        ``y`` is accepted and ignored so that ``fit_transform(X, y)`` and
        supervised pipelines, which forward the target to ``fit``, also work.
        """
        return self

    def transform(self, X):
        """Add 'Year', 'Month', 'Day' and 'Quarter' columns to ``X``.

        Parameters
        ----------
        X : DataFrame with a 'Date' column that ``pd.DatetimeIndex`` can parse.

        Returns
        -------
        The same ``X`` object, with the four columns added or overwritten.
        """
        # Parse the dates once and reuse the index for every derived column.
        dates = pd.DatetimeIndex(X['Date'])
        X['Year'] = dates.year
        X['Month'] = dates.month
        X['Day'] = dates.day
        X['Quarter'] = dates.quarter
        return X

In [None]:
# Apply the date-feature transformer to the training frame and display the
# result; transform() writes the new columns into `df` itself.
date_feature_transformer = CustomTransformer()
date_feature_transformer.fit_transform(df)

In [None]:
# Derive the calendar columns from 'Date', parsing the dates only once.
dates = pd.DatetimeIndex(df['Date'])
df['Year'], df['Month'] = dates.year, dates.month
df['Day'], df['Quarter'] = dates.day, dates.quarter

In [None]:
# Non-null count of every remaining column for each (Year, Quarter) pair.
by_year_quarter = df.groupby(['Year', 'Quarter'])
by_year_quarter.count()

In [None]:
# 'Value' against day of month, coloured by month.
sns.scatterplot(
    x='Day',
    y='Value',
    hue='Month',
    data=df,
)

In [None]:
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
# Bagging is randomised (bootstrap sampling); the seed is fixed so that a
# re-run of the notebook reproduces the same model and score.
model = BaggingRegressor(random_state=42)
# Replace the 'Gas' labels with integer codes; this overwrites the column in `df`.
df['Gas'] = LabelEncoder().fit_transform(df['Gas'])

# Build the feature matrix once and reuse it for fitting and predicting.
features = df.drop(columns=['Value', 'Date', 'Quarter'])
model.fit(X=features, y=df['Value'])
pred = model.predict(X=features)

from sklearn.metrics import r2_score
# In-sample R^2: predictions are made on the same rows the model was fitted
# on, so this is an optimistic score, not a generalisation estimate.
r2_score(df['Value'], pred)

In [None]:
# Load the test set (indexed by its 'id' column) and add the calendar
# features; transform() writes them into `test` and the result is displayed.
test = pd.read_csv(filepath_or_buffer='./data/test.csv', index_col='id')
CustomTransformer().transform(X=test)

In [None]:
# Display the training frame in its current state (earlier cells add the
# calendar columns and overwrite 'Gas' with integer codes).
df

# Part B > Clustering

## Import Exclusive Dependencies

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Import Data

In [None]:
# Load the mall-customer data (first CSV column becomes the index) and
# rename the 'Genre' column to 'Gender'.
df2 = (
    pd.read_csv('./data/Mall_Customers.csv', index_col=0)
    .rename(columns={'Genre': 'Gender'})
)
df2.head()

In [None]:
# Preview the column header used for the PCA summary table. list.extend()
# mutates in place and returns None, so the lists are concatenated with `+`
# to make the cell actually display the combined header.
['Eigenvalue', 'Explained Variance', 'Cumulative Explained Variance'] + df2.drop(columns='Gender').columns.tolist()

In [None]:
from typing import Union, List

def get_pca_results(df: pd.DataFrame, ignore_cols: Union[str, List[str]]):
    """Standardise the data, fit a full PCA and tabulate the outcome.

    The columns named in ``ignore_cols`` are dropped, the rest are scaled with
    ``StandardScaler`` and a PCA with as many components as remaining columns
    is fitted.

    Parameters
    ----------
    df : input frame.
    ignore_cols : column name or list of names to exclude from the PCA.

    Returns
    -------
    pca_results : DataFrame with one row per component ('PC 1', 'PC 2', ...)
        holding 'Eigenvalue', 'Explained Variance', 'Cumulative Explained
        Variance' and then one column per input feature with that
        component's entry from ``pca.components_``.
    df_transformed : DataFrame of the scaled data projected onto the
        components, indexed like ``df`` with columns 'PC 1', 'PC 2', ...
    """
    features = df.drop(columns=ignore_cols)
    scaled = StandardScaler().fit_transform(X=features)
    n_components = scaled.shape[1]
    pc_labels = [f'PC {k}' for k in range(1, n_components + 1)]

    pca = PCA(n_components=n_components).fit(X=scaled)
    variance_ratio = pca.explained_variance_ratio_

    summary_columns = ['Eigenvalue', 'Explained Variance', 'Cumulative Explained Variance']
    # column_stack turns each 1-D vector into a column and appends the 2-D
    # component matrix beside them.
    pca_results = pd.DataFrame(
        data=np.column_stack((
            pca.explained_variance_,
            variance_ratio,
            variance_ratio.cumsum(),
            pca.components_,
        )),
        columns=summary_columns + features.columns.tolist(),
        index=pc_labels,
    )

    df_transformed = pd.DataFrame(
        data=pca.transform(scaled),
        index=df.index,
        columns=pc_labels,
    )

    return pca_results, df_transformed

pca_results, df2_transformed = get_pca_results(df=df2, ignore_cols='Gender')
pca_results

In [None]:
def scree_plot(df: pd.DataFrame, pca: pd.DataFrame):
    """Draw a scree plot of the 'Eigenvalue' column of a PCA summary table.

    Parameters
    ----------
    df : not used by the function body.
    pca : DataFrame indexed by component label with an 'Eigenvalue' column.

    Returns
    -------
    The matplotlib Axes holding the point plot. The title, the y-limits
    (0 to 1.4) and the annotation text are fixed.
    """
    with sns.axes_style(style='darkgrid'):
        ax = sns.pointplot(data=pca, x=pca.index, y=pca['Eigenvalue'])
        ax.set_title('Scree Plot for PCA (df2)')
        ax.set_ylim(0, 1.4)
        ax.annotate(
            text='As there is no elbow,\nno PC should be discarded',
            xy=(1.75, 1.2),
            ha='center',
        )
    return ax

scree_plot(df2, pca_results)

In [None]:
# type: ignore
from sklearn.metrics import silhouette_score
from itertools import combinations
from more_itertools import powerset

def get_silhouette_score_plot(
    df: pd.DataFrame,
    ignore_cols: Union[str, List[str], None] = None,
    random_state: Union[int, None] = 42,
):
    """Plot KMeans silhouette scores for every 3-column feature combination.

    For each combination of three columns of ``df`` (after dropping
    ``ignore_cols``) a KMeans model is fitted for every cluster count from 2
    to 9 and the silhouette score of the resulting labels is recorded. One
    line per combination is drawn, with the cluster count on the x-axis.

    Parameters
    ----------
    df : input frame; the columns used must have string names, because they
        are joined with ', ' to label each line.
    ignore_cols : column name or list of names to exclude; ``None`` keeps all
        columns.
    random_state : seed passed to every ``KMeans`` so repeated runs give the
        same scores; pass ``None`` for unseeded runs.

    Returns
    -------
    None
    """
    ignore_cols = ignore_cols if ignore_cols is not None else []
    feature_cols = df.drop(columns=ignore_cols).columns
    cluster_counts = list(range(2, 10))

    fig, ax = plt.subplots()

    # Collect every score column first and build the frame once, rather than
    # concatenating column by column inside the loop.
    scores = {'Num': cluster_counts}
    # combinations(cols, 3) yields the same 3-tuples, in the same order, as
    # filtering the full powerset down to length 3.
    for col_comb in combinations(feature_cols, 3):
        col_list = list(col_comb)
        subset = df[col_list]
        scores[', '.join(col_list)] = [
            silhouette_score(
                X=subset,
                labels=KMeans(n_clusters=k, random_state=random_state).fit_predict(subset),
            )
            for k in cluster_counts
        ]
    sil = pd.DataFrame(scores)

    sns.lineplot(data=sil.melt(id_vars='Num'), x='Num', y='value', hue='variable', ax=ax)

    ax.legend(bbox_to_anchor=(2, 1))

get_silhouette_score_plot(df=df2, ignore_cols='Gender')

In [None]:
# Compare KMeans with 4 and 5 clusters on income vs. spending score.
cluster_params = [4, 5]
# Select the two clustering features once and reuse them below.
cluster_features = df2[['Annual Income (k$)', 'Spending Score (1-100)']]
fig, ax = plt.subplots(ncols=2, figsize=(10, 8))
for i, cl in enumerate(cluster_params):
    # Fixed seed so the clusters, colours and scores are the same on every run.
    model = KMeans(n_clusters=cl, random_state=42).fit(X=cluster_features)
    # Use the fitted labels for both the plot and the score so they agree.
    y_hat = model.labels_
    sns.scatterplot(data=df2, x='Annual Income (k$)', y='Spending Score (1-100)', hue=y_hat, ax=ax[i])
    ax[i].set_title(f'KMeans (k={cl})')
    print(f'Silhouette Score ({cl}):', silhouette_score(X=cluster_features, labels=y_hat, metric='euclidean'))

In [None]:
import plotly.express as px
from sklearn.cluster import DBSCAN, OPTICS, AgglomerativeClustering, AffinityPropagation

# Cluster on three features with three algorithms and show each result as an
# interactive 3-D scatter plot titled '<Algorithm> <silhouette score>'.
features_3d = df2[['Annual Income (k$)', 'Age', 'Spending Score (1-100)']]
for mo in [KMeans(n_clusters=6, random_state=42), DBSCAN(eps=15, min_samples=15), OPTICS(max_eps=18)]:
    colrs = mo.fit_predict(X=features_3d)
    # silhouette_score needs between 2 and n_samples - 1 distinct labels and
    # raises otherwise (e.g. when a density-based model puts every point in
    # one group), so fall back to 'n/a' in that case.
    n_labels = len(set(colrs))
    if 1 < n_labels < len(features_3d):
        score_text = str(round(silhouette_score(X=features_3d, labels=colrs), 2))
    else:
        score_text = 'n/a'
    fig = px.scatter_3d(
        data_frame=df2,
        x='Annual Income (k$)',
        y='Age',
        z='Spending Score (1-100)',
        color=colrs,
        title=type(mo).__name__ + ' ' + score_text,
        color_continuous_scale=px.colors.sequential.Rainbow,
    )
    fig.show()