# Part A > Time Series Regression

In [None]:
# Data Manipulation Dependencies
import numpy as np
import pandas as pd

# Graphing Dependencies
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans

In [None]:
# Read the training CSV and convert its 'Date' column to datetimes in a
# single chained expression, then display the resulting frame.
df = pd.read_csv('./data/train.csv').assign(
    Date=lambda frame: pd.DatetimeIndex(data=frame['Date'])
)
df

In [None]:
def plot_(df):
    """Print and plot the per-gas mean and median of ``Value``.

    The frame is grouped by its ``Gas`` column. The group means and group
    medians are stacked into one long table whose ``Type`` column holds
    ``'Mean'`` or ``'Median'``; that table is printed and then drawn as a
    grouped bar chart (one bar per ``Type`` for every gas).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs at least the columns ``Gas`` and ``Value``. All remaining
        columns are aggregated too, so they must support mean/median.

    Returns
    -------
    None
    """
    grouped = df.groupby(by='Gas', as_index=False)

    # Compute each aggregate exactly once and tag it with its label.
    summaries = {'Mean': grouped.mean(), 'Median': grouped.median()}

    # A single concat avoids growing a frame inside a loop (and avoids
    # concatenating onto an empty DataFrame).
    comb_df = pd.concat(
        objs=[frame.assign(Type=label) for label, frame in summaries.items()],
        axis=0,
    )

    print(comb_df)
    sns.barplot(data=comb_df, x='Gas', y='Value', hue='Type', palette='rainbow')


plot_(df)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Add calendar columns derived from the ``Date`` column of a frame.

    ``transform`` writes ``Year``, ``Month``, ``Day`` and ``Quarter`` columns
    into the frame it is given. The frame is modified in place and also
    returned.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn.

        ``y`` is accepted (and ignored) so the transformer also works where
        scikit-learn calls ``fit(X, y)``, e.g. inside a pipeline.
        """
        return self

    def transform(self, X):
        """Add the calendar columns to ``X`` and return it.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``Date`` column that ``pd.DatetimeIndex`` accepts.

        Returns
        -------
        pandas.DataFrame
            The same object that was passed in, with the new columns set.
        """
        # Build the datetime index once instead of once per derived column.
        dates = pd.DatetimeIndex(X['Date'])
        X['Year'] = dates.year
        X['Month'] = dates.month
        X['Day'] = dates.day
        X['Quarter'] = dates.quarter
        return X

In [None]:
# Run the transformer on the training frame. transform() writes the
# Year/Month/Day/Quarter columns directly into df, so this cell both
# displays the result and modifies df.
CustomTransformer().fit_transform(df)

In [None]:
# Derive the calendar columns from 'Date': each new column is named after the
# DatetimeIndex attribute it comes from (Year -> .year, Month -> .month, ...).
date_index = pd.DatetimeIndex(df['Date'])
for part in ('Year', 'Month', 'Day', 'Quarter'):
    df[part] = getattr(date_index, part.lower())

In [None]:
# Number of non-null entries in every remaining column for each
# (Year, Quarter) combination.
df.groupby(by=['Year', 'Quarter']).count()

In [None]:
# Scatter of Value against the day-of-month column, coloured by month.
sns.scatterplot(data=df, x='Day', y='Value', hue='Month')

In [None]:
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score

# Keep the fitted encoder around: the same gas -> integer mapping is needed to
# encode any other data (e.g. the test set) consistently, and
# gas_encoder.inverse_transform recovers the original labels.
gas_encoder = LabelEncoder()
df['Gas'] = gas_encoder.fit_transform(df['Gas'])

# Build the feature matrix once and reuse it for fitting and predicting.
features = df.drop(columns=['Value', 'Date', 'Quarter'])
target = df['Value']

model = BaggingRegressor()
model.fit(X=features, y=target)
pred = model.predict(X=features)

# NOTE: this is an in-sample R^2 (the model is scored on the rows it was
# fitted on), so it says nothing about performance on unseen data.
r2_score(target, pred)

In [None]:
# Load the test set, using its 'id' column as the index, and add the calendar
# columns. transform() modifies `test` in place and returns it for display.
test = pd.read_csv('./data/test.csv', index_col='id')
CustomTransformer().transform(test)

In [None]:
# Display the current state of the training frame.
df

# Part B > Clustering

In [None]:
# Load the mall-customer data with the first CSV column as the index and
# rename the 'Genre' column to 'Gender', then preview the first rows.
df2 = pd.read_csv('./data/Mall_Customers.csv', index_col=0).rename(
    columns={'Genre': 'Gender'}
)
df2.head()

In [None]:
from sklearn.metrics import silhouette_score
# Pairwise scatter plots of the customer features, coloured by gender.
# The 'Genre' column was renamed to 'Gender' when df2 was loaded, so that is
# the column to exclude from the axes and to use as the hue.
cols = df2.columns.drop('Gender')
n = len(cols)

# One panel per ordered pair (x, y) of distinct feature columns.
pairs = [(cols[i], cols[j]) for i in range(n) for j in range(n) if i != j]

# Two rows, with enough columns to hold every pair; squeeze=False keeps `ax`
# two-dimensional whatever the grid size.
n_grid_cols = max(1, (len(pairs) + 1) // 2)
fig, ax = plt.subplots(nrows=2, ncols=n_grid_cols, figsize=(10, 8), squeeze=False)

# Fill the grid row by row instead of relying on modular index arithmetic.
for panel, (x_col, y_col) in zip(ax.flat, pairs):
    sns.scatterplot(data=df2, x=x_col, y=y_col, hue='Gender', ax=panel)

In [None]:
# Fit k-means on income vs. spending score for each candidate cluster count,
# draw the assignments side by side and print the silhouette score of each fit.
cluster_params = [4, 5]
income_spend_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
income_spend = df2[income_spend_cols]

fig, ax = plt.subplots(ncols=2, figsize=(10, 8))
for panel, cl in zip(ax, cluster_params):
    model = KMeans(n_clusters=cl).fit(X=income_spend)
    y_hat = model.predict(income_spend)
    sns.scatterplot(
        data=df2,
        x=income_spend_cols[0],
        y=income_spend_cols[1],
        hue=y_hat,
        ax=panel,
    )
    score = silhouette_score(X=income_spend, labels=model.labels_, metric='euclidean')
    print(f'Silhouette Score ({cl}):', score)

In [None]:
import plotly.express as px
from sklearn.cluster import DBSCAN, OPTICS, AgglomerativeClustering, AffinityPropagation

# Cluster on income, age and spending score with several algorithms; for each
# one print the silhouette score and show an interactive 3-D scatter coloured
# by the assigned label.
features_3d = df2[['Annual Income (k$)', 'Age', 'Spending Score (1-100)']]

for mo in [KMeans(n_clusters=6), DBSCAN(eps=15, min_samples=15), OPTICS(max_eps=18)]:
    colrs = mo.fit_predict(X=features_3d)

    # silhouette_score raises ValueError unless the number of distinct labels
    # lies between 2 and n_samples - 1. A clusterer that returns a single
    # label would otherwise abort the loop before the plot is drawn.
    n_labels = len(set(colrs))
    if 1 < n_labels < len(features_3d):
        print(silhouette_score(X=features_3d, labels=colrs))
    else:
        print(f'{type(mo).__name__}: silhouette score undefined for {n_labels} distinct label(s)')

    fig = px.scatter_3d(data_frame=df2, x='Annual Income (k$)', y='Age', z='Spending Score (1-100)', color=colrs, title=type(mo).__name__)
    fig.show()