In [None]:
from IPython import get_ipython

# Handle to the active IPython shell.
ipython = get_ipython()

# Execution counter at the time this cell runs; a later cell compares it
# with 1 before changing the working directory.
exec_no = ipython.execution_count
exec_no

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random 
import pickle
import os

from IPython.utils import io

from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.base import clone

#!pip install -U git+https://github.com/joaopfonseca/SOMPY.git
import sompy
from sompy.visualization.mapview import View2D
from sompy.visualization.bmuhits import BmuHitsView
from sompy.visualization.hitmap import HitMapView


%matplotlib inline
%config InlineBackend.figure_format = 'retina' # optionally, you can change 'svg' to 'retina'


In [None]:
# Disabled branch (`if False`): it would run get_wd.py with its output
# captured and keep the captured stdout text in `s`.
if False: 
    with io.capture_output() as captured:
        %run get_wd.py

    s = captured.stdout # stdout text captured while the script ran

In [None]:
# Step one directory up, but only while the counter captured in the first
# cell still equals 1. Bumping the counter afterwards keeps a re-run of this
# cell from climbing another level.
if exec_no == 1:
    s = os.getcwd()
    parent_dir = os.path.dirname(s)
    os.chdir(parent_dir)
    exec_no = exec_no + 1
print(os.getcwd())

In [None]:
# definitions

# Output folders, relative to the current working directory.
computed_data_path = 'computed_data/'
explorations_data_path = 'explorations/'

paths = [computed_data_path, explorations_data_path]
for path in paths:
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test. Unlike that test, it raises FileExistsError
    # if the path exists but is not a directory.
    os.makedirs(path, exist_ok=True)

In [None]:
#run promo_history.ipynb


In [None]:
# load previously computed results
#
# The three pickles are read from `computed_data_path` in a fixed order and
# unpacked into the names used by the rest of the notebook.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
_pickle_files = (
    'history_feat_raw.pickle',
    'history_feat_multi_out.pickle',
    'history_feat_multi_clean.pickle',
)

_loaded = []
for _fname in _pickle_files:
    with open(os.path.join(computed_data_path, _fname), 'rb') as f:
        _loaded.append(pickle.load(f))

history_agg, history_agg_out_multi, history_agg_multi = _loaded

In [None]:
# Inspect the object loaded from history_feat_multi_clean.pickle.
history_agg_multi

In [None]:
# Inspect the object loaded from history_feat_multi_out.pickle.
history_agg_out_multi

# Preparation

## setup

In [None]:
# define cols

## cols for clustering: every column of history_agg_multi
cl_feat_hist = history_agg_multi.columns.tolist()
print('\ncl_feat_hist: \n',cl_feat_hist)

# Same list object, under the name the SOM and t-SNE cells use.
metric_features = cl_feat_hist

## cols for description: columns of history_agg that are not clustering features
is_cluster_col = history_agg.columns.isin(cl_feat_hist)
desc_feat_hist = history_agg.columns[~is_cluster_col].tolist()
print('\ndesc_feat_hist: \n',desc_feat_hist)



In [None]:
# define rows

# Index labels of history_agg_multi, as a plain Python list.
cl_donor_ids = history_agg_multi.index.tolist()


In [None]:
# Rows of history_agg whose index label is one of cl_donor_ids.
in_cluster_set = history_agg.index.isin(cl_donor_ids)
history_agg.loc[in_cluster_set]

In [None]:
# Keep only the clustering feature columns of history_agg_multi.
df_hist = history_agg_multi.loc[:, cl_feat_hist]
df_hist


## Scale

In [None]:
def scale_df(df, scale='minmax'):
    """Return a scaled copy of `df` that keeps its columns and index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; it is passed as-is to the scaler's `fit_transform`.
    scale : {'minmax', 'standard'}, default 'minmax'
        'minmax' uses `MinMaxScaler`, 'standard' uses `StandardScaler`.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels and index as `df`.

    Raises
    ------
    ValueError
        If `scale` is not one of the supported names. Previously an unknown
        name fell through and failed later with an unbound `scaler`.
    """
    if scale == 'minmax':
        scaler = MinMaxScaler()
    elif scale == 'standard':
        scaler = StandardScaler()
    else:
        raise ValueError(
            "Unknown scale %r; expected 'minmax' or 'standard'." % (scale,)
        )

    scaled_values = scaler.fit_transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

    
# Min-max scaled version of the clustering features.
df_hist_normal = scale_df(df_hist, scale='minmax')

## merge df

In [None]:
# Independent copy, so label columns added later do not touch df_hist_normal.
df_normal = df_hist_normal.copy(deep=True)

## multivariate outliers

# Clustering

In [None]:
# Applying the right clustering (algorithm and number of clusters) for each perspective
# K-means with 10 clusters, fitted on the scaled feature frame.
kmeans_prod = KMeans(n_clusters=10, init='k-means++', n_init=20, random_state=1)
hist_labels = kmeans_prod.fit_predict(df_hist_normal)

# Labels are attached to the copy (df_normal), not to df_hist_normal.
df_normal['hist_labels'] = hist_labels


In [None]:
# Cluster sizes. The `hist_labels` column lives on df_normal (it was never
# added to df_hist_normal), so it is read from there.
df_normal['hist_labels'].value_counts()

## Testing on K-means and Hierarchical clustering
Based on (1) our previous tests and (2) the context of this problem, the optimal number of clusters is expected to be between 3 and 7.

In [None]:
def get_ss(df):
    """Total sum of squares of a frame, summed over all of its columns.

    Each column contributes its variance (as returned by `df.var()`)
    multiplied by its non-null count minus one.
    """
    per_column_var = df.var()
    degrees_of_freedom = df.count() - 1
    return np.sum(per_column_var * degrees_of_freedom)

def r2(df, labels):
    """R² of a cluster solution: 1 - (within-cluster SS / total SS).

    `labels` is used as the grouping key for `df.groupby`; the within-cluster
    sum of squares is the sum of `get_ss` over the resulting groups.
    """
    total_ss = get_ss(df)
    within_ss = np.sum(df.groupby(labels).apply(get_ss))
    return 1 - within_ss / total_ss
    
def get_r2_scores(df, clusterer, min_k=2, max_k=10):
    """Compute R² for a range of cluster counts.

    A fresh clone of `clusterer` is configured with `n_clusters=k` and fitted
    on `df` for every k in `range(min_k, max_k)`; note that `max_k` itself is
    excluded. To be used with sklearn clusterers.

    Returns a dict mapping each k to its R² score.
    """
    scores = {}
    for k in range(min_k, max_k):
        model = clone(clusterer).set_params(n_clusters=k)
        scores[k] = r2(df, model.fit_predict(df))
    return scores


# Set up the clusterers
# n_clusters is left at its default here; get_r2_scores sets it per run.
kmeans = KMeans(
    init='k-means++',
    n_init=20,
    random_state=1
)

# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# in favour of `metric`. Euclidean distance is the default under either name,
# so the argument is omitted to stay compatible with both old and new versions.
hierarchical = AgglomerativeClustering()

### Finding the optimal clusterer on promotion history variables

#### R2

In [None]:
# Obtaining the R² scores for each cluster solution on promotion history variables
find_hist_k = False  # set to True to run the sweep
if find_hist_k:
    r2_scores = {}

    print('kmeans')
    r2_scores['kmeans'] = get_r2_scores(df_hist_normal, kmeans)

    # Hierarchical linkages are switched off.
    if False:
        for linkage in ['ward']: # 'complete', 'average', 'single',
            print(linkage)
            r2_scores[linkage] = get_r2_scores(
                df_hist_normal, hierarchical.set_params(linkage=linkage)
            )

    # A bare expression inside an `if` block is not rendered as cell output,
    # so the table is displayed explicitly.
    display(pd.DataFrame(r2_scores))

In [None]:
# Visualizing the R² scores for each cluster solution on promotion history variables
# (the scores are computed from df_hist_normal, so the title names that
# feature set rather than "Demographic Variables").
if find_hist_k:
    pd.DataFrame(r2_scores).plot.line(figsize=(10,7))

    plt.title("Promotion History Variables:\nR² plot for various clustering methods\n", fontsize=21)
    plt.legend(title="Cluster methods", title_fontsize=11)
    plt.xlabel("Number of clusters", fontsize=13)
    plt.ylabel("R² metric", fontsize=13)
    plt.show()

#### Inertia

In [None]:
# Switch for the inertia sweep. When it runs, the same name is rebound to the
# list of inertia values, which the plotting cell then tests for truthiness.
inertia = False

if inertia:
    range_clusters = range(1, 11)
    kmeans_params = dict(init='k-means++', n_init=20, random_state=1)
    inertia = []
    for n_clus in range_clusters:
        print(n_clus)
        kmclust = KMeans(n_clusters=n_clus, **kmeans_params)
        kmclust.fit(df_normal[cl_feat_hist])
        # inertia of the solution with n_clus clusters
        inertia.append(kmclust.inertia_)

In [None]:
# Inertia against the number of clusters. The x values come from
# range_clusters, so the axis shows the cluster counts themselves
# instead of list positions starting at 0.
if inertia:
    plt.plot(list(range_clusters), inertia)
    plt.xlabel("Number of clusters")
    plt.ylabel("Inertia")
    plt.show()

#### SOM

In [None]:
# This som implementation does not have a random seed parameter
# We're going to set it up ourselves
np.random.seed(42)

# Train on the scaled features in df_normal. The notebook never defines a
# frame named `df`, so the original reference raised NameError.
# Notice that the SOM did not converge - We're under a time constraint for this class
sm = sompy.SOMFactory().build(
    df_normal[metric_features].values,
    mapsize=(50, 50),
    initialization='random',
    neighborhood='gaussian',
    training='batch',
    lattice='hexa',
    component_names=metric_features
)
sm.train(n_job=-1, verbose='info', train_rough_len=100, train_finetune_len=100)

In [None]:
# Coordinates of the SOM units in the input space (shown as the cell output)
sm.get_node_vectors()

In [None]:
# Component planes on the 50x50 grid
sns.set()
# presumably the two 12s are the figure width and height -- TODO confirm against sompy
view2D = View2D(12,12,"", text_size=10)
view2D.show(sm, col_sz=3, what='codebook')
# Shrink the subplot area so the figure-level title below has room at the top.
plt.subplots_adjust(top=0.90)
plt.suptitle("Component Planes", fontsize=20)
plt.show()

In [None]:
# U-matrix of the 50x50 grid
u = sompy.umatrix.UMatrixView(12, 12, 'umatrix', show_axis=True, text_size=8, show_text=True)

# NOTE(review): `contooor` looks like the library's own spelling of this
# keyword -- confirm against sompy before "correcting" it.
UMAT = u.show(
    sm, 
    distance2=1, 
    row_normalized=False, 
    show_data=False, 
    contooor=True # Visualize isomorphic curves
)

## Cluster visualization using t-SNE

In [None]:
# Label column used to colour the t-SNE plot.
label_name = 'hist_labels' # 'merged_labels'

# Reproducible 10% row sample of the labelled frame.
tsne_df = df_normal.sample(frac=0.1, axis=0, random_state=1)

# Feature columns fed to t-SNE (only the last expression of a cell is shown).
tsne_feat = tsne_df.loc[:, metric_features]
tsne_feat

# Cluster label of each sampled row.
tsne_c = tsne_df.loc[:, label_name]
tsne_c

In [None]:
# This step can be quite time consuming

tsne_model = TSNE(random_state=1, n_jobs=-1)
two_dim = tsne_model.fit_transform(tsne_feat)

In [None]:
# t-SNE visualization
# Scatter of the two embedding columns, coloured by the sampled cluster labels.
embedding = pd.DataFrame(two_dim)
embedding.plot.scatter(x=0, y=1, c=tsne_c, colormap='tab10', figsize=(15,10))
plt.show()

In [None]:
# t-SNE visualization
# Draws the same scatter as the previous cell.
embedding = pd.DataFrame(two_dim)
embedding.plot.scatter(x=0, y=1, c=tsne_c, colormap='tab10', figsize=(15,10))
plt.show()