In [None]:
from IPython import get_ipython

# Handle to the running IPython shell, used to read its execution counter.
ipython = get_ipython()

# Value of the execution counter when this cell runs; a later cell compares it
# with 1 to decide whether the working directory still has to be changed.
exec_no = ipython.execution_count
exec_no

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random 
import pickle
import os

from IPython.utils import io

from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from scipy.cluster.hierarchy import dendrogram


#!pip install -U git+https://github.com/joaopfonseca/SOMPY.git
import sompy
from sompy.visualization.mapview import View2D
from sompy.visualization.bmuhits import BmuHitsView
from sompy.visualization.hitmap import HitMapView



#%pip install boruta
from boruta import BorutaPy


%matplotlib inline
%config InlineBackend.figure_format = 'retina' # optionally, you can change 'svg' to 'retina'


In [None]:
# Disabled on purpose (`if False`): would run get_wd.py and keep what it prints.
if False: 
    with io.capture_output() as captured:
        %run get_wd.py

    s = captured.stdout # text the script wrote to stdout

In [None]:
# Only while exec_no is still 1: move the working directory to the parent of the
# current one. exec_no is bumped afterwards so that re-running this cell does
# not climb another level.
if exec_no == 1: 
    s = os.getcwd()
    os.chdir(os.path.dirname(s))
    exec_no += 1
print(os.getcwd())

In [None]:
# definitions 

#os.chdir('/Users/dp/Nova/OneDrive - NOVAIMS/1stSemester/DM/DMProject')
# Folders (relative to the working directory) that the notebook reads pickles
# from and writes figures/tables to.
computed_data_path = 'computed_data/'
explorations_data_path = 'explorations/'

paths = [computed_data_path, explorations_data_path]
for path in paths:
    # exist_ok makes the call idempotent and removes the check-then-create race
    os.makedirs(path, exist_ok=True)

In [None]:
#run promo_history.ipynb


In [None]:
# Show which files are available in the computed-data folder.
os.listdir(computed_data_path)

In [None]:
# Load the history tables produced by the preparation step.
# NOTE(review): pickle.load can execute arbitrary code - only load files this project wrote itself.
# extended_data=True additionally loads the 'history_feat_raw' and
# 'history_feat_multi_out' pickles; the 'history_feat_multi_clean' table is always loaded.
extended_data = False

if extended_data: 
    with open(os.path.join(computed_data_path, 'history_feat_raw.pickle'), 'rb') as f: 
        history_agg = pickle.load(f)

    with open(os.path.join(computed_data_path, 'history_feat_multi_out.pickle'), 'rb') as f: 
        history_agg_out_multi = pickle.load(f)

with open(os.path.join(computed_data_path, 'history_feat_multi_clean.pickle'), 'rb') as f: 
    history_agg_multi = pickle.load(f)

In [None]:
# Load the neighborhood tables (same pickle caveat as for any pickle: trusted files only).
# The two extra tables are only needed when extended_data is switched on.
if extended_data: 
    with open(os.path.join(computed_data_path, 'neighborhood_feat_after_impute.pickle'), 'rb') as f: 
        neighborhood_feat_after_impute = pickle.load(f)

    with open(os.path.join(computed_data_path, 'neighborhood_outliers.pickle'), 'rb') as f: 
        neighborhood_outliers = pickle.load(f)
    # presumably cast so the labels match the history table's index when merging - TODO confirm
    neighborhood_outliers.index = neighborhood_outliers.index.astype(int)


with open(os.path.join(computed_data_path, 'neighborhood_PC_cluster.pickle'), 'rb') as f: 
    neighborhood_PC_cluster = pickle.load(f)

# Index is cast to int before the table is merged with the history table on the index.
neighborhood_PC_cluster.index = neighborhood_PC_cluster.index.astype(int)


# Preparation

In [None]:
# Switches for the optional steps further down.
find_k = False      # True: run the R² / inertia searches over k
tnse_hist = False   # True: draw the t-SNE plot of the history clustering
tnse_neigh = False  # True: draw the t-SNE plot of the neighborhood clustering


## setup

In [None]:
def merge_tables(df_hist, df_neigh):
    """Inner-join the history and the neighborhood table on their index.

    Prints how many index labels both tables share (absolute, and relative to
    the number of rows of ``df_hist``) and the shape of the merged table.

    Parameters
    ----------
    df_hist : pandas.DataFrame
    df_neigh : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Rows whose index label occurs in both inputs, carrying the columns of both.
    """
    hist_neigh_inter = df_neigh.index.intersection(df_hist.index)
    len_hist_neigh_inter = len(hist_neigh_inter)
    n_hist = len(df_hist.index)
    # An empty history table would otherwise raise ZeroDivisionError
    len_hist_neigh_inter_rel = len_hist_neigh_inter / n_hist if n_hist else 0.0
    print('len_hist_neigh_inter: ', len_hist_neigh_inter)
    print('len_hist_neigh_inter_rel: ', len_hist_neigh_inter_rel)

    df_normal = df_hist.merge(df_neigh, how='inner', left_index=True, right_index=True)
    # Report before returning: this print used to sit after the return and never ran
    print('df_normal.shape: ', df_normal.shape)
    return df_normal
    
    
def scale_df(df, scale='minmax'):
    """Scale every column of ``df`` and return the result as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table to scale.
    scale : {'minmax', 'standard'}, default 'minmax'
        'minmax' uses ``MinMaxScaler``, 'standard' uses ``StandardScaler``.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the columns and index of ``df``.

    Raises
    ------
    ValueError
        If ``scale`` is not one of the supported names (previously this left
        the scaler unbound and failed with an UnboundLocalError).
    """
    scalers = {'minmax': MinMaxScaler, 'standard': StandardScaler}
    if scale not in scalers:
        raise ValueError(f"scale must be one of {sorted(scalers)}, got {scale!r}")

    scaled_values = scalers[scale]().fit_transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)


In [None]:

# Column names of the neighborhood table; these are the neighborhood clustering features.
cl_feat_neigh = neighborhood_PC_cluster.columns.to_list()
cl_feat_neigh

In [None]:
# Set collecting the names of the label columns that the clustering cells add
cl_names = set()

# define cols

## cols for clustering
cl_feat_hist = history_agg_multi.columns.to_list()
print('\ncl_feat_hist: \n',cl_feat_hist)
cl_feat_neigh = neighborhood_PC_cluster.columns.to_list()

# All clustering features: history columns first, then neighborhood columns
metric_features = cl_feat_hist + cl_feat_neigh 
print('\nmetric_features: ', metric_features)
## cols for description
# (only available with extended_data: columns of history_agg that are not clustering features)
if extended_data:
    desc_feat_hist = history_agg.columns[~history_agg.columns.isin(cl_feat_hist)].to_list()
    print('\ndesc_feat_hist: \n',desc_feat_hist)



In [None]:
# define rows: index labels of the cleaned history table

cl_donor_ids = history_agg_multi.index.to_list()


In [None]:
# Work on copies so the loaded tables stay untouched.
df_hist = history_agg_multi[cl_feat_hist].copy()
df_hist

df_neigh = neighborhood_PC_cluster.copy()

# Merge / Scale

In [None]:
# True: merge both tables first, then scale. False: scale each table, then merge.
merge_scale = False
# True: cut both per-perspective tables down to the index labels of the merged table
intersect_dfs = True

if not merge_scale:
    # scale each perspective on its own, then join
    df_hist_normal = scale_df(df=df_hist, scale='minmax')
    df_neigh_normal = scale_df(df=df_neigh, scale='minmax')
    df_normal = merge_tables(df_hist_normal, df_neigh_normal)

    if intersect_dfs:
        df_hist_normal = df_normal[cl_feat_hist].copy()
        df_neigh_normal = df_normal[cl_feat_neigh].copy()
else:
    # join first, scale the joined table, then split it per perspective
    df_raw = merge_tables(df_hist, df_neigh)
    df_normal = scale_df(df_raw)
    df_hist_normal = df_normal[cl_feat_hist].copy()
    df_neigh_normal = df_normal[cl_feat_neigh].copy()


# Clustering

## Find the correct k 

In [None]:
def get_ss(df):
    """Return the sum of squares of ``df``, added up over all of its columns."""
    # variance * (n - 1) is a column's sum of squared deviations from its mean
    per_column_ss = df.var() * (df.count() - 1)
    return np.sum(per_column_ss)

def r2(df, labels):
    """Return 1 - SSW/SST for the partition of ``df`` given by ``labels``.

    SST is the sum of squares of the whole table, SSW the sum of the
    per-cluster sums of squares.
    """
    total_ss = get_ss(df)
    within_ss = np.sum(df.groupby(labels).apply(get_ss))
    explained_share = 1 - within_ss / total_ss
    return explained_share
    
def get_r2_scores(df, clusterer, min_k=1, max_k=10):
    """
    Score an sklearn clusterer for a range of cluster counts.

    A fresh clone of ``clusterer`` is fitted for every k in
    ``range(min_k, max_k)`` (``max_k`` itself is excluded).

    Returns a dict mapping k to the R² of that solution.
    """
    def _score(k):
        model = clone(clusterer).set_params(n_clusters=k)
        return r2(df, model.fit_predict(df))

    return {k: _score(k) for k in range(min_k, max_k)}


# Set up the clusterers
kmeans = KMeans(
    init='k-means++',
    n_init=20,
    random_state=1
)

# Euclidean distance is the estimator's default and is therefore not passed:
# the keyword was renamed from `affinity` to `metric` in scikit-learn 1.2 and
# `affinity` was removed in 1.4, so naming either ties the notebook to one
# range of versions.
hierarchical = AgglomerativeClustering()

def find_k_r2(df, linkages=()):
    """Collect R² scores per number of clusters for k-means and, optionally,
    hierarchical clustering.

    Parameters
    ----------
    df : pandas.DataFrame
        Scaled features to cluster.
    linkages : iterable of str, default ()
        Linkage names (e.g. 'ward') to evaluate with the module-level
        ``hierarchical`` estimator. Empty by default, i.e. only k-means is run.

    Returns
    -------
    dict
        ``{'kmeans': {k: r2, ...}, <linkage>: {k: r2, ...}, ...}``
    """
    r2_scores = {}

    print('kmeans')
    r2_scores['kmeans'] = get_r2_scores(df, kmeans)

    for linkage in linkages:
        print(linkage)
        # clone first so the shared module-level estimator is not reconfigured
        r2_scores[linkage] = get_r2_scores(
            df, clone(hierarchical).set_params(linkage=linkage)
        )

    return r2_scores

# Visualizing the R² scores for each cluster solution
def plot_k_r2(r2_scores, filename=None):
    """Plot R² against the number of clusters and save the figure as a JPEG.

    Parameters
    ----------
    r2_scores : str or dict
        Either the scores themselves (as returned by ``find_k_r2``) or, for
        backward compatibility, the name of a global variable holding them.
    filename : str, optional
        Stem of the output file in ``explorations_data_path``. Defaults to the
        variable name when a name was passed, otherwise to 'r2_scores'.
    """
    if isinstance(r2_scores, str):
        # legacy call style: look the data up by its global variable name
        if filename is None:
            filename = r2_scores
        r2_scores = globals()[r2_scores]
    elif filename is None:
        filename = 'r2_scores'

    pd.DataFrame(r2_scores).plot.line(figsize=(10,7))

    plt.xlabel("Number of clusters", fontsize=13)
    plt.ylabel("R² metric", fontsize=13)
    plt.savefig(os.path.join(explorations_data_path, f'{filename}.jpeg'), dpi=200)
    plt.show()
    

def find_k_inertia(df): 
    """Fit k-means for k = 1..10 and return the inertia of each solution, ordered by k.

    The current k is printed before each fit.
    """
    def _inertia_for(k):
        print(k)
        model = KMeans(n_clusters=k, init='k-means++', n_init=20, random_state=1).fit(df)
        return model.inertia_

    return [_inertia_for(k) for k in range(1, 11)]

def plot_k_inertia(inertia, filename=None):
    """Plot inertia against the number of clusters and save the figure as a JPEG.

    Parameters
    ----------
    inertia : str or sequence of float
        Either the inertia values themselves (as returned by
        ``find_k_inertia``) or, for backward compatibility, the name of a
        global variable holding them.
    filename : str, optional
        Stem of the output file in ``explorations_data_path``. Defaults to the
        variable name when a name was passed, otherwise to 'inertia'.
    """
    if isinstance(inertia, str):
        # legacy call style: look the data up by its global variable name
        if filename is None:
            filename = inertia
        inertia = globals()[inertia]
    elif filename is None:
        filename = 'inertia'

    inertia = list(inertia)
    # find_k_inertia starts at k=1, so position i holds the inertia for k = i + 1;
    # plotting against the bare position would shift the x axis by one.
    n_clusters = range(1, len(inertia) + 1)
    plt.plot(n_clusters, inertia)
    plt.xlabel("Number of clusters", fontsize=13)
    plt.ylabel("Inertia", fontsize=13)
    plt.savefig(os.path.join(explorations_data_path, f'{filename}.jpeg'), dpi=200)

    plt.show()


### Finding the optimal clusterer on promotion history variables

In [None]:
# hist: search for k on the promotion-history features (only when find_k is on)
if find_k:
    # R² per k
    r2_scores_hist = find_k_r2(df=df_hist_normal)
    plot_k_r2('r2_scores_hist')

    # inertia per k
    inertia_hist = find_k_inertia(df=df_hist_normal)
    plot_k_inertia(inertia='inertia_hist')



In [None]:
# neigh: search for k on the neighborhood features (only when find_k is on)
if find_k:
    # R² per k
    r2_scores_neigh = find_k_r2(df=df_neigh_normal)
    plot_k_r2('r2_scores_neigh')

    # inertia per k
    inertia_neigh = find_k_inertia(df=df_neigh_normal)
    plot_k_inertia(inertia='inertia_neigh')



#### SOM

In [None]:
# Switch for the (slow) self-organizing-map exploration
som = False

if som:

    # This som implementation does not have a random seed parameter
    # We're going to set it up ourselves
    np.random.seed(42)

    # Train on the merged, scaled table. The notebook never defines a bare `df`,
    # so the previous `df[metric_features]` raised NameError as soon as `som` was enabled.
    # Notice that the SOM did not converge - We're under a time constraint for this class
    sm = sompy.SOMFactory().build(
        df_normal[metric_features].values,
        mapsize=(50, 50),
        initialization='random',
        neighborhood='gaussian',
        training='batch',
        lattice='hexa',
        component_names=metric_features
    )
    sm.train(n_job=-1, verbose='info', train_rough_len=100, train_finetune_len=100)

In [None]:
# Only meaningful after the SOM above has been trained.
if som:

    # Coordinates of the units in the input space
    sm.get_node_vectors()

In [None]:
# Component planes: one heat map of the trained codebook per feature.
if som:

    # Component planes on the 50x50 grid
    sns.set()
    view2D = View2D(12,12,"", text_size=10)
    view2D.show(sm, col_sz=3, what='codebook')
    plt.subplots_adjust(top=0.90)
    plt.suptitle("Component Planes", fontsize=20)
    plt.show()

In [None]:
# U-matrix view of the trained SOM.
if som:

    # U-matrix of the 50x50 grid
    u = sompy.umatrix.UMatrixView(12, 12, 'umatrix', show_axis=True, text_size=8, show_text=True)

    # NOTE(review): the unusual spelling `contooor` appears to be the keyword
    # name used by SOMPY itself - confirm against the installed version before "fixing" it.
    UMAT = u.show(
        sm, 
        distance2=1, 
        row_normalized=False, 
        show_data=False, 
        contooor=True # Visualize isomorphic curves
    )

## Apply clustering

In [None]:
# K-means on the promotion-history perspective
kmeans_hist = KMeans(
    n_clusters=2,
    init='k-means++',
    n_init=20,
    random_state=1
)
# Fit on the feature columns only: after the first run df_hist_normal also holds
# the label column, and re-running the cell would otherwise cluster on its own labels.
hist_labels = kmeans_hist.fit_predict(df_hist_normal[cl_feat_hist])

cl_name = 'hist_labels'
if intersect_dfs:
    # both tables share the same rows only when they were intersected
    df_normal[cl_name] = hist_labels
df_hist_normal[cl_name] = hist_labels

cl_names.add(cl_name)

In [None]:
# K-means on the neighborhood perspective
kmeans_neigh = KMeans(
    n_clusters=2,
    init='k-means++',
    n_init=20,
    random_state=1
)
# Fit on the feature columns only: after the first run df_neigh_normal also holds
# the label column, and re-running the cell would otherwise cluster on its own labels.
neigh_labels = kmeans_neigh.fit_predict(df_neigh_normal[cl_feat_neigh])

cl_name = 'neigh_labels'
if intersect_dfs:
    # both tables share the same rows only when they were intersected
    df_normal[cl_name] = neigh_labels
df_neigh_normal[cl_name] = neigh_labels

cl_names.add(cl_name)

In [None]:
# A set cannot be used as a column indexer in pandas 2 (and has no stable order),
# so select with a sorted list instead.
df_normal[sorted(cl_names)]

## Cluster visualization using t-SNE

In [None]:
def prepare_tsne_data(df, features, label_name, fraq): 
    """Draw a reproducible row sample of ``df`` for the t-SNE plot.

    Parameters
    ----------
    df : pandas.DataFrame
    features : list of str
        Columns to embed.
    label_name : str
        Column holding the cluster labels used for colouring.
    fraq : float
        Fraction of rows to sample (fixed random_state=1).

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Feature columns and label column of the same sampled rows.
    """
    sampled = df.sample(frac=fraq, axis=0, random_state=1)
    return sampled[features], sampled[label_name]

def run_tsne(tsne_feat, tsne_c, filename): 
    """Embed ``tsne_feat`` with t-SNE, scatter-plot it coloured by ``tsne_c``
    and save the figure as ``tsne_<filename>.jpeg`` in ``explorations_data_path``."""
    embedding = TSNE(random_state=1, n_jobs=-1).fit_transform(tsne_feat)

    # t-SNE visualization: columns 0 and 1 of the embedding are the two plot axes
    embedding_df = pd.DataFrame(embedding)
    embedding_df.plot.scatter(x=0, y=1, c=tsne_c, colormap='tab10', figsize=(15,10))

    out_path = os.path.join(explorations_data_path, f'tsne_{filename}.jpeg')
    plt.savefig(out_path, dpi=200)
    plt.show()



In [None]:
# Inspect the scaled neighborhood table, which now also carries its cluster labels.
df_neigh_normal

In [None]:


# Optional t-SNE plot of the history clustering on a 10% row sample.
if tnse_hist:
    tsne_feat, tsne_c = prepare_tsne_data(df=df_hist_normal, features=cl_feat_hist, label_name='hist_labels', fraq=.1)
    run_tsne(tsne_feat, tsne_c, filename='hist')



In [None]:

# Optional t-SNE plot of the neighborhood clustering on a 10% row sample.
if tnse_neigh:
    tsne_feat, tsne_c = prepare_tsne_data(df=df_neigh_normal, features=cl_feat_neigh, label_name='neigh_labels', fraq=.1)
    run_tsne(tsne_feat, tsne_c, filename='neigh')



### Merging using Hierarchical clustering

In [None]:
# Fixed order of the two per-perspective label columns. A list built from the
# `cl_names` set has an arbitrary order (and picks up 'merged_labels' on a
# re-run), while the mapping cells below read the grouped keys as
# (neigh_labels, hist_labels) tuples.
cl_names_ll = ['neigh_labels', 'hist_labels']


In [None]:
# Centroids of the concatenated cluster labels: mean of every clustering feature
# per combination of the label columns listed in cl_names_ll
df_centroids = df_normal.groupby(cl_names_ll)\
    [metric_features].mean()
df_centroids

In [None]:
# Using Hierarchical clustering to merge the concatenated cluster centroids.
# distance_threshold=0 with n_clusters=None builds the full tree, which the
# dendrogram below needs (distances_ is only computed in that mode).
# The distance keyword is left at its euclidean default: it is called `affinity`
# before scikit-learn 1.2 and `metric` afterwards (`affinity` was removed in 1.4).
hclust = AgglomerativeClustering(
    linkage='ward',
    distance_threshold=0,
    n_clusters=None
)
hclust_labels = hclust.fit_predict(df_centroids)

In [None]:
# Adapted from:
# https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html#sphx-glr-auto-examples-cluster-plot-agglomerative-dendrogram-py
# Builds a scipy-style linkage matrix from the fitted `hclust` and draws the dendrogram.

# create the counts of samples under each node (number of points being merged)
counts = np.zeros(hclust.children_.shape[0])
n_samples = len(hclust.labels_)

# hclust.children_ contains the observation ids that are being merged together
# At the i-th iteration, children[i][0] and children[i][1] are merged to form node n_samples + i
for i, merge in enumerate(hclust.children_):
    # track the number of observations in the current cluster being formed
    current_count = 0
    for child_idx in merge:
        if child_idx < n_samples:
            # If this is True, then we are merging an observation
            current_count += 1  # leaf node
        else:
            # Otherwise, we are merging a previously formed cluster
            current_count += counts[child_idx - n_samples]
    counts[i] = current_count

# the hclust.children_ is used to indicate the two points/clusters being merged (dendrogram's u-joins)
# the hclust.distances_ indicates the distance between the two points/clusters (height of the u-joins)
# the counts indicate the number of points being merged (dendrogram's x-axis)
linkage_matrix = np.column_stack(
    [hclust.children_, hclust.distances_, counts]
).astype(float)

# Plot the corresponding dendrogram
sns.set()
fig = plt.figure()# figsize=(11,5)
# The Dendrogram parameters need to be tuned
# y_threshold is both the colouring cut-off and the height of the dashed red line
y_threshold = .4
dendrogram(linkage_matrix, truncate_mode='level', labels=df_centroids.index.to_list(), p=5, color_threshold=y_threshold, above_threshold_color='k')
plt.hlines(y_threshold, 0, 1000, colors="r", linestyles="dashed")
plt.xlabel('Number of points in node (or index of point if no parenthesis)')
plt.ylabel(f'Euclidean Distance', fontsize=13)
plt.savefig(os.path.join(explorations_data_path, 'dendogram_comb.jpeg'), dpi=200)

plt.show()

In [None]:
# Scratch inspection: redraw the dendrogram and show the leaf labels
# (only the last expression is displayed as the cell output).
dendrogram(linkage_matrix, truncate_mode='level', labels=df_centroids.index.to_list(), p=5, color_threshold=y_threshold, above_threshold_color='k')
linkage_matrix.shape
df_centroids.index.to_list()

In [None]:
# Re-running the Hierarchical clustering based on the chosen number of clusters.
# The distance keyword is left at its euclidean default: it is called `affinity`
# before scikit-learn 1.2 and `metric` afterwards (`affinity` was removed in 1.4).
hclust = AgglomerativeClustering(
    linkage='ward',
    n_clusters=4
)
hclust_labels = hclust.fit_predict(df_centroids)
df_centroids['hclust_labels'] = hclust_labels

df_centroids  # centroid's cluster labels

In [None]:
# Mapper between concatenated clusters and hierarchical clusters:
# keys are the index tuples of df_centroids, values the merged cluster label
cluster_mapper = df_centroids['hclust_labels'].to_dict()

df_ = df_normal.copy()

# Build each observation's key in the same column order as the centroid index,
# instead of hard-coding (neigh, hist): the index order follows the groupby
# that produced df_centroids, and a swapped order would silently map rows to
# the wrong merged cluster. Plain Python lists also avoid a slow row-wise apply.
key_cols = list(df_centroids.index.names)
label_tuples = zip(*(df_[col].tolist() for col in key_cols))
df_['merged_labels'] = [cluster_mapper[combo] for combo in label_tuples]

# Merged cluster centroids
df_.groupby('merged_labels')[metric_features].mean()

In [None]:
# Export the mapping (per-perspective label combination -> merged cluster) to Excel.
cluster_mapper_df = df_centroids['hclust_labels'].to_frame()
cluster_mapper_df.to_excel(os.path.join(explorations_data_path, 'cluster_mapper.xlsx'))

In [None]:
# Merged-cluster contingency table
# Size of each final (merged) cluster
df_counts = df_.groupby('merged_labels').size().to_frame()

# Relabel each merged cluster with the per-perspective label tuple it came from.
# NOTE: if several tuples map to the same merged cluster, only the last one survives here.
inverse_mapper = {merged: combo for combo, merged in cluster_mapper.items()}
df_counts = df_counts.rename(inverse_mapper).reset_index()

# Position of each label inside the tuple follows the centroid index rather than a hard-coded order
level_pos = {name: pos for pos, name in enumerate(df_centroids.index.names)}
df_counts['neigh_labels'] = df_counts['merged_labels'].apply(lambda combo: combo[level_pos['neigh_labels']])
df_counts['hist_labels'] = df_counts['merged_labels'].apply(lambda combo: combo[level_pos['hist_labels']])

# pivot() takes keyword arguments only from pandas 2.0 on; 0 is the unnamed size column
df_counts.pivot(index='neigh_labels', columns='hist_labels', values=0)

In [None]:
# From here on df_normal carries the history, neighborhood and merged cluster labels
df_normal = df_.copy()
df_normal

In [None]:
# Register the merged clustering alongside the two per-perspective ones.
cl_names.add('merged_labels')
cl_names

In [None]:
# Sorted so the list has the same order on every run (set iteration order is arbitrary)
cl_names_ll = sorted(cl_names)
cl_names_ll

In [None]:
# Export the cluster labels per donor. A set is not a valid column indexer in
# pandas 2 and has no stable order, so select with a sorted list.
df_[sorted(cl_names)].to_csv(os.path.join(explorations_data_path, 'cluster_donor_mapping.csv'))

# profiling

## Feature importance

In [None]:
# https://github.com/scikit-learn-contrib/boruta_py
# https://towardsdatascience.com/feature-selection-with-borutapy-f0ea84c9366


# load X and y
# NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
# X = pd.read_csv('examples/test_X.csv', index_col=0).values
# y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values
# y = y.ravel()

def feat_imp_boruta(X_df, y_s, boruta=True):
    """Flag the features Boruta considers relevant for predicting ``y_s``.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Candidate features.
    y_s : pandas.Series
        Target labels (here: cluster labels).
    boruta : bool, default True
        When False the (slow) Boruta run is skipped and every feature gets NaN.
        (This used to be written ``boruta:True``, which is an annotation and
        left the parameter without a default.)

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name (index name 'features') with one column
        ``support_b``: Boruta's boolean support, or NaN when skipped.
    """
    if not boruta:
        # scalar NaN is broadcast over the index; rename_axis works on a copy,
        # so the caller's column index is not renamed as a side effect
        return pd.DataFrame({'support_b': np.nan}, index=X_df.columns).rename_axis('features')

    # BorutaPy accepts numpy arrays only, hence .values
    X = X_df.values
    y = y_s.values

    print('X.shape:', X.shape)

    # random forest using all cores, with class weights in proportion to y labels
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

    # Boruta feature selection wrapped around that forest
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=1)
    feat_selector.fit(X, y)

    # boolean mask of selected features, then their ranking
    print(feat_selector.support_)
    print(feat_selector.ranking_)

    X_metadata = pd.DataFrame({
        'features': X_df.columns.to_list(),
        'support_b': feat_selector.support_
    })

    # number of supported features
    print(X_metadata.support_b.sum())

    return X_metadata.set_index('features')

def feat_imp_std(df, cl_name, th):
    """Rank features by how much their cluster means differ.

    For every feature the standard deviation of its per-cluster means is
    computed; features are sorted by it (largest first) and flagged as
    supported while the normalised cumulative sum stays at or below ``th``.
    The cumulative curve is plotted and saved as
    ``feat_imp_<cl_name without '_labels'>.jpeg``.

    Returns a DataFrame indexed by feature (index name 'features') with the
    columns 'std', 'std_cumsum' and 'support_std'.
    """
    plot_name = cl_name.replace('_labels', '')

    # spread of the cluster means per feature, most discriminating first
    centroid_std = df.groupby(cl_name).mean().std().sort_values(ascending=False)
    imp = centroid_std.to_frame(name='std')

    running_total = imp['std'].cumsum()
    imp['std_cumsum'] = running_total / running_total.max()

    sns.lineplot(data=imp, y=imp.index, x='std_cumsum')
    plt.axvline(th)
    plt.tight_layout()
    plt.savefig(os.path.join(explorations_data_path, f'feat_imp_{plot_name}.jpeg'), dpi=200)
    plt.show()

    imp['support_std'] = (imp['std_cumsum'] <= th).tolist()
    return imp.rename_axis('features')


def feat_imp(df, cl_name, cl_names_all, boruta, th):
    """Combine the Boruta and the centroid-spread feature rankings for one clustering.

    Parameters
    ----------
    df : pandas.DataFrame
        Features plus label column(s).
    cl_name : str
        Label column to explain.
    cl_names_all : iterable of str
        All known label columns; those other than ``cl_name`` are dropped first.
    boruta : bool
        Passed to ``feat_imp_boruta`` (False skips the Boruta run).
    th : float
        Cumulative threshold passed to ``feat_imp_std``.

    Returns
    -------
    pandas.DataFrame
        Both rankings side by side plus a 'support' column holding the row-wise
        sum of 'support_b' and 'support_std'.
    """
    other_labels = [name for name in cl_names_all if name != cl_name]
    # drop() already returns a new frame, so the caller's table is untouched
    df_ = df.drop(columns=other_labels, errors='ignore')

    # boruta
    imp_b = feat_imp_boruta(
        X_df=df_.drop(columns=cl_name, errors='ignore'),
        y_s=df_[cl_name],
        boruta=boruta
    )

    # std
    imp_std = feat_imp_std(
        df=df_,
        cl_name=cl_name,
        th=th
    )

    # combine; `axis` must be given by keyword (positional use fails on pandas 2)
    imp_comb = pd.concat([imp_b, imp_std], axis=1)

    # overall support
    cols_with_support = ['support_b', 'support_std']
    imp_comb['support'] = imp_comb[cols_with_support].apply(lambda x: np.sum(x), axis=1)

    return imp_comb


In [None]:
# Inspect the merged table that the importance and profiling cells work on.
df_normal

In [None]:
# history: feature importance for the history clustering (Boruta skipped, cumulative threshold 0.8)

imp_comb_hist = feat_imp(df=df_hist_normal,cl_name='hist_labels',cl_names_all=cl_names, boruta=False, th=.8)
imp_comb_hist


In [None]:
# neigh: feature importance for the neighborhood clustering (Boruta skipped, cumulative threshold 0.93)

imp_comb_neigh = feat_imp(df=df_neigh_normal,cl_name='neigh_labels',cl_names_all=cl_names, boruta=False, th=0.93)
imp_comb_neigh


In [None]:
# merged: feature importance for the merged clustering (Boruta skipped, cumulative threshold 0.85)
imp_comb_merged = feat_imp(df=df_normal,cl_name='merged_labels',cl_names_all=cl_names, boruta=False, th=0.85)
imp_comb_merged

## leverage

In [None]:

def calc_leverage_values(x):
    """Aggregate one cluster's rows for the leverage computation.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single cluster; must contain a 'donations_total' column.

    Returns
    -------
    pandas.Series
        'total_donations' (sum of 'donations_total') and 'size' (number of rows).
    """
    return pd.Series({
        'total_donations': x['donations_total'].sum(),
        # len(x) counts rows; DataFrame.size would be rows * columns
        'size': len(x),
    })

def build_leverage_df(df, cl_name): 
    """Build the per-cluster leverage table.

    Leverage is a cluster's share of all donations divided by its share of the
    total size, rounded to two decimals. The cluster label is returned as a
    regular column.
    """
    per_cluster = df.groupby(cl_name).apply(calc_leverage_values)

    donation_share = per_cluster['total_donations'] / per_cluster['total_donations'].sum()
    size_share = per_cluster['size'] / per_cluster['size'].sum()

    per_cluster['total_donations_rel'] = donation_share
    per_cluster['size_rel'] = size_share
    per_cluster['leverage'] = (donation_share / size_share).round(2)

    return per_cluster.reset_index()

def build_leverage_plotdata(leverage_df, cl_name):
    """Prepare the inputs of the leverage bar plot.

    Parameters
    ----------
    leverage_df : pandas.DataFrame
        Needs the columns ``cl_name``, 'total_donations_rel', 'size_rel' and 'leverage'.
    cl_name : str
        Name of the cluster label column.

    Returns
    -------
    lev_plotdata : pandas.DataFrame
        Long format (``cl_name``, 'variable', 'value') with the label cast to str.
    lev_text : pandas.DataFrame
        Columns ``cl_name``, 'leverage', 'y_coord' - one annotation per cluster,
        placed slightly above the taller of its two bars.
    """
    share_cols = ['total_donations_rel', 'size_rel']

    # bars
    lev_plotdata = leverage_df[[cl_name] + share_cols].melt(id_vars=cl_name)
    lev_plotdata[cl_name] = lev_plotdata[cl_name].astype(str)

    # text annotations: 5% of the tallest bar above each cluster's taller bar
    max_val_per_cl = leverage_df[share_cols].max(axis=1)
    dist = max_val_per_cl.max() * 0.05

    # Attach y_coord row by row (index alignment). The previous merge joined the
    # row position against the label value, which only works for labels 0..k-1.
    lev_text = leverage_df[[cl_name, 'leverage']].assign(y_coord=max_val_per_cl + dist)

    return lev_plotdata, lev_text

def plot_leverage(lev_plotdata, lev_text, cl_name, plot_name):
    """Draw the grouped bar plot of donation share vs. size share per cluster,
    annotate every cluster with its leverage and save the figure as
    ``leverage_<plot_name>.jpeg`` in ``explorations_data_path``.

    ``lev_text`` must have the columns (cluster label, leverage, y coordinate)
    in that order, with rows in the same order as the clusters appear in
    ``lev_plotdata``.
    """
    g = sns.barplot(data=lev_plotdata, x = cl_name, y='value', hue='variable')

    # Bars of string categories sit at positions 0, 1, 2, ... in order of
    # appearance, so the text is anchored at the row position rather than at
    # the label value (which only coincides for labels 0..k-1).
    for pos, (_, s, y) in enumerate(lev_text.itertuples(index=False)):
        plt.text(x=pos, y=y, s=s, horizontalalignment='center')

    # leave head room for the annotations
    g.set_ylim(top=g.get_ylim()[1]*1.1)

    plt.tight_layout()
    plt.savefig(os.path.join(explorations_data_path, f'leverage_{plot_name}.jpeg'), dpi=200)
    plt.show()

def build_plot_leverage(df, cl_name, plot_name): 
    """Compute the leverage table for ``cl_name``, plot it and return the table."""
    leverage_table = build_leverage_df(df, cl_name)
    plot_leverage(*build_leverage_plotdata(leverage_table, cl_name), cl_name, plot_name)
    return leverage_table




In [None]:

# Leverage per history cluster (figure saved as leverage_hist_test.jpeg)
leverage_df = build_plot_leverage(
    df=df_normal, 
    cl_name='hist_labels', 
    plot_name='hist_test'
)

#leverage_df

In [None]:

# Leverage per neighborhood cluster (figure saved as leverage_neigh.jpeg)
leverage_df = build_plot_leverage(
    df=df_normal, 
    cl_name='neigh_labels', 
    plot_name='neigh'
)

leverage_df

In [None]:

# Leverage per merged cluster (figure saved as leverage_merged.jpeg)
leverage_df = build_plot_leverage(
    df=df_normal, 
    cl_name='merged_labels', 
    plot_name='merged'
)

leverage_df

## Cluster means

In [None]:
def cluster_profiles(df, label_columns, figsize, compar_titles=None):
    """
    Plot one profile row per clustering: a parallel-coordinates plot of the
    cluster centroids on the left and a bar plot of the cluster sizes on the
    right. The figure is saved as ``profile_mean_<label columns>.jpeg`` in
    ``explorations_data_path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Features plus one column per clustering named in ``label_columns``.
    label_columns : list of str
        Label columns to profile, one figure row each.
    figsize : tuple
        Passed on to ``plt.subplots``.
    compar_titles : list of str, optional
        One annotation per entry of ``label_columns``; empty strings by default.

    Returns
    -------
    dict
        Legend label -> line colour of the last clustering that was profiled.
    """
    if compar_titles is None:
        compar_titles = [""]*len(label_columns)

    # defined up front so the return below never hits an unbound name
    color_mapper = {}

    sns.set()
    fig, axes = plt.subplots(nrows=len(label_columns), ncols=2, figsize=figsize, squeeze=False)
    for ax, label, titl in zip(axes, label_columns, compar_titles):
        # Filtering df: keep only the label column of the current clustering
        drop_cols = [i for i in label_columns if i!=label]
        dfax = df.drop(drop_cols, axis=1)

        # Getting the cluster centroids and counts
        centroids = dfax.groupby(by=label, as_index=False).mean()
        counts = dfax.groupby(by=label, as_index=False).count().iloc[:,[0,1]]
        counts.columns = [label, "counts"]

        # Setting Data
        pd.plotting.parallel_coordinates(centroids, label, color=sns.color_palette(), ax=ax[0])
        sns.barplot(x=label, y="counts", data=counts, ax=ax[1])

        # Setting Layout
        handles, _ = ax[0].get_legend_handles_labels()
        cluster_labels = ["Cluster {}".format(i) for i in range(len(handles))]

        # remember which colour each cluster line was drawn in
        color_mapper = {handle.get_label(): handle.get_color() for handle in handles}

        ax[0].annotate(text=titl, xy=(0.95,1.1), xycoords='axes fraction', fontsize=13, fontweight = 'heavy')
        ax[0].legend(handles, cluster_labels) # Adaptable to number of clusters
        ax[0].axhline(color="black", linestyle="--")
        ax[0].set_title("Cluster Means - {} Clusters".format(len(handles)), fontsize=13)
        ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=30)
        ax[1].set_xticklabels(cluster_labels)
        ax[1].set_xlabel("")
        ax[1].set_ylabel("Absolute Frequency")
        ax[1].set_title("Cluster Sizes - {} Clusters".format(len(handles)), fontsize=13)

    # NOTE: the former `plt.title = ''` was removed - it replaced the pyplot
    # function `plt.title` with a string and broke every later plt.title(...) call.
    plt.tight_layout()

    filename = ''.join(label_columns)
    plt.savefig(os.path.join(explorations_data_path, f'profile_mean_{filename}.jpeg'), dpi=200)

    plt.show()

    return color_mapper

In [None]:
# Re-display the history importance table that the next cell filters on.
imp_comb_hist

In [None]:
# history: profile the history clusters on the features with at least one support vote
cl_name_hist = 'hist_labels'
feat_supp_hist = imp_comb_hist[imp_comb_hist['support'] >= 1].index.to_list()
print('feat_supp_hist: ', feat_supp_hist)

# figure size in inches for an 1800 x 1000 px image at 200 dpi
my_dpi = 200
figsize = (1800 / my_dpi, 1000 / my_dpi)

cp = cluster_profiles(
    df=df_hist_normal[feat_supp_hist + [cl_name_hist]],
    label_columns=[cl_name_hist],
    figsize=figsize,
    compar_titles=["History clustering"],
)

In [None]:
# neigh: profile the neighborhood clusters on the features with at least one support vote
cl_name_neigh = 'neigh_labels'
feat_supp_neigh = imp_comb_neigh[imp_comb_neigh['support'] >= 1].index.to_list()
print('feat_supp_neigh: ', feat_supp_neigh)

# figure size in inches for an 1800 x 1000 px image at 200 dpi
my_dpi = 200
figsize = (1800 / my_dpi, 1000 / my_dpi)

cp = cluster_profiles(
    df=df_neigh_normal[feat_supp_neigh + [cl_name_neigh]],
    label_columns=[cl_name_neigh],
    figsize=figsize,
    compar_titles=["Neighborhood clustering"],
)

In [None]:
# merged: profile the merged clusters on the features with at least one support vote
feat_supp_merged = imp_comb_merged.loc[imp_comb_merged.support >= 1,].index.to_list()
# label fixed: this cell used to announce the merged list as 'feat_supp_neigh'
print('feat_supp_merged: ', feat_supp_merged)
cl_name_merged = 'merged_labels'

# figure size in inches for an 1800 x 1000 px image at 200 dpi
my_dpi = 200
figsize=(
    1800/my_dpi, 1000/my_dpi
)

# cp keeps the colour of every merged cluster; it is pickled further down
cp = cluster_profiles(
    df = df_normal[feat_supp_merged + [cl_name_merged]],
    label_columns = [cl_name_merged],
    figsize = figsize,
    compar_titles = ["Merged clustering"]
)


In [None]:
# Visual check of the colour assigned to each cluster label, then persist the
# mapping returned by the last cluster_profiles call.
plotdata = pd.DataFrame(cp).T.sort_values(0)
plotdata
sns.countplot(x = plotdata.index, palette=cp)

with open(os.path.join(explorations_data_path, 'cluster_color_mapper.pickle'), 'wb') as f: 
    pickle.dump(cp, f)


In [None]:
from matplotlib.colors import rgb2hex
# Scratch conversion of one RGB colour tuple to its hex code.
# NOTE(review): presumably one of the cluster colours from `cp` - confirm or remove.
rgb2hex((0.8666666666666667, 0.5176470588235295, 0.3215686274509804))

In [None]:
# Re-display the mapping from label combination to merged cluster.
cluster_mapper_df

In [None]:

# Mean of every supported feature per merged cluster, rounded to 4 decimals.
# Uses the groupby mean directly: np.mean on a DataFrame inside apply() collapses
# each group to a single scalar on pandas 2, and apply() also averaged the label
# column itself.
(
    df_normal
    .groupby(cl_name_merged)[feat_supp_merged]
    .mean()
    .round(4)
    .to_excel(os.path.join(explorations_data_path, 'feat_mean_per_cluster.xlsx'))
)