In [1]:
# Remember: library imports are ALWAYS at the top of the script, no exceptions!
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
import warnings
warnings.filterwarnings('ignore')
import dtale
from MulticoreTSNE import MulticoreTSNE as mcTSNE

from sklearn.impute import KNNImputer
from sklearn.preprocessing import *
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import *

from itertools import product
from pandas_profiling import ProfileReport
# from prince import MCA

%config InlineBackend.figure_format = 'retina'

# Setting seaborn style
sns.set()

from datetime import datetime
# import calendar
# import missingno as msno
from sklearn.impute import SimpleImputer
import scipy.stats as ss
from sklearn.ensemble import IsolationForest

import hdbscan
from helper_functions import *
template = "plotly_dark"

colors = (["#505050", "#d1675b"])
import plotly.express as px
%load_ext autoreload
%autoreload 2

ModuleNotFoundError: No module named 'MulticoreTSNE'

In [None]:
# Load the pickled frame that every cell below reads and mutates as `data`.
# NOTE(review): presumably the output of an earlier PCA step — confirm.
pca_dataset_path = "datasets/data_PCA.pkl"
data = pd.read_pickle(pca_dataset_path)

In [None]:
# Collapse the three RFA components into one string code (RECENCY, then
# FREQUENCY, then AMOUNT) and remove the source columns from the frame.
# The drop is in place, so re-running this cell without reloading `data`
# raises a KeyError because the three columns no longer exist.
rfa_parts = ["RECENCY", "FREQUENCY", "AMOUNT"]
rfa_as_text = data[rfa_parts].astype(str)
data["RFA"] = rfa_as_text["RECENCY"] + rfa_as_text["FREQUENCY"] + rfa_as_text["AMOUNT"]
data.drop(columns=rfa_parts, inplace=True)

In [None]:
# Column groups used throughout the notebook.

# Full list of original input columns.
# NOTE(review): RECENCY, FREQUENCY and AMOUNT are dropped from `data` by the
# RFA cell above, so this list no longer matches the frame as-is.
features = [
    "DOMAIN",
    "STATE",
    "AGE",
    "INCOME",
    "GENDER",
    "RECENCY",
    "FREQUENCY",
    "AMOUNT",
    "RAMNTALL",
    "NGIFTALL",
    "LASTGIFT",
    "AVGGIFT",
    "LASTDATE_MONTHS",
    "FIRSTDATE_MONTHS",
]

# Numeric columns fed to the outlier detectors and the clustering.
metric_features = [
    "AGE",
    "RAMNTALL",
    "LASTGIFT",
    "NGIFTALL",
    "AVGGIFT",
    "LASTDATE_MONTHS",
    "FIRSTDATE_MONTHS",
]

# Principal-component columns PC0..PC3.
pca_features = [f"PC{component}" for component in range(4)]

# Categorical columns; these are ordinal-encoded further down.
non_metric_features = [
    "DOMAIN",
    "STATE",
    "INCOME",
    "GENDER",
    "RFA",
]

In [None]:
# Display the frame's column index to check which columns are available
# before the encoding and outlier cells below reference them by name.
data.columns

In [None]:
# Ordinal-encode every non-metric column into a parallel "enc_<column>" column,
# then display the encoded columns.
encoder = OrdinalEncoder()

non_metric_features_encoded = ["enc_" + feature for feature in non_metric_features]
encoded_values = encoder.fit_transform(data[non_metric_features])
data[non_metric_features_encoded] = encoded_values
data[non_metric_features_encoded]

In [None]:
# We don't use STATE here because the observations are 18% from california and there are states with less than 0.01%
# The forest draws its sub-samples at random, so the seed is fixed; without it
# the set of flagged rows changes on every run and the saved results cannot be
# reproduced.
ISOLATION_FOREST_SEED = 42
iforest_columns = ["enc_INCOME", "enc_GENDER", "enc_RFA", "enc_DOMAIN"] + pca_features + metric_features
iforests = IsolationForest(
    n_jobs=-1,
    contamination=0.05,
    bootstrap=True,
    max_samples=0.05,
    n_estimators=50,
    random_state=ISOLATION_FOREST_SEED,
)
# fit_predict returns +1 for inliers and -1 for outliers; the arithmetic maps
# that to 0 = inlier, 1 = outlier.
data["outlier_if"] = (iforests.fit_predict(data[iforest_columns]) * -1 + 1) // 2

In [None]:
# PC0 against the RFA code, coloured by the isolation-forest flag (outlier_if).
px.scatter(
    data,
    x="PC0",
    y="RFA",
    color="outlier_if",
    color_continuous_scale=colors,
    opacity=0.5,
)

In [None]:
# Second outlier detector: fit HDBSCAN on the metric + PCA columns and flag the
# rows whose outlier score lies above the 90th percentile of all scores.
clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
clusterer.fit(data[metric_features + pca_features])

# outlier_scores_ holds one score per fitted row, in row order. Re-attach data's
# own index so the flag is aligned by row. Writing positional row numbers through
# label-based `.loc` is only correct when the index is a default RangeIndex.
outlier_scores = pd.Series(clusterer.outlier_scores_, index=data.index)
threshold = outlier_scores.quantile(0.9)
# Positional row numbers of the flagged rows (kept under the original name).
outliers = np.where(clusterer.outlier_scores_ > threshold)[0]
# 1 = flagged, 0 = not flagged.
data["outliers_hdbscan"] = (outlier_scores > threshold).astype(int)
px.scatter(data, x="PC0", y="RFA", color="outliers_hdbscan", color_continuous_scale=colors, opacity=0.5)

In [None]:
# Total gift amount (RAMNTALL) against the RFA code, coloured by `outlier`.
# NOTE(review): no visible cell creates an "outlier" column (only "outlier_if"
# and "outliers_hdbscan") — confirm it is already present in the loaded frame.
px.scatter(
    data,
    x="RAMNTALL",
    y="RFA",
    color="outlier",
    color_continuous_scale=colors,
    opacity=0.5,
)

In [None]:
# Enumerate every <letter><digit><letter> RFA code: first letter in A/L/I,
# digit 1-4, last letter A-G, with the last letter varying fastest.
rfa_categories = [
    first_letter + str(digit) + last_letter
    for first_letter, digit, last_letter in product(
        ["A", "L", "I"],
        range(1, 5),
        ["A", "B", "C", "D", "E", "F", "G"],
    )
]

# Convert RFA to an unordered categorical restricted to those codes; any value
# outside the list becomes NaN.
data["RFA"] = pd.Categorical(data["RFA"], categories=rfa_categories, ordered=False)

In [None]:
# I think by choosing outliers we just picked the actual best cluster...!
# 3D view: average gift vs. total gift amount vs. RFA code, coloured by `outlier`.
# NOTE(review): "outlier" is not created in any visible cell — confirm it exists.
px.scatter_3d(
    data,
    x="AVGGIFT",
    y="RAMNTALL",
    z="RFA",
    color="outlier",
    color_continuous_scale=colors,
    opacity=0.5,
    height=800,
)

In [None]:
# Share of rows whose merged outlier flag equals 1.
# NOTE(review): no visible cell creates "outliers_merge" — confirm it exists in
# the loaded frame.
# `.get(1, 0.0)` avoids a KeyError when no row is flagged, and the value is
# formatted as a percentage so it matches the label (the raw value is a fraction).
outlier_share = data["outliers_merge"].value_counts(normalize=True).get(1, 0.0)
print(f"percentage of outliers: {outlier_share:.2%}")

In [None]:
# Choropleth of the per-state mean of `var` over the USA.
# NOTE(review): this import sits mid-notebook although the import cell asks for
# all imports at the top; it is kept here so the cell stays self-contained.
import plotly.graph_objs as go

# Column whose state-level mean is mapped.
# NOTE(review): "outlier" is not created in any visible cell — confirm it exists.
var = "outlier"

# Work on a copy with a fresh positional index and string column labels
# (in case some labels are numbers).
df = data.copy().reset_index().drop("index", axis=1, errors="ignore")
df.columns = list(map(str, df.columns))

# One row per state holding the mean of `var`; rows without a state are removed.
chart_data = (
    pd.concat([df["STATE"], df[var]], axis=1)
    .groupby(["STATE"])[[var]]
    .mean()
    .reset_index()
    .dropna(subset=["STATE"])
)

# Colour stops: the accent colour at the minimum, fading to near-white at the maximum.
state_colorscale = [
    [0.0, colors[1]],
    [0.05, "#fdcab4"],
    [1.0, "#fff5f0"],
]

chart = go.Choropleth(
    locations=chart_data["STATE"],
    locationmode="USA-states",
    z=chart_data[var],
    colorscale=state_colorscale,
    colorbar_title=var,
    zmin=chart_data[var].min(),
    zmax=chart_data[var].max(),
)

layout_spec = {
    "autosize": True,
    "geo": {"scope": "usa"},
    "legend": {"orientation": "h"},
    "margin": {"b": 0, "l": 0, "r": 0},
    "title": {"text": f"Map of {var} (Mean)"},
}
figure = go.Figure(data=[chart], layout=go.Layout(layout_spec))
figure.show()

In [None]:
# Checkpoint: persist the frame, including the outlier flag columns added above.
outliers_checkpoint_path = "datasets/data_outliers.pkl"
data.to_pickle(outliers_checkpoint_path)

In [None]:
# Resume from the checkpoint: replace `data` with the frame stored at
# datasets/data_outliers.pkl.
outliers_checkpoint_path = "datasets/data_outliers.pkl"
data = pd.read_pickle(outliers_checkpoint_path)

In [None]:
# Cluster on the metric columns only, using a large minimum cluster size.
hdbscan_settings = {
    "min_cluster_size": 1000,
    "min_samples": 1,
    "core_dist_n_jobs": -1,
}
clusterer = hdbscan.HDBSCAN(**hdbscan_settings)
clusterer.fit(data[metric_features])

In [None]:
# Number of rows assigned to each cluster label.
cluster_labels = pd.Series(clusterer.labels_)
cluster_labels.value_counts()

In [None]:
# Load the cached t-SNE embedding from disk. The original call was `np.load_`,
# which does not exist in NumPy and raises AttributeError; the function is
# `np.load`.
tsne_xs = np.load("tsne/tsne_default.npy")

In [None]:
# tsne = TSNE()
# tsne_xs = tsne.fit_transform(data[metric_features])

In [None]:
# Write the embedding back to the cache; np.save appends ".npy" to the name,
# so this targets tsne/tsne_default.npy.
tsne_cache_name = "tsne/tsne_default"
np.save(tsne_cache_name, tsne_xs)

In [None]:
# Store the HDBSCAN labels on the frame and plot the 2D t-SNE embedding,
# coloured by cluster. The labels are cast to str so they are passed as
# strings rather than numbers.
data["cluster"] = clusterer.labels_

tsne_x = tsne_xs[:, 0]
tsne_y = tsne_xs[:, 1]
cluster_as_text = data["cluster"].astype(str)
px.scatter(
    x=tsne_x,
    y=tsne_y,
    color=cluster_as_text,
    opacity=1,
    height=800,
)