<a href="https://colab.research.google.com/github/deshanchathusanka/human-centered-visual-analytics/blob/main/bank_marketing_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# &nbsp;$\color{Blue}{\text{1. Altair Tutorial}}$

In [1]:
!pip install altair vega_datasets
!pip install umap-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 3.7 MB/s 
Collecting pynndescent>=0.5
  Downloading pynndescent-0.5.8.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 10.6 MB/s 
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82829 sha256=445870b7d542a3911f0b109aced221aac235c77111bbbde8ac27793a4876b762
  Stored in directory: /root/.cache/pip/wheels/b3/52/a5/1fd9e3e76a7ab34f134c07469cd6f16e27ef3a37aeff1fe821
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for pynndescent: filename=pynndescent-0.5.8-py3-none-any.whl siz

In [2]:
import altair as alt
from vega_datasets import data
from google.colab import drive
import pandas as pd
from numpy import NaN
from altair import datum

In [3]:
drive.mount('/content/drive', force_remount = True)
!ln -s "/content/drive/My Drive/Academic/CSCM 27 - HCVA/data"

Mounted at /content/drive


In [4]:
# Altair warm-up: interactive scatter plot of the vega_datasets "cars" sample.
source = data.cars()

# Only the final expression of a cell is rendered, so the chart is the single
# output of this cell (a mid-cell preview of the frame would be discarded).
alt.Chart(source).mark_circle(size=60).encode(
    x='Horsepower',
    y='Miles_per_Gallon',
    color='Origin',
    tooltip=['Name', 'Origin', 'Horsepower', 'Miles_per_Gallon']
).interactive()

# &nbsp;$\color{Blue}{\text{2. Dataset Selection}}$

## 2.1 Load dataset

| Attribute | Definition | Type | Values|
| --- | --- | --- | --- |
| Age | Age | Numerical ||
| Job | Job | Categorical||
| Marital | Marital status | Categorical ||
| Education | Education level | Categorical ||
| Default | Credit in default | Categorical ||
| Balance | Account balance | Numerical ||
| Housing | Housing loan | Categorical ||
| Loan | Personal loan | Categorical ||
| Contact | Communication Type | Categorical | Cellular, telephone |
| Day | Last contact day of week | Categorical ||
| Month | Last contact month of year | Categorical ||
| Duration | Last contact duration in seconds | Numerical ||
| Campaign | Number of contacts performed during this campaign | Numerical ||
| Pdays | Number of days since the client was last contacted in a previous campaign (999 = not previously contacted) | Numerical ||
| Previous | Number of contacts performed before this campaign | Numerical ||
| Poutcome | Outcome of the previous campaign | Categorical ||

In [5]:
# Read the semicolon-delimited bank marketing CSV from the linked data folder
# and preview its first rows.
df = pd.read_csv(
    '/content/data/bank-additional/bank-additional/bank-additional.csv',
    sep=';',
)
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


In [6]:
# Inspect the dtypes pandas inferred for each column when reading the CSV.
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [7]:
# Nominal columns that receive a companion "<name>_cat" column of integer codes.
coded_columns = ['job', 'marital', 'education', 'default', 'housing',
                 'loan', 'contact', 'month', 'poutcome']

# Cast every nominal column (including the target y) to the category dtype.
for column in coded_columns + ['day_of_week', 'y']:
    df[column] = df[column].astype('category')

# Append the integer category codes next to the original labelled columns.
for column in coded_columns:
    df[f'{column}_cat'] = df[column].cat.codes

# day_of_week is overwritten with its codes in place (no separate *_cat column);
# the parallel-coordinates cell selects it under this original name.
df['day_of_week'] = df['day_of_week'].cat.codes

# Swap the dots in the macro-economic column names for underscores
# (presumably so they are safe to use as Altair field names -- confirm).
dotted_to_snake = {
    'emp.var.rate': 'emp_var_rate',
    'cons.price.idx': 'cons_price_idx',
    'cons.conf.idx': 'cons_conf_idx',
    'nr.employed': 'nr_employed',
}
df.rename(columns=dotted_to_snake, inplace=True)
df.dtypes

age                  int64
job               category
marital           category
education         category
default           category
housing           category
loan              category
contact           category
month             category
day_of_week           int8
duration             int64
campaign             int64
pdays                int64
previous             int64
poutcome          category
emp_var_rate       float64
cons_price_idx     float64
cons_conf_idx      float64
euribor3m          float64
nr_employed        float64
y                 category
job_cat               int8
marital_cat           int8
education_cat         int8
default_cat           int8
housing_cat           int8
loan_cat              int8
contact_cat           int8
month_cat             int8
poutcome_cat          int8
dtype: object

In [8]:
# List the poutcome category labels; a label's position in this index is the
# integer code stored in poutcome_cat.
df['poutcome'].cat.categories

Index(['failure', 'nonexistent', 'success'], dtype='object')

In [9]:
# Rows whose previous campaign outcome was a success.
df.loc[df['poutcome'] == 'success']

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,y,job_cat,marital_cat,education_cat,default_cat,housing_cat,loan_cat,contact_cat,month_cat,poutcome_cat
21,39,technician,divorced,high.school,no,no,no,cellular,mar,1,...,yes,9,0,3,0,0,0,0,5,2
53,24,technician,married,university.degree,no,yes,yes,cellular,sep,3,...,yes,9,1,6,0,2,2,0,9,2
62,52,admin.,married,unknown,no,yes,no,telephone,apr,4,...,yes,0,1,7,0,2,0,1,0,2
68,32,blue-collar,married,basic.9y,no,no,no,cellular,nov,1,...,yes,1,1,2,0,0,0,0,7,2
83,57,retired,married,university.degree,no,yes,no,cellular,apr,2,...,no,5,1,6,0,2,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3879,50,self-employed,married,basic.9y,no,yes,yes,cellular,may,1,...,yes,6,1,2,0,2,2,0,6,2
3983,43,admin.,married,university.degree,no,no,yes,cellular,aug,3,...,no,0,1,6,0,0,2,0,1,2
4014,36,self-employed,single,university.degree,no,yes,no,cellular,nov,1,...,no,6,2,6,0,2,0,0,7,2
4088,47,management,married,university.degree,no,yes,no,cellular,sep,1,...,yes,4,1,6,0,2,0,0,9,2


# &nbsp;$\color{Blue}{\text{3. Implementation}}$

## 3.1 Implementation 01 (Normalized Parallel Coordinates)

In [10]:
# Features drawn as the axes of the parallel-coordinates plot.
selected_features = [
    'age', 'job_cat', 'marital_cat', 'education_cat', 'default_cat',
    'housing_cat', 'loan_cat', 'contact_cat', 'day_of_week', 'month_cat',
    'duration', 'campaign', 'previous', 'pdays', 'poutcome_cat',
    'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed',
]
selected_df = df[selected_features]

# Min-max scale every feature so all axes share one vertical scale.
feature_min = selected_df.min()
feature_span = selected_df.max() - feature_min
df_norm = (selected_df - feature_min) / feature_span
df_norm['y'] = df['y']

# Fixed colours for the two values of the target column y.
outcome_colour = alt.Color(
    'y:N',
    scale=alt.Scale(domain=['yes', 'no'], range=['red', 'green']),
)

parallel_chart = (
    alt.Chart(data=df_norm, title='Parallel Coordinate Visualization')
    .transform_window(index='count()')    # per-row index, used below as the line identity
    .transform_fold(selected_features)    # wide -> long: one (key, value) pair per feature
    .mark_line()
    .encode(
        x='key:N',
        y='value:Q',
        color=outcome_colour,
        detail='index:N',
        opacity=alt.value(0.5),
    )
    .properties(width=1200)
)
parallel_chart

${\color{Red}{\text{Prominent features: }}
\color{Blue}{\text{Age, Consumer confidence index, Duration, Previous campaign outcome}
}}$

## 3.2 Implementation 02 (Binned Heatmap Matrix of Success Rate)

In [60]:
# Integer category code of the target y, so it can be averaged in charts.
df['y_code'] = df['y'].cat.codes
# Spot-check the label/code pairing on the rows coded 1.
df.loc[df['y_code'] == 1, ['y', 'y_code']]

Unnamed: 0,y,y_code
19,yes,1
21,yes,1
25,yes,1
53,yes,1
59,yes,1
...,...,...
4034,yes,1
4066,yes,1
4069,yes,1
4088,yes,1


In [105]:
# Features placed on both the rows and the columns of the heatmap matrix.
heatmap_features = ['age', 'duration', 'cons_conf_idx']

# One binned 2-D heatmap; each cell is coloured by the mean of y_code.
heatmap_cell = (
    alt.Chart(data=df)
    .mark_rect()
    .encode(
        alt.X(alt.repeat("column"), type='quantitative', bin=alt.Bin(maxbins=10)),
        alt.Y(alt.repeat("row"), type='quantitative', bin=alt.Bin(maxbins=10)),
        alt.Color('mean(y_code):Q', scale=alt.Scale(scheme='plasma'), title = 'success rate'),
    )
    .properties(width=200, height=200)
)

# Repeat the cell over every row/column feature pair.
heatmap_cell.repeat(row=heatmap_features, column=heatmap_features).interactive()

# &nbsp;$\color{Red}{\text{References}}$

[1] https://altair-viz.github.io/getting_started/installation.html

[2] https://www.xenonstack.com/blog/data-visualization-techniques

[3] https://archive.ics.uci.edu/ml/datasets/Dry+Bean+Dataset

[4] Koklu, M., & Ozkan, I. A. (2020). Multiclass classification of dry beans using computer vision and machine learning techniques. Computers and Electronics in Agriculture, 174, 105507.

[5] https://colab.research.google.com/github/altair-viz/altair-tutorial/blob/master/notebooks/Index.ipynb

[6] https://online.hbs.edu/blog/post/data-visualization-techniques

[7] https://altair-viz.github.io/user_guide/compound_charts.html?highlight=facet

[8] https://cast42.github.io/blog/datascience/python/clustering/altair/shap/2020/04/23/explain-clusters-to-business.html

[9] https://altair-viz.github.io/gallery/index.html

[10] https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html

[11] https://www.youtube.com/watch?v=MnRskV3NY1k