<a href="https://colab.research.google.com/github/deshanchathusanka/human-centered-visual-analytics/blob/main/bank_marketing_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# &nbsp;$\color{Blue}{\text{1. Altair Tutorial}}$

In [129]:
!pip install altair vega_datasets
!pip install umap-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [130]:
import altair as alt
from vega_datasets import data
from google.colab import drive
import pandas as pd
from numpy import NaN
from altair import datum

In [131]:
drive.mount('/content/drive', force_remount = True)
!ln -s "/content/drive/My Drive/Academic/CSCM 27 - HCVA/data"

Mounted at /content/drive
ln: failed to create symbolic link './data': File exists


In [132]:
# Altair warm-up: interactive scatter plot of the vega_datasets cars sample.
source = data.cars()
# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly or it is silently discarded.
display(source.head())

# Horsepower against fuel economy, coloured by region of origin, with hover details.
alt.Chart(source).mark_circle(size=60).encode(
    x='Horsepower',
    y='Miles_per_Gallon',
    color='Origin',
    tooltip=['Name', 'Origin', 'Horsepower', 'Miles_per_Gallon']
).interactive()

# &nbsp;$\color{Blue}{\text{2. Dataset Selection}}$

## 2.1 Load dataset

| Attribute | Definition | Type | Values|
| --- | --- | --- | --- |
| Age | Age | Numerical ||
| Job | Job | Categorical||
| Marital | Marital status | Categorical ||
| Education | Education level | Categorical ||
| Default | Credit in default | Categorical ||
| Balance | Account balance | Numerical ||
| Housing | Housing loan | Categorical ||
| Loan | Personal loan | Categorical ||
| Contact | Contact communication type | Categorical | cellular, telephone, unknown |
| Day | Last contact day of the month | Numerical ||
| Month | Last contact month of year | Categorical ||
| Duration | Last contact duration in seconds | Numerical ||
| Campaign | Number of contacts performed during this campaign | Numerical ||
| Pdays | Number of days since the client was last contacted in a previous campaign | Numerical | -1 = not previously contacted |
| Previous | Number of contacts performed before this campaign | Numerical ||
| Poutcome | Outcome of the previous campaign | Categorical ||

In [133]:
# Load the semicolon-delimited bank marketing file and preview its first rows.
df = pd.read_csv('/content/data/bank/bank.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [134]:
# Show the dtype pandas inferred for every column of the loaded frame.
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [135]:
# Text-valued feature columns. Each one is converted to a pandas categorical and
# also gets an integer-coded companion column named `<column>_cat`.
categorical_features = ['job', 'marital', 'education', 'default', 'housing',
                        'loan', 'contact', 'month', 'poutcome']

for column in categorical_features:
    df[column] = df[column].astype('category')
    # cat.codes is the integer position of each value within the column's categories.
    df[f'{column}_cat'] = df[column].cat.codes

# The target becomes a categorical too, but no numeric code column is derived for it.
df['y'] = df['y'].astype('category')

# Spot-check the encoding on the clients whose previous campaign succeeded.
df[df['poutcome']=='success']

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,y,job_cat,marital_cat,education_cat,default_cat,housing_cat,loan_cat,contact_cat,month_cat,poutcome_cat
49,61,admin.,married,unknown,no,4629,yes,no,cellular,27,...,yes,0,1,3,0,1,0,0,4,2
70,37,management,married,tertiary,no,0,no,no,cellular,16,...,yes,4,1,2,0,0,0,0,5,2
98,36,blue-collar,divorced,secondary,no,2843,no,no,cellular,12,...,no,1,0,1,0,0,0,0,3,2
110,21,student,single,secondary,no,2488,no,no,cellular,30,...,yes,8,2,1,0,0,0,0,6,2
156,55,admin.,married,secondary,no,602,no,no,cellular,1,...,yes,0,1,1,0,0,0,0,6,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4309,40,services,single,secondary,no,1547,yes,no,cellular,12,...,no,7,2,1,0,1,0,0,8,2
4323,83,retired,divorced,primary,no,0,no,no,telephone,31,...,no,5,0,0,0,0,0,1,8,2
4433,40,management,married,tertiary,no,-17,yes,yes,cellular,11,...,yes,4,1,2,0,1,1,0,8,2
4458,29,management,single,tertiary,no,572,no,no,cellular,16,...,yes,4,2,2,0,0,0,0,2,2


# &nbsp;$\color{Blue}{\text{3. Implementation}}$

## 3.1 Implementation 01 (Normalized Parallel Coordinates)

In [136]:
# Columns drawn as the axes of the parallel-coordinates plot.
selected_features = [
    'age', 'job_cat', 'marital_cat', 'education_cat', 'default_cat',
    'housing_cat', 'loan_cat', 'contact_cat', 'day', 'month_cat',
    'duration', 'campaign', 'previous', 'pdays', 'poutcome_cat',
]
selected_df = df[selected_features]

# Min-max scale each feature so that every axis shares the same value range.
df_norm = selected_df.sub(selected_df.min()).div(selected_df.max() - selected_df.min())
df_norm['y'] = df['y']

# Colour each line by the campaign outcome stored in `y`.
outcome_color = alt.Color(
    'y:N',
    scale=alt.Scale(domain=['yes', 'no'], range=['red', 'green']),
)

# Number the rows, fold the feature columns into key/value pairs, and draw one line per row.
alt.Chart(df_norm).transform_window(
    index='count()'
).transform_fold(
    selected_features
).mark_line().encode(
    x='key:N',
    y='value:Q',
    color=outcome_color,
    detail='index:N',
    opacity=alt.value(0.5),
).properties(width=1000)

In [137]:
# Alternative, longer feature list; this rebinds `selected_features`.
# NOTE(review): 'day_of_week', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
# 'euribor3m' and 'nr.employed' are not among the columns listed by df.dtypes for
# the frame loaded in this notebook — confirm which dataset this cell is meant for.
selected_features = [
    'age', 'job_cat', 'marital_cat', 'education_cat', 'default_cat',
    'housing_cat', 'loan_cat', 'contact_cat', 'day_of_week', 'month_cat',
    'duration', 'campaign', 'previous', 'pdays', 'poutcome_cat',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
]

# &nbsp;$\color{Red}{\text{References}}$

[1] https://altair-viz.github.io/getting_started/installation.html

[2] https://www.xenonstack.com/blog/data-visualization-techniques

[3] https://archive.ics.uci.edu/ml/datasets/Dry+Bean+Dataset

[4] Koklu, M., & Ozkan, I. A. (2020). Multiclass classification of dry beans using computer vision and machine learning techniques. Computers and Electronics in Agriculture, 174, 105507.

[5] https://colab.research.google.com/github/altair-viz/altair-tutorial/blob/master/notebooks/Index.ipynb

[6] https://online.hbs.edu/blog/post/data-visualization-techniques

[7] https://altair-viz.github.io/user_guide/compound_charts.html?highlight=facet

[8] https://cast42.github.io/blog/datascience/python/clustering/altair/shap/2020/04/23/explain-clusters-to-business.html

[9] https://altair-viz.github.io/gallery/index.html

[10] https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html

[11] https://www.youtube.com/watch?v=MnRskV3NY1k