# FCA

Technical Challenge for Data Science Candidates

This notebook loads the data for the Bank example.
It encodes the data numerically so that it can be checked with correlations. Images are produced in another notebook.

In [1]:
import numpy as np
import pandas as pd
import math
import json

from os import path

import scipy.stats as st
import statsmodels as sm
import statsmodels.api as smi

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

pd.__version__

'0.24.2'

In [2]:
# this is the local Utility module
from fca import Utility

In [3]:
# Reload the local `fca` package automatically so that edits to it are picked
# up without restarting the kernel.
%load_ext autoreload
# Mode 1: only modules registered with %aimport are reloaded.
%autoreload 1
%aimport fca

In [4]:
# Display the value of every top-level expression in a cell, not only the
# last one, so that several results can be shown from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
# Shared helper object from the local `fca` module (a singleton, per the
# author's note). Later cells call its str2cat, cat2code, code2scale,
# df2describe and nzv methods.
i0 = Utility.instance()

In [19]:
# Read the raw data for the Bank example; the file is semicolon-delimited.
df0 = pd.read_csv(
    "in.csv",
    sep=";",
)

## Data manipulation

In [7]:
# Column dtypes and non-null counts, followed by the first five rows.
df0.info()
df0.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null object
dtypes: float64(5), int64(5), object(11)
memory usa

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [8]:
## Heuristic steps
# Looking ahead, the correlations and density plots suggest the preparation steps that follow.

In [8]:
# Turn the string (object) columns into pandas categoricals, then inspect
# the resulting dtypes and the first five rows.
df1 = i0.str2cat(df0)
df1.info()
df1.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null category
marital           41188 non-null category
education         41188 non-null category
default           41188 non-null category
housing           41188 non-null category
loan              41188 non-null category
contact           41188 non-null category
month             41188 non-null category
day_of_week       41188 non-null category
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null category
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null category
dtypes: category(11), float64

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


# Visualisation output
The following will convert the dataset to fully numeric form. With that, it can be correlated and viewed in another notebook.

In [15]:
# Encode the categorical columns as numbers.
# A copy is passed so that re-running this cell always starts from the
# categorical frame, whether or not the helper modifies its argument.
# NOTE(review): the recorded df2.head() output shows standardised floats
# rather than integer codes, which suggests the helpers mutate their input
# in place -- confirm in fca.Utility.
df2 = i0.cat2code(df1.copy())

df2.info()
df2.head()

# Export the coded data for visualisation elsewhere (e.g. in R).
df2.to_csv("coded.csv", index=False)
df2.to_pickle("coded.pickle")

# Standardise: centre each column and scale it to unit variance.
# Use StandardScaler(with_std=False) instead to centre only.
# Again a copy is passed, so df2 keeps the coded values written above.
df3 = i0.code2scale(df2.copy(), scaler0=StandardScaler(with_std=True))
df3.to_pickle("scaled.pickle")

# Visualisation and models are in other notebooks.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null float64
job               41188 non-null float64
marital           41188 non-null float64
education         41188 non-null float64
default           41188 non-null float64
housing           41188 non-null float64
loan              41188 non-null float64
contact           41188 non-null float64
month             41188 non-null float64
day_of_week       41188 non-null float64
duration          41188 non-null float64
campaign          41188 non-null float64
pdays             41188 non-null float64
previous          41188 non-null float64
poutcome          41188 non-null float64
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null float64
dtypes: float64(21)
memory usa

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,1.533034,-0.201579,-0.283741,-1.753925,-0.5136,-1.087707,-0.452491,1.31827,0.762558,-0.718834,...,-0.565922,0.195414,-0.349494,0.192622,0.648092,0.722722,0.886447,0.71246,0.33168,-0.356309
1,1.628993,0.911227,-0.283741,-0.34973,1.945327,-1.087707,-0.452491,1.31827,0.762558,-0.718834,...,-0.565922,0.195414,-0.349494,0.192622,0.648092,0.722722,0.886447,0.71246,0.33168,-0.356309
2,-0.290186,0.911227,-0.283741,-0.34973,-0.5136,0.942127,-0.452491,1.31827,0.762558,-0.718834,...,-0.565922,0.195414,-0.349494,0.192622,0.648092,0.722722,0.886447,0.71246,0.33168,-0.356309
3,-0.002309,-1.036184,-0.283741,-1.28586,-0.5136,-1.087707,-0.452491,1.31827,0.762558,-0.718834,...,-0.565922,0.195414,-0.349494,0.192622,0.648092,0.722722,0.886447,0.71246,0.33168,-0.356309
4,1.533034,0.911227,-0.283741,-0.34973,-0.5136,-1.087707,2.31144,1.31827,0.762558,-0.718834,...,-0.565922,0.195414,-0.349494,0.192622,0.648092,0.722722,0.886447,0.71246,0.33168,-0.356309


In [10]:
# Check the statistics
# Summarise df2 with the local describe helper. The trailing ';' keeps the
# result from being displayed; it is only stored in `ds`.
ds = i0.df2describe(df2)
ds;

In [11]:
# Column names of the coded frame, as a plain Python list.
df2.columns.tolist()

['age',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'emp.var.rate',
 'cons.price.idx',
 'cons.conf.idx',
 'euribor3m',
 'nr.employed',
 'y']

In [12]:
## Some investigation of the Near-Zero Variance features
# thresh1 = p * (1 - p) with p = thresh0, i.e. the variance of a 0/1 variable
# whose value 1 occurs with frequency p (0.7 * 0.3 = 0.21).
# presumably i0.nzv returns the names of the columns whose variance falls
# below `thresh` -- TODO confirm in fca.Utility.
# NOTE(review): df3 is built with StandardScaler(with_std=True), which gives
# every column unit variance, yet the recorded describe output for df3 shows
# std ~0.41 for `default`. The recorded result looks like it came from a
# centred-only (with_std=False) run -- confirm which frame this should use.
thresh0 = 0.7
thresh1 = thresh0 * (1 - thresh0)
nzv0 = i0.nzv(df3, thresh=thresh1)

In [13]:
# Describe df3 and keep only the rows belonging to the columns listed in nzv0.
ds = i0.df2describe(df3)
ds.loc[ds['name'].isin(nzv0)]

Unnamed: 0,name,q,v
32,default,count,41188.0
33,default,mean,3.067676e-14
34,default,std,0.4066865
35,default,min,-0.2088715
36,default,25%,-0.2088715
37,default,50%,-0.2088715
38,default,75%,-0.2088715
39,default,max,1.791128
112,poutcome,count,41188.0
113,poutcome,mean,-1.580763e-14


# What is `default`?

Is `default` a historical variable? If so, it and the outcome `y` might be conditional on one another.

In [29]:
# True when no row has both default == 'yes' and y == 'yes'.
# Uses a vectorised pandas reduction instead of the Python built-in all(),
# which iterates over the Series element by element.
not ((df0['default'] == 'yes') & (df0['y'] == 'yes')).any()

True

In [30]:
# True when no row has both default == 'yes' and y == 'yes'.
# Uses a vectorised pandas reduction instead of the Python built-in all(),
# which iterates over the Series element by element.
# NOTE(review): this repeats the check in the preceding cell exactly; a
# different condition may have been intended -- confirm or remove.
not ((df0['default'] == 'yes') & (df0['y'] == 'yes')).any()

True