# FCA

Technical Challenge for Data Science Candidates

Data checking and prototyping

In [1]:
import numpy as np
import pandas as pd
import math
import json

from os import path

import scipy.stats as st
import statsmodels as sm
import statsmodels.api as smi

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

import matplotlib
from cycler import cycler
import matplotlib.pyplot as plt

# this is the local Utility module
from fca import Utility

# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before code is executed, so edits to the local `fca` module are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Show the pandas version this notebook was executed with.
pd.__version__

'0.24.2'

In [2]:
# Make Jupyter echo the value of every top-level expression in a cell,
# not only the last one (the default setting is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Helper object from the local fca.Utility module; the conversion steps in this
# notebook (str2cat, cat2code, code2scale, df2describe) are called on it.
i0 = Utility.instance()

In [4]:
# Load the raw data set; the file uses ';' as its field separator.
df0 = pd.read_csv(filepath_or_buffer="in.csv", sep=";")

In [5]:
# Column dtypes and non-null counts, followed by a preview of the first five rows.
df0.info()
df0.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null object
dtypes: float64(5), int64(5), object(11)
memory usage: 6.6+ MB

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [6]:
# Convert the frame with the project helper str2cat; per the info() output,
# every former `object` column comes back with dtype `category`.
df1 = i0.str2cat(df0)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null category
marital           41188 non-null category
education         41188 non-null category
default           41188 non-null category
housing           41188 non-null category
loan              41188 non-null category
contact           41188 non-null category
month             41188 non-null category
day_of_week       41188 non-null category
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null category
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null category
dtypes: category(11), float64(5), int64(5)

In [7]:
# Prototype of a random train/test labelling with equal probability per label.
# A dedicated, seeded RandomState makes the draw identical on every
# Restart & Run All and leaves NumPy's global random state untouched.
# NOTE(review): only 5 labels are drawn; labelling every row of the data set
# would need size=len(df1) -- confirm the intended size before `s` is used as
# a split mask.
split_rng = np.random.RandomState(42)
splitter = split_rng.choice([0, 1], 5, p=[0.5, 0.5])

# Map the integer codes onto named categories: 0 -> "train", 1 -> "test".
s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))

In [8]:
# Convert the frame with the project helper cat2code; per the info() output,
# every `category` column comes back as an int8 column.
df2 = i0.cat2code(df1)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null int8
marital           41188 non-null int8
education         41188 non-null int8
default           41188 non-null int8
housing           41188 non-null int8
loan              41188 non-null int8
contact           41188 non-null int8
month             41188 non-null int8
day_of_week       41188 non-null int8
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null int8
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null int8
dtypes: float64(5), int64(5), int8(11)
memory usage: 3.6 MB


In [9]:
# Standard deviation of the integer-coded target column, then a preview of
# the first five rows of the encoded frame.
df2["y"].describe().loc["std"]
df2.head(n=5)

0.3161734269429649

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,3,1,0,0,0,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
1,57,7,1,3,1,0,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
2,37,7,1,3,0,2,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
3,40,0,1,1,0,0,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
4,56,7,1,3,0,0,2,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0


In [10]:
# Rescale the encoded frame with the project helper code2scale; the describe()
# output further down shows mean ~0 and std ~1 for the checked columns, so this
# is presumably a standard-score scaling -- confirm against fca.Utility.
df3 = i0.code2scale(df2)
df3.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,1.533034,-0.201579,-0.283741,-1.753925,-0.5136,-1.087707,-0.452491,1.31827,0.762558,-0.718834,...,-0.565922,0.195414,-0.349494,0.192622,0.648092,0.722722,0.886447,0.71246,0.33168,-0.356309
1,1.628993,0.911227,-0.283741,-0.34973,1.945327,-1.087707,-0.452491,1.31827,0.762558,-0.718834,...,-0.565922,0.195414,-0.349494,0.192622,0.648092,0.722722,0.886447,0.71246,0.33168,-0.356309
2,-0.290186,0.911227,-0.283741,-0.34973,-0.5136,0.942127,-0.452491,1.31827,0.762558,-0.718834,...,-0.565922,0.195414,-0.349494,0.192622,0.648092,0.722722,0.886447,0.71246,0.33168,-0.356309
3,-0.002309,-1.036184,-0.283741,-1.28586,-0.5136,-1.087707,-0.452491,1.31827,0.762558,-0.718834,...,-0.565922,0.195414,-0.349494,0.192622,0.648092,0.722722,0.886447,0.71246,0.33168,-0.356309
4,1.533034,0.911227,-0.283741,-0.34973,-0.5136,-1.087707,2.31144,1.31827,0.762558,-0.718834,...,-0.565922,0.195414,-0.349494,0.192622,0.648092,0.722722,0.886447,0.71246,0.33168,-0.356309


In [11]:
# Summary statistics of two scaled columns, stacked into a single Series
# (the `age` block first, then the `job` block).
x0, x1 = (df3[column].describe() for column in ("age", "job"))
pd.concat([x0, x1])

count    4.118800e+04
mean     2.846557e-16
std      1.000012e+00
min     -2.209365e+00
25%     -7.699804e-01
50%     -1.942267e-01
75%      6.694038e-01
max      5.563310e+00
count    4.118800e+04
mean    -3.143338e-15
std      1.000012e+00
min     -1.036184e+00
25%     -1.036184e+00
50%     -4.797808e-01
75%      9.112268e-01
max      2.024033e+00
dtype: float64

In [87]:
# Summarise the scaled frame with the project helper df2describe; per the
# displayed result it is a long-format table with one row per (column name,
# statistic) pair, held in the columns `name`, `q` and `v`.
ds = i0.df2describe(df3)
ds

Unnamed: 0,index,name,q,v
0,0,age,count,4.118800e+04
1,1,age,mean,2.846557e-16
2,2,age,std,1.000012e+00
3,3,age,min,-2.209365e+00
4,4,age,25%,-7.699804e-01
5,5,age,50%,-1.942267e-01
6,6,age,75%,6.694038e-01
7,7,age,max,5.563310e+00
8,0,job,count,4.118800e+04
9,1,job,mean,-3.143338e-15
