# Data Inspections

Brief inspections of the data.

# Goal

To check a summary of the data.

# Plan

Checklist of what will be done in this notebook:

    [*] Create an identifier column (if it does not already exist)
    [*] Columns and Rows Counts
    [*] Target Descriptions
    [*] Possible Variables
    [*] Train and Test separation - 85:15


In [1]:
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
# Load the project configuration; the path is relative to the notebook's working directory.
# NOTE(review): yaml.FullLoader can build more than plain data types. If config.yaml is
# ever not fully trusted, switch to yaml.safe_load.
with open("../../config.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

PATH_RAW = config['path']['raw']
PATH_INTERIM = config['path']['interim']
PATH_PROCESSED = config['path']['processed']
# Artifacts go into a sub-folder named after the current working directory.
# os.path.basename uses the platform's own separator; splitting on '\\' only
# produced the folder name on Windows and returned the full path elsewhere.
PATH_MODEL = config['path']['artifact'] + os.path.basename(os.getcwd()) + '/'
PATH_UDF = config['path']['udf']

## Get Data

In [3]:
# df = pd.read_csv(PATH_RAW+'test-cookie.csv')
# Open the raw workbook once; individual sheets are parsed from this handle in later cells.
df_excel = pd.ExcelFile(PATH_RAW+'bank-full.xlsx')

In [4]:
# List the worksheet names available in the workbook.
df_excel.sheet_names

['bank-names', 'bank-full']

In [5]:
# Show the 'bank-names' sheet; header=None keeps its first row as data instead of column names.
df_excel.parse('bank-names', header=None)

Unnamed: 0,0
0,1 - age (numeric)
1,"2 - job : type of job (categorical: ""admin."",""..."
2,"blue-collar,""self-employed"",""retired"",""technic..."
3,"3 - marital : marital status (categorical: ""ma..."
4,"4 - education (categorical: ""unknown"",""seconda..."
5,"5 - default: has credit in default? (binary: ""..."
6,"6 - balance: average yearly balance, in euros ..."
7,"7 - housing: has housing loan? (binary: ""yes"",..."
8,"8 - loan: has personal loan? (binary: ""yes"",""no"")"
9,# related with the last contact of the current...


In [6]:
# Read the 'bank-full' sheet into a DataFrame (first row is used as the header).
raw_data = df_excel.parse(sheet_name='bank-full')

## Data Summary

In [7]:
# (rows, columns) of the parsed sheet.
raw_data.shape

(45211, 17)

In [8]:
# Per-column dtypes: int64 columns are numerical candidates, object columns categorical ones.
raw_data.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [9]:
## 9 most likely Categorical (object dtype, excluding the target `y`)
## 7 most likely Numerical (int64 dtype)

In [10]:
## Unique identifier: a 0-based running number, one per row
raw_data = raw_data.assign(ids=range(len(raw_data)))

## Target Descriptions

In [11]:
# Relative frequency of each class of the target column `y`.
raw_data['y'].value_counts(normalize = True) ## --> two classes: a binary classification problem

no     0.883015
yes    0.116985
Name: y, dtype: float64

In [12]:
## Imbalanced dataset: roughly 88% `no` vs 12% `yes`

## Possible Features

Numerical
* age
* balance
* day
* duration
* pdays

Categorical
* job
* contact
* education
* marital (marital status)
* default (has credit in default?)

In [14]:
# Keep only the identifier, the shortlisted features and the target.
raw_data2 = raw_data.loc[:, [
    'ids',
    'age', 'balance', 'day', 'duration', 'pdays',          # numerical features
    'job', 'contact', 'education', 'marital', 'default',   # categorical features
    'y',                                                    # target
]]

In [15]:
# Hold out 15% of the rows as the test set (85:15 split). The fixed random_state
# makes the draw reproducible. The size is derived from the frame being sampled.
raw_tes = raw_data2.sample(n=int(len(raw_data2) * 0.15), random_state=134)
# Training set = every row whose identifier was not drawn into the test set.
raw_trn = raw_data2.loc[~raw_data2['ids'].isin(raw_tes['ids'])]

# Sanity check: the two splits must partition the data (relies on `ids` being unique).
assert len(raw_trn) + len(raw_tes) == len(raw_data2), "train/test split does not partition the data"

In [16]:
# Persist both splits to the interim folder; the DataFrame index is not written.
raw_tes.to_csv(path_or_buf=PATH_INTERIM + 'data_tes_iter2.csv', index=False)
raw_trn.to_csv(path_or_buf=PATH_INTERIM + 'data_trn_iter2.csv', index=False)