In [None]:
import datetime
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly as py
from ipywidgets import widgets
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as ss
from statsmodels.graphics.mosaicplot import mosaic

import myutilities as m

# Invoke the notebook helper from the local `myutilities` module.
# NOTE(review): presumably this renders a show/hide toggle for the code cell
# in the notebook output -- confirm against myutilities.hide_toggle.
m.hide_toggle()

# Default of Credit Card Clients Dataset
There are 25 variables:

* ID: ID of each client
* LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
* SEX: Gender (1=male, 2=female)
* EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
* MARRIAGE: Marital status (1=married, 2=single, 3=others)
* AGE: Age in years
* PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above)
* PAY_2: Repayment status in August, 2005 (scale same as above)
* PAY_3: Repayment status in July, 2005 (scale same as above)
* PAY_4: Repayment status in June, 2005 (scale same as above)
* PAY_5: Repayment status in May, 2005 (scale same as above)
* PAY_6: Repayment status in April, 2005 (scale same as above)
* BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
* BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
* BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
* BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
* BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
* BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
* PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
* PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
* PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
* PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
* PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
* PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
* default.payment.next.month: Default payment (1=yes, 0=no)

In [None]:
# Load the "Default of Credit Card Clients" dataset from the local CSV file.
df = pd.read_csv('input/UCI_Credit_Card.csv')

# Decode the binary target (1/0) into readable labels.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `df` when Copy-on-Write is enabled.
dict_default = {1: "Sì", 0: "No"}
df["default.payment.next.month"] = df["default.payment.next.month"].replace(dict_default)

# Keep only the protected attributes and the target. The explicit .copy()
# makes this an independent frame, so adding the 'AGE-bin' column below does
# not raise SettingWithCopyWarning.
df = df[['SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'default.payment.next.month']].copy()

# Bucket AGE into three bands. pd.cut uses right-closed intervals by default,
# i.e. (0, 25], (25, 40], (40, 100].
bins = [0, 25, 40, 100]
labels = ["1-25", "25-40", "40-100"]
df['AGE-bin'] = pd.cut(df['AGE'], bins=bins, labels=labels)
df.head()


In [None]:
# Human-readable labels for the integer-coded categorical columns.
cat_decoding_dict = {
    "SEX": {1: "Maschio", 2: "Femmina"},
    "EDUCATION": {1: "graduate school", 2: "university", 3: "high school", 4: "others", 5: "unknown", 6: "unknown"},
    "MARRIAGE": {1: "married", 2: "single", 3: "others"},
}

protected_cat = ['SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'AGE-bin']
target_column = 'default.payment.next.month'

# For every protected attribute: plot its frequency distribution, then report
# the disproportion index of the attribute alone and of the attribute crossed
# with the target column.
for category in protected_cat:
    # Only the coded columns have a decoding table (AGE and AGE-bin do not).
    # Assign the result back rather than using a chained inplace replace,
    # which is deprecated and ineffective under pandas Copy-on-Write.
    if category in cat_decoding_dict:
        df[category] = df[category].replace(cat_decoding_dict[category])

    # Occurrences of each unique value, as a Series indexed by value.
    s = df[category].value_counts()

    # Compare strings by value: `is not 'AGE'` tests object identity, which
    # only works by accident of interning and raises SyntaxWarning on 3.8+.
    if category != 'AGE':
        m.bar_plot(s, category)
    else:
        # Raw ages have many distinct values, so draw a colour-scaled bar
        # chart with plotly instead of the shared bar_plot helper.
        count = pd.DataFrame({'Età': s.index, 'Individui': s.values})
        plot = px.bar(count, x='Età', y='Individui', color='Individui', color_continuous_scale="Viridis")
        fig = go.Figure(data=plot)
        fig.update_layout(title_text="Frequenza Per Età")
        fig.show()

    m.disproportion_index(s, category)

    # Group sizes for every (attribute value, target value) combination.
    raceXscore = df.groupby([category, target_column]).size()
    m.disproportion_index(raceXscore, '{} + {}'.format(category, target_column))

## Chi-Square test
The Chi Square statistic is commonly used for testing relationships between categorical variables. 
The null hypothesis of the Chi-Square test is that no relationship exists between the categorical variables in the population (they are independent). The Chi-Square statistic is most commonly used to evaluate Tests of Independence when using a crosstabulation (also known as a bivariate table).  

Crosstabulation presents the distributions of two categorical variables simultaneously, with the intersections of the categories of the variables appearing in the cells of the table. The Test of Independence assesses whether an association exists between the two variables by comparing the observed pattern of responses in the cells to the pattern that would be expected if the variables were truly independent of each other. Calculating the Chi-Square statistic and comparing it against a critical value from the Chi-Square distribution allows the researcher to assess whether the observed cell counts are significantly different from the expected cell counts.

### p-value 
The p-value gives the probability of observing an association at least as strong as the one found purely by chance, assuming the variables are actually independent (association strength = 0). A very small p-value, usually lower than 0.05, tells us that the null hypothesis is rejected, so the considered variables are regarded as related rather than independent.

### Pearson standardized residuals 
The strength of a relation can be thought of as a measure of how much the observed values deviate from the values expected under independence (independence = the complete absence of any kind of relation among the variables). The Pearson standardized residuals measure the departure of each cell from independence.

In [None]:
# Call the chi2_pearson helper once per coded categorical attribute, pairing
# each with the default-payment target column.
# NOTE(review): presumably this runs the chi-square test and shows the Pearson
# residuals described in the text above -- confirm in myutilities.
chi2_target = 'default.payment.next.month'
for attribute in ('SEX', 'EDUCATION', 'MARRIAGE'):
    m.chi2_pearson(attribute, df, chi2_target)