In [1]:
# Import Modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy.stats as st
# from scipy.stats import norm, chi2
# import statsmodels.api as sm

# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/13222067_cleaned.csv")
# Bare last expression so the notebook renders a preview of the first rows.
df.head()

Unnamed: 0,jenis_kelamin,usia,pendidikan,pekerjaan,penghasilan,domisili,durasi_internet,aktivitas_online_meningkat,aktivitas_online_meningkat_3bln,layanan_aktif_1,...,bayar_online_5,bayar_online_6,bayar_online_7,bayar_online_8,keluhan_online_1,keluhan_online_2,keluhan_online_3,keluhan_online_4,keluhan_online_5,keluhan_online_6
0,Pria,47,SMA,Pekerja Serabutan,Rp 2 juta – Rp 5 juta,bandung,3.0,Sama saja,,Mobile Banking,...,,,Melalui minimarket,,Barang yang diperoleh tidak sesuai dengan spes...,Barang rusak/ salah tetapi tidak dapat dikemba...,,,,
1,Wanita,19,SMA,Pelajar / Mahasiswa,< Rp 2 juta,surabaya,10.0,Ya,Keperluan mengerjakan tugas,Mobile Banking,...,,,,,,,,,,
2,Pria,50,S1,Karyawan Swasta,Rp 5 juta – Rp 10 juta,jakarta,6.0,Ya,Melakukan video conference,Mobile Banking,...,,,,,,,,,,
3,Wanita,19,SMA,Pelajar / Mahasiswa,< Rp 2 juta,bandung,5.0,Ya,Mengikuti kelas online,Mobile Banking,...,,,,,Barang yang diperoleh tidak sesuai dengan spes...,Barang rusak/ salah tetapi tidak dapat dikemba...,,,,
4,Pria,28,S1,Karyawan Swasta,Rp 5 juta – Rp 10 juta,bandung,9.0,Sama saja,,Mobile Banking,...,Transfer via ATM,,,,,,Pembayaran sudah dilakukan; barang tidak tersedia,,Pembayaran telah dilakukan tetapi tidak terdet...,


In [2]:

def countToSeries(df, column, normalize=True, threshold=0, otherLabel="lain"):
    """Count the values of one column, merging rare values into one bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column whose values are counted.
    normalize : bool, default True
        Forwarded to ``Series.value_counts``; when True the result holds
        relative frequencies instead of raw counts.
    threshold : float, default 0
        Share of the total, expressed in percent. Values whose share is
        strictly below it are merged into ``otherLabel``.
    otherLabel : str, default "lain"
        Index label used for the merged bucket.

    Returns
    -------
    pandas.Series
        Counts (or frequencies) indexed by value, in ``value_counts`` order,
        with the merged bucket placed where the first rare value appeared.
    """
    value_count = df[column].value_counts(normalize=normalize)
    total = value_count.sum()
    cutoff = threshold / 100

    merged = {}
    for idx, val in value_count.items():
        if val / total < cutoff:
            # Every rare value must be added to the bucket, including the
            # first one; initialising the bucket to 0 would drop it.
            merged[otherLabel] = merged.get(otherLabel, 0) + val
        else:
            merged[idx] = val
    return pd.Series(merged)

def multipleChoiceToSeries(df, column_list, normalize=True):
    """Summarise a multiple-choice question spread over several columns.

    For every column in ``column_list`` the most frequent non-null value is
    used as the label and its number of occurrences as the count.
    (Presumably each column holds a single option's text where the respondent
    ticked it and NaN otherwise - confirm against the cleaning step.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns.
    column_list : list of str
        Columns that together encode one multiple-choice question.
    normalize : bool, default True
        When True, counts are divided by the number of rows in ``df``.

    Returns
    -------
    pandas.Series
        Count (or share of rows) per label. Columns without any non-null
        value contribute nothing; an empty ``column_list`` gives an empty
        Series.
    """
    # Row count of the frame; does not depend on which column is inspected,
    # so an empty column_list no longer raises.
    total = len(df)

    value_counts = {}
    for col in column_list:
        value_count = df[col].value_counts(dropna=True)
        if value_count.empty:
            # No non-null entry means there is no label to report; indexing
            # position 0 here would raise IndexError.
            continue
        key = value_count.index[0]
        val = value_count.iloc[0]
        value_counts[key] = val / total if normalize else val
    return pd.Series(value_counts)

In [None]:

# Confidence Interval One Sample
def ci_p(phat, n, alpha=0.05, p0=None):
    """Wald confidence interval for a single proportion, as a text report.

    Parameters
    ----------
    phat : float
        Sample proportion.
    n : int
        Sample size.
    alpha : float, default 0.05
        Significance level; the interval has confidence ``1 - alpha``.
    p0 : float, optional
        Hypothesised proportion. H0 is rejected when ``p0`` lies outside the
        interval. When omitted, ``phat`` itself is compared with the
        interval (the behaviour of the original code), which yields
        "Fail to Reject H0" whenever the margin of error is positive.

    Returns
    -------
    str
        Report with the critical z-value, the interval bounds and the
        conclusion.
    """
    z_halfAlpha = st.norm.ppf(1 - (alpha / 2))
    # Margin of error = critical z * standard error (a product, not a call).
    marginErr = z_halfAlpha * ((phat * (1 - phat) / n) ** 0.5)
    lb = phat - marginErr
    ub = phat + marginErr

    tested = phat if p0 is None else p0
    if lb < tested < ub:
        conclusion = "Fail to Reject H0"
    else:
        conclusion = "Reject H0"

    ans = "Z-Value: {}\nConfidence Interval: {} < p < {}\nConclusion: {}".format(z_halfAlpha, lb, ub, conclusion)
    return ans

# Confidence Interval One Sample (Series)
def ci_p_series(phat_series, n, alpha=0.05):
    """Wald confidence bounds for every proportion in a Series.

    Parameters
    ----------
    phat_series : pandas.Series
        Sample proportions, one per index label.
    n : int
        Sample size shared by all proportions.
    alpha : float, default 0.05
        Significance level; the intervals have confidence ``1 - alpha``.

    Returns
    -------
    tuple
        ``(critical_z, table)`` where ``table`` is a DataFrame holding the
        former index plus the columns "Proportion", "Lower" and "Upper".
    """
    critical_z = st.norm.ppf(1 - (alpha / 2))

    # Half-width of the interval for each label.
    half_widths = {
        label: critical_z * ((share * (1 - share) / n) ** 0.5)
        for label, share in phat_series.items()
    }
    lower = pd.Series(
        {label: share - half_widths[label] for label, share in phat_series.items()},
        name="Lower",
    )
    upper = pd.Series(
        {label: share + half_widths[label] for label, share in phat_series.items()},
        name="Upper",
    )

    table = pd.concat([phat_series.rename("Proportion"), lower, upper], axis=1)
    return (critical_z, table.reset_index())

# Chi Square Goodness of Fit
def chiSqGoF(series, alpha=0.05):
    """Chi-square goodness-of-fit test against a uniform distribution.

    H0: every category in ``series`` is equally likely, i.e. each expected
    count equals the total divided by the number of categories.

    Parameters
    ----------
    series : pandas.Series
        Observed counts, one per category.
    alpha : float, default 0.05
        Significance level.

    Returns
    -------
    str
        Report with the p-value, the chi-square statistic, the critical
        value and the conclusion.
    """
    # Calculating
    n = len(series)
    nsum = series.sum()
    Ei = nsum / n  # expected count per category under uniformity
    chisq = sum((Oi - Ei) ** 2 / Ei for _, Oi in series.items())

    dof = n - 1
    chi2_alpha = st.chi2.ppf(1 - alpha, dof)
    # Survival function = 1 - cdf, but keeps precision for tiny p-values.
    pval = st.chi2.sf(chisq, dof)

    # Conclusion: a statistic inside the critical region (above the critical
    # value) is evidence against H0.
    if chisq > chi2_alpha:
        conclusion = "Reject H0"
    else:
        conclusion = "Fail to Reject H0"
    ans = "P-Value: {}\nchi2: {}\n Critical Region: chi2 > {}\nConclusion: {}".format(pval, chisq, chi2_alpha, conclusion)

    return ans

# Two-Sample Z-Test for Equality of Proportions
def ci_p2(x1, x2, n1, n2, alpha=0.05):
    """Two-sided z-test of H0: p1 == p2 using the pooled proportion.

    Parameters
    ----------
    x1, x2 : int
        Number of successes in sample 1 and sample 2.
    n1, n2 : int
        Sizes of sample 1 and sample 2.
    alpha : float, default 0.05
        Significance level.

    Returns
    -------
    str
        Report with the z statistic, the acceptance region (the critical
        z-values, not bounds on ``p1 - p2``) and the conclusion.

    Notes
    -----
    A pooled proportion of exactly 0 or 1 makes the standard error zero, so
    the statistic is undefined (plain Python numbers raise
    ZeroDivisionError in that case).
    """
    phat = (x1 + x2) / (n1 + n2)  # pooled proportion under H0
    phat1 = x1 / n1
    phat2 = x2 / n2
    z_halfAlpha = st.norm.ppf(1 - (alpha / 2))
    z = (phat1 - phat2) / (phat * (1 - phat) * (1 / n1 + 1 / n2)) ** 0.5

    lb = -z_halfAlpha
    ub = z_halfAlpha
    # The decision is about the test statistic z, not the pooled proportion.
    if lb < z < ub:
        conclusion = "Fail to Reject H0"
    else:
        conclusion = "Reject H0"

    ans = "Z-Value: {}\nAcceptance Region: {} < z < {}\nConclusion: {}".format(z, lb, ub, conclusion)
    return ans

# Chi Square Independence
def chiSqTest(df_cont, alpha=0.05): # Basically, Is the value of A affects B?
    Ei = df_cont.copy()
    for s in df_cont.index:
        for o in df_cont.columns:
            Ei.loc[s,o] = df_cont.loc[s].sum() * df_cont.loc[:,o].sum() / df_cont.values.sum()
        
    row, col = df_cont.shape
    df = (row-1)*(col-1)
    chi2 = ((df_cont - Ei)**2/Ei).sum().sum()
    chi2_alpha = st.chi2.ppf(1-alpha, df)
    pval = 1 - st.chi2.cdf(chi2, df)

    # Conclusion
    if(chi2 > chi2_alpha):
        conclusion = "Fail to Reject H0"
    else:
        conclusion = "Reject H0"
    ans = "P-Value: {}\nchi2: {}\n Critical Region: chi2 > {}\nConclusion: {}".format(pval, chi2, chi2_alpha, conclusion)