=================================================

Non Graded Challenge: ML Problem Framing and Data Cleaning

Nama  : Gerwyn Zulqarnain
Batch : HCK-021

Program ini dibuat untuk menganalisis dan memperkirakan biaya yang harus dibayarkan untuk mendapatkan asuransi kesehatan.

=================================================

### Introduction

SMART
<br>
S : Menghitung estimasi harga untuk asuransi kesehatan<br>
M : Dengan menggunakan 6 kolom sebagai feature untuk mendapatkan target<br>
A : Menggunakan machine learning<br>
R : Dengan menggunakan machine learning maka kita dapat mengetahui estimasi harga dari asuransi kesehatan<br>
T : Dalam waktu 7 hari dapat menyelesaikan machine learning<br>

Problem statement:<br>
Kita ingin mengetahui estimasi harga asuransi kesehatan dengan memasukkan data dari 6 kolom sebagai feature, yang kemudian diproses oleh model machine learning untuk menghasilkan target.

#### Import Libraries

In [178]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split

#### Data Loading

In [179]:
# Load the insurance dataset directly from the FTDS learning-materials GitHub repository.
data = pd.read_csv('https://raw.githubusercontent.com/FTDS-learning-materials/datasets/main/insurance.csv')

### Exploratory Data Analysis

In [180]:
# Preview the first rows to see the available columns and sample values.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [181]:
# Show column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [182]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [183]:
# Count missing values per column.
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [184]:
# List the rows that are exact repeats of an earlier row.
data[data.duplicated()]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
581,19,male,30.59,0,no,northwest,1639.5631


### Feature Engineering

#### Duplicate Data

In [185]:
# Work on a copy so the originally loaded `data` frame stays unchanged.
data_duplicate = data.copy()

In [186]:
def duplicate_remove(data):
    """Return ``data`` without fully duplicated rows.

    The first occurrence of every duplicated row is kept (the default of
    ``DataFrame.drop_duplicates``). A new DataFrame is returned; the frame
    passed in is not modified.
    """
    return data.drop_duplicates()

#### Cardinality Check

In [187]:
def cardinality_check(data):
    """Summarise the cardinality of every object-dtype column of ``data``.

    Returns a DataFrame with one row per object-dtype column and three
    columns: the column name ('nama kolom'), its number of distinct values
    ('jumlah nilai unique') and the array of those values ('nilai unique').
    """
    # nunique() ignores missing values by default, while unique() lists them,
    # so the count can be one less than the length of the listed array.
    summary_rows = [
        [name, data[name].nunique(), data[name].unique()]
        for name in data.select_dtypes(include='object').columns
    ]
    return pd.DataFrame(
        data=summary_rows,
        columns=['nama kolom', 'jumlah nilai unique', 'nilai unique'],
    )

#### Data Splitting

In [188]:
def data_split(feature, target, size, random):
    """Split features and target into train and test subsets.

    Thin wrapper around ``train_test_split``.

    Args:
        feature: Feature matrix.
        target: Target values aligned with ``feature``.
        size: Forwarded as ``test_size``.
        random: Forwarded as ``random_state``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(feature, target, test_size=size, random_state=random)
    # train_test_split returns a list; callers of this helper receive a tuple.
    return tuple(parts)

#### Distribution Check

In [189]:
def skew_check(data):
    """Classify the distribution of every non-object column by its skewness.

    Labels are assigned from the magnitude of the (unrounded) skewness:
    at most 0.5 is "normal", strictly between 0.5 and 1 is
    "moderately skewed", anything else is "highly skewed".

    Returns a DataFrame with the column name ('nama kolom'), the skewness
    rounded to two decimals ('nilai skewness') and the label ('distribusi').
    """
    rows = []
    for name in data.select_dtypes(exclude='object').columns:
        skewness = data[name].skew()
        magnitude = abs(skewness)
        if magnitude <= 0.5:
            label = "normal"
        elif magnitude < 1:
            label = "moderately skewed"
        else:
            # Also reached when the skewness is NaN, since every comparison
            # above is then False.
            label = "highly skewed"
        rows.append([name, round(skewness, 2), label])
    return pd.DataFrame(rows, columns=['nama kolom', 'nilai skewness', 'distribusi'])

#### Outliers Check

In [190]:
def outliers_check(data):
    """Report outlier boundaries and the outlier share of each non-object column.

    For every column selected by ``select_dtypes(exclude='object')``:

    * if its skewness lies within [-0.5, 0.5], the boundaries are
      ``mean -/+ 3 * std``;
    * otherwise (including a NaN skewness) the boundaries are
      ``Q1 - 3 * IQR`` and ``Q3 + 3 * IQR``.

    The skewness is compared unrounded. Rounding it first would classify a
    column with a skewness such as 0.54 as "normal" and report boundaries
    that differ from the ones obtained with the unrounded rule.

    Args:
        data: DataFrame to inspect. It is not modified.

    Returns:
        DataFrame with the columns 'column', 'upper_boundary',
        'lower_boundary' and 'percentage_total_outlier', all numeric values
        rounded to two decimals. The percentage is the share of rows lying
        strictly outside the boundaries; it is 0.0 when ``data`` has no rows.
    """
    n_rows = len(data)
    columns = []
    upper_bounds = []
    lower_bounds = []
    outlier_percentages = []

    for col in data.select_dtypes(exclude='object').columns:
        values = data[col]
        skewness = values.skew()

        if -0.5 <= skewness <= 0.5:
            # Roughly symmetric: three standard deviations around the mean.
            lower = values.mean() - 3 * values.std()
            upper = values.mean() + 3 * values.std()
        else:
            # Skewed (or undefined skewness): 3 * IQR beyond the quartiles.
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            lower = q1 - (iqr * 3)
            upper = q3 + (iqr * 3)

        if n_rows:
            above = int((values > upper).sum())
            below = int((values < lower).sum())
            percentage = above / n_rows * 100 + below / n_rows * 100
        else:
            # Avoid dividing by zero on a frame without rows.
            percentage = 0.0

        columns.append(col)
        upper_bounds.append(round(upper, 2))
        lower_bounds.append(round(lower, 2))
        outlier_percentages.append(round(percentage, 2))

    return pd.DataFrame({
        'column': columns,
        'upper_boundary': upper_bounds,
        'lower_boundary': lower_bounds,
        'percentage_total_outlier': outlier_percentages,
    })


In [191]:
def handling_outliers(data):
    """Drop or cap outliers in every non-object column and return the result.

    Columns are processed one after another, each on the frame as left by
    the previous column. For each column the boundaries are
    ``mean -/+ 3 * std`` when its skewness lies within [-0.5, 0.5], and
    ``Q1 - 3 * IQR`` / ``Q3 + 3 * IQR`` otherwise. Rows strictly outside the
    boundaries are dropped when their share is below the threshold;
    otherwise the column is clipped to the boundaries.

    Args:
        data: DataFrame to clean. It is never modified; the function works
            on a copy.

    Returns:
        A new DataFrame with outliers dropped or capped. An empty input is
        returned as an (empty) copy.
    """
    # Work on a copy: assigning the clipped column below would otherwise
    # write into the caller's DataFrame.
    data = data.copy()
    if data.empty:
        # Nothing to evaluate, and the percentage below would divide by zero.
        return data

    for col in data.select_dtypes(exclude='object').columns:
        values = data[col]
        skewness = values.skew()

        if -0.5 <= skewness <= 0.5:
            lower_boundary = values.mean() - 3 * values.std()
            upper_boundary = values.mean() + 3 * values.std()
        else:
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            lower_boundary = q1 - (iqr * 3)
            upper_boundary = q3 + (iqr * 3)

        too_high = values > upper_boundary
        too_low = values < lower_boundary
        n_rows = len(data)
        totout = int(too_high.sum()) / n_rows * 100 + int(too_low.sum()) / n_rows * 100

        # NOTE(review): totout is expressed in percent, so this threshold is
        # 0.05 %, not 5 % - confirm that this is the intended cut-off.
        if totout < 0.05:
            data = data.drop(data[too_high | too_low].index)
        else:
            data[col] = np.clip(values, lower_boundary, upper_boundary)

    return data

#### Visualization

In [192]:
def diagnostic_plots(data):
    """Show a histogram and a boxplot side by side for each non-object column.

    One 16x4 figure is drawn per column. The histogram uses 30 bins and its
    title carries the column's skewness rounded to two decimals.
    """
    for name in data.select_dtypes(exclude='object').columns:
        series = data[name]
        _, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(16, 4))

        # Left panel: distribution shape.
        sns.histplot(series, bins=30, ax=hist_ax)
        hist_ax.set_title(f'Histogram \n {round(series.skew(),2)}')

        # Right panel: spread and extreme values.
        sns.boxplot(y=series, ax=box_ax)
        box_ax.set_title('Boxplot')

        plt.show()

#### Correlation Check

In [193]:
def correlation_check(data, target):
    """Measure how strongly each column of ``data`` correlates with ``target``.

    The test is chosen per column:

    * object-dtype columns: Kendall's tau;
    * other columns whose skewness (rounded to two decimals) lies within
      [-0.5, 0.5]: Pearson's r;
    * remaining columns: Spearman's rho.

    Every correlation is computed against ``data[target]``, so the function
    does not depend on any variable defined outside of it. A correlation is
    labelled 'korelasi kuat' when its rounded magnitude is at least 0.5, so
    strong negative relationships are reported as strong too; everything
    else (including NaN) is labelled 'korelasi lemah'.

    Args:
        data: DataFrame containing the candidate columns and the target.
        target: Name of the target column in ``data``.

    Returns:
        DataFrame with the columns 'nama kolom', 'nilai korelasi',
        'nilai p-value' (both rounded to two decimals) and 'keterangan'.
        Object-dtype columns come first, followed by the other columns; the
        target itself is skipped among the non-object columns.
    """
    target_values = data[target]

    def make_row(name, corr, pval):
        # Strength depends on the magnitude, not on the sign.
        corr = round(corr, 2)
        strong = 0.5 <= abs(corr) <= 1
        label = 'korelasi kuat' if strong else 'korelasi lemah'
        return [name, corr, round(pval, 2), label]

    rows = []

    for col in data.select_dtypes(include='object').columns:
        corr, pval = stats.kendalltau(data[col], target_values)
        rows.append(make_row(col, corr, pval))

    for col in data.select_dtypes(exclude='object').columns:
        if col == target:
            continue
        if -0.5 <= round(data[col].skew(), 2) <= 0.5:
            corr, pval = stats.pearsonr(data[col], target_values)
        else:
            corr, pval = stats.spearmanr(data[col], target_values)
        rows.append(make_row(col, corr, pval))

    return pd.DataFrame(
        columns=['nama kolom', 'nilai korelasi', 'nilai p-value', 'keterangan'],
        data=rows,
    )

In [194]:
# Keep only the columns used for modelling: four features plus the target `charges`.
# NOTE(review): duplicate_remove() is not applied anywhere in the visible code before
# this point, so the duplicated row found during EDA is still part of `df` - confirm
# whether that is intended.
df = data_duplicate[['smoker', 'age', 'bmi', 'children', 'charges']]

In [195]:
# Separate the features (X) from the target (y).
X = df.drop(['charges'], axis = 1)
y = df['charges']
# Hold out 20% of the rows as the test set; random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = data_split(X, y, 0.2, 0)

In [196]:
# Report the (rows, columns) shape of the train and test feature sets.
print('train size', X_train.shape)
print('Test size', X_test.shape)

train size (1070, 4)
Test size (268, 4)
