In [1]:
import pandas as pd 

def load_data(fname):
    """
    Read a CSV file into a DataFrame and print its shape.

    Parameters
    ----------
    fname : str
        Path of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The rows and columns parsed from the file.
    """
    frame = pd.read_csv(fname)

    # Report (n_rows, n_columns) so the cell output shows what was loaded.
    print(f"Data Shape: {frame.shape}")
    return frame

# Relative path of the raw CSV that load_data reads.
FNAME = './data/raw/credit_risk_dataset.csv'

# Read the CSV into a DataFrame; load_data also prints its shape.
data = load_data(fname = FNAME)

# Last expression of the cell: display the first rows as a quick sanity check.
data.head()

Data Shape: (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [2]:
def split_input_output(data, target_col):
    """
    Separate a DataFrame into a feature table (X) and a target column (y).

    The input frame is not modified. The shapes of the input, the features
    and the target are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the feature columns and the target column.
    target_col : str
        Name of the column to use as the target.

    Returns
    -------
    X : pandas.DataFrame
        ``data`` without the ``target_col`` column.
    y : pandas.Series
        The ``target_col`` column of ``data``.
    """
    # Pick out the target first, then build the feature table without it.
    y = data[target_col]
    X = data.drop(columns = target_col)

    print(f'Original data shape: {data.shape}')
    print(f'X data shape       : {X.shape}')
    print(f'y data shape       : {y.shape}')

    return X, y


# Name of the column used as the prediction target.
TARGET_COL = 'loan_status'

# X keeps every column except TARGET_COL; y is the TARGET_COL column.
X, y = split_input_output(data = data, target_col = TARGET_COL)


Original data shape: (32581, 12)
X data shape       : (32581, 11)
y data shape       : (32581,)


In [3]:
from sklearn.model_selection import train_test_split

def split_train_test(X, y, test_size, random_state = None):
    """
    Split features and target into two partitions, stratified on the target.

    The shape of each of the four resulting pieces is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : pandas.Series
        Target values; also passed as ``stratify`` to ``train_test_split``.
    test_size : float
        Forwarded to ``train_test_split`` as the size of the second partition.
    random_state : int, optional
        Seed forwarded to ``train_test_split`` so the split is repeatable.
        Defaults to ``None``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The pieces returned by ``train_test_split``, in that order.
    """
    pieces = train_test_split(
        X,
        y,
        test_size = test_size,
        random_state = random_state,
        stratify = y,
    )
    X_train, X_test, y_train, y_test = pieces

    # Report the shape of every piece, in the same order as they are returned.
    labelled = (
        ('X train shape:', X_train),
        ('X test shape :', X_test),
        ('y train shape:', y_train),
        ('y test shape :', y_test),
    )
    for label, piece in labelled:
        print(label, piece.shape)

    return X_train, X_test, y_train, y_test
    


In [4]:
# First split: test_size = 0.2 sets aside the hold-out part (X_not_train, y_not_train); the rest is the training set.
X_train, X_not_train, y_train, y_not_train = split_train_test(X = X, y = y, test_size = 0.2, random_state = 42)


X train shape: (26064, 11)
X test shape : (6517, 11)
y train shape: (26064,)
y test shape : (6517,)


In [5]:
# Second split: divide the hold-out part in half into validation and test sets.
# NOTE: split_train_test prints fixed "train"/"test" labels, so in this cell's
# output "train" is the validation set and "test" is the test set.
X_valid, X_test, y_valid, y_test = split_train_test(X = X_not_train, y = y_not_train, test_size = 0.5, random_state = 42)

X train shape: (3258, 11)
X test shape : (3259, 11)
y train shape: (3258,)
y test shape : (3259,)


In [6]:
import joblib


def serialize_data(data, path):
    """
    Write an object to disk with ``joblib.dump``.

    Parameters
    ----------
    data : object
        The object to store (this notebook passes DataFrames and Series).
    path : str
        Destination file path.

    Returns
    -------
    list of str
        The value returned by ``joblib.dump``; in this notebook it is a
        one-element list containing ``path``.
    """
    written_files = joblib.dump(data, path)
    return written_files

In [7]:
# Save the training features under data/interim.
serialize_data(X_train, './data/interim/X_train.pkl')


['./data/interim/X_train.pkl']

In [8]:
# Save the training target under data/interim.
serialize_data(y_train, './data/interim/y_train.pkl')


['./data/interim/y_train.pkl']

In [9]:
# Save the test features under data/interim.
serialize_data(X_test, './data/interim/X_test.pkl')


['./data/interim/X_test.pkl']

In [10]:
# Save the test target under data/interim.
serialize_data(y_test, './data/interim/y_test.pkl')


['./data/interim/y_test.pkl']

In [11]:
# Save the validation features under data/interim.
serialize_data(X_valid, './data/interim/X_valid.pkl')


['./data/interim/X_valid.pkl']

In [12]:
# Save the validation target under data/interim.
serialize_data(y_valid, './data/interim/y_valid.pkl')

['./data/interim/y_valid.pkl']

In [13]:
def deserialize_data(path):
    """
    Load an object from disk with ``joblib.load``.

    Parameters
    ----------
    path : str
        Location of the file to read.

    Returns
    -------
    object
        The deserialized object stored in the file (a DataFrame or Series
        for the files written by this notebook).
    """
    # NOTE(review): joblib.load is pickle-based, so only load files you trust.
    return joblib.load(path)