In [2]:
import sys
import os
sys.path.append(os.path.abspath(".."))


import pandas as pd

In [4]:
def load_data(fname):
    """
    The function to load dataset from CSV file.

    Parameters
    -----------
    fname: str
        File CSV location.

    Returns
    -------
    data: pandas.DataFrame
        Dataset that already loaded from CSV file.
    """

    data = pd.read_csv(fname)
    print(f"Data Shape:{data.shape}")
    return data

In [5]:
FNAME = "data/raw/credit_risk_dataset.csv"
data = load_data(FNAME)
data.head()

Data Shape:(32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [15]:
def split_input_output(data, target_col):
    """
    The function to split dataset into input (X) and ouput (y).

    Parameters
    ----------
    data: pandas.DataFrame
        Whole dataset that contain feature and target.
        
    target_col: str
        Target column name on dataset.

    Returns
    -------
    X: pandas.Dataframe
        Input dataset that contains feature.
    y: pandas.Series
        Output dataset that contains target.
    """

    X = data.drop(columns=[target_col])
    y = data[target_col]

    print(f"Original data shape: {data.shape}")
    print(f"X data shape: {X.shape}")
    print(f"y data shape: {y.shape}")

    return X, y

In [19]:
TARGET_COL = "loan_status"

In [21]:
X, y = split_input_output(data, TARGET_COL)

Original data shape: (32581, 12)
X data shape: (32581, 11)
y data shape: (32581,)


In [22]:
from sklearn.model_selection import train_test_split

def split_train_test(X, y, test_size, random_state=None):
    """
    The function to split dataset into data train and data test.

    Parameters
    ----------
    X : pandas.DataFrame
        Dataset feature/input.
    y : padas.Series
        Dataset target/output.
    test_size: float
        Proportion of data used as test set.   
    random_state: int (default: None)
        Seed for random number generator to ensure 
        reproducible result.

    Returns
    -------
    X_train : pandas.DataFrame
        Data feature for training.
    X_test : pandas.DataFrame
        Data feature for testing.
    y_train : padas.Series
        Data target for training.
    y_test : padas.Series
        Data target for testing.
    """

    X_train, X_test, y_train, y_test = train_test_split (
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y
    )

    print(f"X train shape: {X_train.shape}")
    print(f"X test shape: {X_test.shape}")
    print(f"y train shape: {y_train.shape}")
    print(f"y test shape: {y_train.shape}")

    return X_train, X_test, y_train, y_test

In [23]:
X_train, X_non_train, y_train, y_non_train = split_train_test(
    X,
    y,
    test_size=0.2,
    random_state=42
)

X train shape: (26064, 11)
X test shape: (6517, 11)
y train shape: (26064,)
y test shape: (26064,)


In [24]:
X_valid, X_test, y_valid, y_test = split_train_test(
    X_non_train,
    y_non_train,
    test_size=0.5,
    random_state=42
)

X train shape: (3258, 11)
X test shape: (3259, 11)
y train shape: (3258,)
y test shape: (3258,)


In [26]:
import joblib

def serialize_data(data, path):
    """
    The function to save serialize (object) Python into file.

    Parameters
    ----------
    data : object
        The Python object to be serialized (e.g. DataFrame, Series, array).
    path: str
        The location or file name where the object will be saved.

    Returns
    -------
    None
        This function does not return any value.
    """

    joblib.dump(data, path)

In [27]:
serialize_data(X_train, "X_train.pkl")
serialize_data(y_train, "y_train.pkl")

serialize_data(X_test, "X_test.pkl")
serialize_data(y_test, "y_test.pkl")

serialize_data(X_valid, "X_valid.pkl")
serialize_data(y_valid, "y_valid.pkl")

In [None]:
def deserialize_data(path):
    """
    The function to load (deserialized)
    Python objects from serialized files.

    Parameters
    ----------
    path : str
        The location or file name
        where the object is serialized.

    Returns
    -------
    data : object
        The deserialized Python object.
    """

    data = joblib.load(path)
    return data