In [23]:
import json
import os
import pickle

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sqlalchemy import create_engine



# Configuration: MySQL connection, source table names, and output artifact paths
class Config:
    """Shared settings for the one-hot encoding step.

    Holds the SQLAlchemy engine, the names of the source tables, and the
    paths of the artifacts written/read by ``encode``.
    """

    # Database URL. Set the GIVE_ME_SOME_CREDIT_DB_URL environment variable to
    # supply the connection string (and its credentials) from outside the
    # notebook; the literal below is only the backward-compatible fallback.
    # NOTE(review): the fallback still embeds a password -- drop it once every
    # environment provides the variable.
    engine = create_engine(
        os.environ.get(
            'GIVE_ME_SOME_CREDIT_DB_URL',
            'mysql+pymysql://root:123@localhost:3307/give_me_some_credit',
        )
    )

    # Source tables read with ``select *`` in train / test mode.
    train = 'cs_training_encoded'
    test = 'cs_test_encoded'

    # Sparse one-hot matrices saved with ``scipy.sparse.save_npz``.
    encoded_train_file='cs_training_discrete_onehot.npz'
    encoded_test_file='cs_test_discrete_onehot.npz'
    # Pickled fitted OneHotEncoder (written in train mode, read in test mode).
    # NOTE(review): the ".\" prefix is Windows path syntax; on POSIX this is a
    # literal file name containing a backslash -- confirm the target platform.
    encoder_file = r".\onehot_encoder.pkl"
    # Pickled output of ``encoder.get_feature_names_out`` for the train table.
    encoded_train_dict_file = r".\onehot_encode_dict.pkl"


In [32]:

def encode(mode: str = 'train') -> sparse.spmatrix:
    """One-hot encode a source table and save the encoded matrix to disk.

    In ``'train'`` mode the table ``Config.train`` is read, a
    ``OneHotEncoder(handle_unknown='ignore')`` is fitted on all of its
    columns, and three artifacts are written: the pickled encoder
    (``Config.encoder_file``), the encoded matrix
    (``Config.encoded_train_file``) and the pickled output feature names
    (``Config.encoded_train_dict_file``).

    In ``'test'`` mode the encoder pickled by a previous train run is loaded,
    the table ``Config.test`` is transformed with it, and the encoded matrix
    is written to ``Config.encoded_test_file``.

    Args:
        mode: ``'train'`` or ``'test'``.

    Returns:
        The encoded data as returned by ``encoder.transform`` (a scipy sparse
        matrix with the encoder's default settings).

    Raises:
        ValueError: if ``mode`` is neither ``'train'`` nor ``'test'``.
        FileNotFoundError: in test mode, if the pickled encoder is missing.
    """
    # Reject a bad mode before any database or file I/O happens.
    if mode not in ('train', 'test'):
        raise ValueError(
            f"parameter mode is wrong! expected 'train' or 'test', got {mode!r}"
        )

    if mode == 'train':
        df = pd.read_sql(f"select * from {Config.train}", Config.engine)

        encoder = OneHotEncoder(handle_unknown='ignore').fit(df)
        # Persist the fitted encoder so test mode can reuse the same categories.
        with open(Config.encoder_file, 'wb') as f:
            pickle.dump(encoder, f)

        encoded_data = encoder.transform(df)
        sparse.save_npz(file=Config.encoded_train_file, matrix=encoded_data)

        # Persist the generated column names (an array, despite the file name).
        feature_names = encoder.get_feature_names_out(df.columns)
        with open(Config.encoded_train_dict_file, 'wb') as f:
            pickle.dump(feature_names, f)
    else:
        # Load the encoder first so a missing artifact fails before the query.
        try:
            # NOTE: pickle.load executes arbitrary code from the file; only
            # load an encoder file produced by a trusted train run.
            with open(Config.encoder_file, 'rb') as f:
                encoder = pickle.load(f)
        except FileNotFoundError as exc:
            raise FileNotFoundError(
                exc.errno,
                "fitted encoder not found; run encode(mode='train') first",
                Config.encoder_file,
            ) from exc

        df = pd.read_sql(f"select * from {Config.test}", Config.engine)
        encoded_data = encoder.transform(df)
        sparse.save_npz(Config.encoded_test_file, encoded_data)

    return encoded_data

In [33]:
# Fit the encoder on the training table; saves the encoder, the encoded matrix and the feature names.
encode(mode='train')

<150000x303 sparse matrix of type '<class 'numpy.float64'>'
	with 1500000 stored elements in Compressed Sparse Row format>

In [34]:
# Encode the test table with the encoder pickled by a prior encode(mode='train') run.
encode(mode='test')

<101503x303 sparse matrix of type '<class 'numpy.float64'>'
	with 1015025 stored elements in Compressed Sparse Row format>