In [1]:
import datetime

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load ../Data/Np_cleaned.parquet into the working DataFrame `df`
df = pd.read_parquet(path=r'../Data/Np_cleaned.parquet')

In [3]:
# Quick inspection: print the (rows, columns) tuple, then show each column's dtype
frame_dims = df.shape
print(frame_dims)
column_types = df.dtypes
display(column_types)

(6501, 26)


TicketNo                       object
fld_BranchCode                 object
fld_RegionCode                 object
fld_AreaCode                   object
fld_BranchName                 object
PrincipalAmt                  float64
LoanDate               datetime64[ns]
DateCreated            datetime64[ns]
ExpiryDate             datetime64[ns]
TransStatus                    object
FormNo                         object
StorageGroupName               object
DesignModelName                object
fld_Weight                    float64
fld_Karat                      object
fld_ItemDescription            object
fld_StorageGroupID             object
fld_AppraiseValue             float64
PromoCode                      object
PromoReason                    object
InterestNewPrenda             float64
fld_CustomerNo                 object
new_PTStarRating              float64
fld_FirstName                  object
fld_MiddleName                 object
fld_LastName                   object
dtype: object

In [4]:
from sklearn.preprocessing import LabelEncoder

# --- Timestamp features -------------------------------------------------
# Parse the creation timestamp once and reuse it for every derived column.
created_at = pd.to_datetime(df['DateCreated'])
df['datetime'] = created_at

# Seconds elapsed since midnight
df['time_numeric'] = (
    created_at.dt.hour * 3600
    + created_at.dt.minute * 60
    + created_at.dt.second
)

# Whole days elapsed since the 1990-01-01 reference date
reference_date = pd.to_datetime('1990-01-01')
df['date_numeric'] = (created_at - reference_date).dt.days

# --- One-hot encoding ---------------------------------------------------
# One indicator column per distinct value, prefixed with 'Status_' / 'Promo_'
status_one_hot = pd.get_dummies(df['TransStatus'], prefix='Status')
promo_one_hot = pd.get_dummies(df['PromoCode'], prefix='Promo')

# --- Label encoding -----------------------------------------------------
# Map every distinct customer number to an integer code; the fitted encoder
# stays available as `label_encoder`.
label_encoder = LabelEncoder()
customer_codes = label_encoder.fit_transform(df['fld_CustomerNo'])
df['client_label_encoded'] = customer_codes

In [5]:
# Assemble the feature matrix: the numeric / label-encoded columns first,
# followed column-wise by the promo and status one-hot blocks.
base_columns = ['time_numeric', 'date_numeric', 'client_label_encoded']
X = pd.concat([df[base_columns], promo_one_hot, status_one_hot], axis=1)

In [6]:
import pyarrow as pa
import pyarrow.parquet as pq

# Persist the feature matrix X as a Parquet file.
# The path uses forward slashes, like the input path passed to pd.read_parquet:
# a backslash is only a separator on Windows, so '..\Data\X.parquet' would be
# treated elsewhere as a single literal filename in the current directory.
# Forward slashes are accepted on Windows as well.
table = pa.Table.from_pandas(X)
pq.write_table(table, r'../Data/X.parquet')