In [12]:
### Data Cleaning
### Handling Null values, Parsing, Re-categorization
### by: Chester Hansel Duco, Philippines

In [13]:
import pandas as pd

In [14]:
# Read the Parquet file into a DataFrame.
# The relative path uses forward slashes so it resolves on Windows, Linux and
# macOS alike; a backslash-separated path is only understood on Windows.
df = pd.read_parquet('../Data/np_216.parquet')

In [15]:
# View Data: print the (rows, columns) shape, then render the first rows.
frame_shape = df.shape
print(frame_shape)
preview = df.head()
display(preview)

(325307, 27)


Unnamed: 0,TicketNo,fld_BranchCode,fld_RegionCode,fld_AreaCode,fld_BranchName,PrincipalAmt,LoanDate,DateCreated,ExpiryDate,TransStatus,...,fld_StorageGroupID,fld_AppraiseValue,PromoCode,PromoReason,InterestNewPrenda,fld_CustomerNo,new_PTStarRating,fld_FirstName,fld_MiddleName,fld_LastName
0,61355SG011334,61355,2B,216,CLH Calumpit Pio Cruzcosa,1000.0,2022-01-11 16:55:00,2022-01-11 16:56:00,2022-05-13,Pulled-Out,...,13,1000.0,[None],,40.0,613550043993,,NENITA,FRANCISCO,ESPIRITU
1,61355SG009783,61355,2B,216,CLH Calumpit Pio Cruzcosa,3400.0,2021-05-18 09:30:00,2021-05-18 09:37:00,2021-09-17,Redeemed,...,13,4320.0,[None],,136.0,613550066718,,TRISTAN JAY,OFIAZA,ROXAS
2,20112SG037342,20112,2B,216,CLH Plaridel,2934.0,2020-01-25 16:31:00,2020-01-25 16:32:00,2020-05-25,Redeemed,...,13,2934.0,[None],,117.36,201120185053,,ELVIRA,G.,LIM
3,20112SG043933,20112,2B,216,CLH Plaridel,7000.0,2021-12-30 08:56:00,2021-12-30 08:57:00,2022-05-02,Redeemed,...,13,7250.0,[None],,280.0,201120373900,,MA NILA,TULOP,MERABIL
4,20112SG037072,20112,2B,216,CLH Plaridel,3215.0,2020-01-10 12:29:00,2020-01-10 12:31:00,2020-05-11,Redeemed,...,13,3352.25,[None],,128.6,201120252953,,ROSEMARIE,CUTLER,KATSUMATA


In [16]:
## Handling Missing Values

In [17]:
# Imputation: a missing star rating is replaced with zero.
star_rating = df['new_PTStarRating']
df['new_PTStarRating'] = star_rating.fillna(0)

In [18]:
# Deleting Columns: remove the serial-number column from the working frame.
# NOTE: df is rebound here, so running this cell a second time raises KeyError
# because the column is already gone.
df = df.drop(columns=['fld_SerialNo'])

In [19]:
# Check for null values
def check_nulls(_df):
    """Display the columns of ``_df`` that contain at least one null value.

    Prints the banner ``check nulls`` and then displays a table indexed by
    column name with two columns:

    * ``null_count`` - number of null entries in the column.
    * ``null_count_percentage`` - ``null_count`` divided by the number of
      rows (a fraction, not scaled to 100).

    Columns without any nulls are left out of the table. Nothing is returned.
    """
    print('check nulls')

    # Count the nulls once and derive both columns from the same Series.
    missing_per_column = _df.isna().sum()
    summary = missing_per_column.to_frame('null_count')
    summary['null_count_percentage'] = missing_per_column / len(_df)

    has_missing = summary['null_count'] > 0
    display(summary[has_missing])

# Show which columns of df still hold null values at this point.
check_nulls(df)

check nulls


Unnamed: 0,null_count,null_count_percentage
ExpiryDate,20,6.1e-05
fld_ItemDescription,8,2.5e-05


In [20]:
# Re-categorize
# Fold the "NP"-prefixed promo codes into their plain PRD equivalents.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df['PromoCode']: that form is a chained
# in-place update on a column selection, which pandas 2.x flags with a
# FutureWarning and which does not modify df once Copy-on-Write is active
# (the default from pandas 3.0).
# The mapping used to list "NPPRD10" twice; the redundant entry is removed.
# TODO(review): "NPPRD200" is not mapped and remains in the data - confirm
# whether the duplicated key was meant to be a different code.
df['PromoCode'] = df['PromoCode'].replace({
    "NPPRD5": "PRD5",
    "NPPRD10": "PRD10",
    "NPPRD15": "PRD15",
})
display(df['PromoCode'].value_counts())

# Unify the spelling/casing variants of the transaction status labels.
df['TransStatus'] = df['TransStatus'].replace({
    "Pulled Out": "Pulled-Out",
    "SOLD": "Sold",
})
display(df['TransStatus'].value_counts())

[None]             173548
NPJEWEL            103170
PRD10               20129
PRD15               14029
PRD5                 8835
RDMCQ1               2437
CLP1                 1706
RPAWN                1233
PRD1                  113
NPPRD200               27
RRDRNW8                24
NPGADGET               20
RDM1                   10
RRDUPSIZE              10
PRD500                  7
NPNONJEWEL              5
NPJEWEL EMPDISC         4
Name: PromoCode, dtype: int64

Redeemed      172804
Pulled-Out     57221
Sold           45145
Renewed        22293
New Prenda     22185
Expired         5659
Name: TransStatus, dtype: int64

In [21]:
# Top Clients: keep only the rows belonging to the 20 customer numbers that
# occur most often in df.
rows_per_customer = df.groupby('fld_CustomerNo').size()
top = rows_per_customer.sort_values(ascending=False).head(20).index
display(top)

# NOTE: df is narrowed to those customers from here on.
is_top_client = df['fld_CustomerNo'].isin(top)
df = df[is_top_client]
display(df.head())

Index(['603880052772', '201110475003', '201110433348', '209080012716',
       '603750427211', '201020210909', '617520014594', '201110640845',
       '603750011557', '201110599464', '603870274382', '201010543246',
       '201110526538', '9516304', '608750019480', '609490059965',
       '603880608276', '605070000976', '603880291782', '209070678298'],
      dtype='object', name='fld_CustomerNo')

Unnamed: 0,TicketNo,fld_BranchCode,fld_RegionCode,fld_AreaCode,fld_BranchName,PrincipalAmt,LoanDate,DateCreated,ExpiryDate,TransStatus,...,fld_StorageGroupID,fld_AppraiseValue,PromoCode,PromoReason,InterestNewPrenda,fld_CustomerNo,new_PTStarRating,fld_FirstName,fld_MiddleName,fld_LastName
20,61752SG002038,61752,2B,216,CLH Tarcan,21000.0,2020-09-08 14:29:00,2020-09-08 14:30:00.000,2021-01-08,Redeemed,...,7,24075.0,[None],,840.0,617520014594,0.0,GRACE,V.,CASTRO
203,20111SG094315,20111,2B,216,CLH Malolos 2,10755.0,2020-02-05 17:53:00,2020-02-05 17:54:00.000,2020-06-05,Redeemed,...,13,10755.14,RDMCQ1,covid,430.2,201110475003,0.0,LEONORA,SANTIAGO,BASALLO
286,61458SG004221,61458,2B,216,CLH Balagtas Saint Anne Square,3430.0,2020-01-03 13:45:00,2020-01-03 13:45:00.000,2020-05-04,Sold,...,13,3430.0,[None],,137.2,201110599464,0.0,CZARINA ANNE,ANGELES,ESGUERRA
322,20111SG0108668,20111,2B,216,CLH Malolos 2,5500.0,2021-10-15 13:50:00,2021-10-15 13:53:00.000,2022-02-14,Pulled-Out,...,13,5500.0,PRD10,4star,220.0,201110433348,4.0,MYLENE,MERCADO,GABRIEL
394,60388NP001832,60388,2B,216,CLH Malolos 1,53431.0,2022-11-02 00:00:00,2022-11-02 11:32:51.750,2023-03-03,Redeemed,...,13,46462.5,PRD15,,2137.24,603880608276,4.0,MA RACHEL,MATEO,VALENZUELA


In [22]:
import pyarrow as pa
import pyarrow.parquet as pq

# Persist the cleaned frame as a Parquet file through pyarrow.
# The relative path uses forward slashes so it resolves on Windows, Linux and
# macOS alike; a backslash-separated path is only understood on Windows.
table = pa.Table.from_pandas(df)
pq.write_table(table, '../Data/Np_cleaned.parquet')

