In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Path to the source workbook
FILE_NAME = '../Sources/Data.xlsx'

# Position of the sheet to load, as shown in the listing printed by this cell
SELECT_SHEET_INDEX = 14

# Number of leading rows skipped before the header row is read
SKIP_ROWS = 5

# Open the workbook once: the context manager closes the file handle on exit,
# and the same open workbook is reused for the read instead of parsing it twice
with pd.ExcelFile(FILE_NAME) as workbook:
    # Get all sheet names in the Excel file
    sheet_name_list = workbook.sheet_names

    # Show every sheet name with its index number
    for index, sheet_name in enumerate(sheet_name_list):
        print(f"{index} : {sheet_name}")

    # Pick the sheet to read by its position in the listing
    SELECT_SHEET_NAME_1 = sheet_name_list[SELECT_SHEET_INDEX]

    # Load the selected sheet into a dataframe
    df = pd.read_excel(workbook, sheet_name=SELECT_SHEET_NAME_1, skiprows=SKIP_ROWS)

0 : Index
1 : Vol. of Txn by Payment System
2 : Value of Txn by Payment System
3 : Vol. of Plastic card usage
4 : Value. of Plastic card usage
5 : number of cards
6 : No of EFTPOS Terminals <2018
7 : No of EFTPOS Terminals 2018+
8 : Payment Cards on ATM & POS 
9 : Mobile,internet banking
10 : Mobile,internet banking add
11 : Bill Payment
12 : Vol. of e-payment
13 : Value. of e-payment
14 : e-Money
15 : Foreign cards & emoney
16 : Thai cards spent abroad
17 : promptpay


In [2]:
### Data Cleansing ###

# Name the label column, drop rows without a label, drop the unrelated first
# column and turn the "n.a." placeholder into a real missing value
df = (
    df.rename(columns={"Unnamed: 1": "Attribute"})
      .dropna(subset=["Attribute"])
      .drop(columns=["Unnamed: 0"])
      .replace("n.a.", np.nan)
)

# Filter out the "Total" rows of column 'Attribute'.
# na=True makes any label that is not text count as a match, so it is removed
# too -- the same rows the previous `== False` comparison kept are kept here.
# .copy() detaches the result from the unfiltered frame so that later column
# assignments on df do not raise SettingWithCopyWarning.
is_total_row = df['Attribute'].str.contains("Total", na=True)
df = df[~is_total_row].copy()


def _clean_column_name(col_name):
    """Normalise a column name and drop its trailing token when it has more than two.

    The name is split on any run of whitespace, so a doubled space inside a
    header is not counted as an extra (empty) token. Names with one or two
    tokens are kept; longer names lose their last token (the letter that
    follows the year).
    """
    tokens = col_name.split()
    if len(tokens) > 2:
        tokens = tokens[:-1]
    return " ".join(tokens)


# Cleanse column names (trim, then remove the letter after the year)
col_list_trim = [col_name.strip() for col_name in df.columns]
col_list_clean = [_clean_column_name(col_name) for col_name in col_list_trim]

# Overwrite the existing column names with the cleaned ones
df.columns = col_list_clean


In [3]:
### Data Cleansing 2 ###

# A label that begins with one of these prefixes is replaced by the fixed name next to it
PREFIX_TO_LABEL = (
    ("No.", 'Cards_Account'),
    ("Topup", 'Topup_Value'),
    ("Spending", 'Spending_Value'),
)


def _drop_trailing_marker(label):
    """Remove the last space-separated token of ``label`` when it ends with "/"."""
    if label.endswith("/"):
        # Everything before the final space; empty when the label contains no space
        return label.rpartition(" ")[0]
    return label


def _shorten_label(label):
    """Return the fixed name of the first matching prefix, or ``label`` unchanged."""
    for prefix, short_name in PREFIX_TO_LABEL:
        if label.startswith(prefix):
            return short_name
    return label


# Remove the "/Number" marker from each 'Attribute' value, then apply the fixed names
text_clean_list = [
    _shorten_label(_drop_trailing_marker(label)) for label in df['Attribute']
]

df['Attribute'] = text_clean_list

In [4]:
### Data Shaping1 ###
# Separate each attribute into parent and sub-topic

# Splitting a label on this marker yields one extra leading piece per nesting level
LEVEL_INDENT = "   "
# Text placed between the levels of a full path
PATH_SEPARATOR = " ::: "

result_text_list = []
Parent = []

for raw_label in text_clean_list:
    pieces = raw_label.split(LEVEL_INDENT)
    depth = len(pieces) - 1
    # Keep the ancestors above this depth and add the current label;
    # depth 0 keeps no ancestors, so it starts a new top-level path
    Parent = Parent[:depth] + [pieces[-1].strip()]
    result_text_list.append(PATH_SEPARATOR.join(Parent))

# Assign the full path to the 'Attribute' column
df['Attribute'] = result_text_list

# Expand Attribute into multiple columns, one per level of the hierarchy
prefix_col = 'Attribute_'
level_columns = df['Attribute'].str.split(PATH_SEPARATOR, expand=True)
level_columns = level_columns.add_prefix(prefix_col)
df = level_columns.join(df)

# Drop the combined Attribute column
df = df.drop(columns=["Attribute"])

In [5]:
### Data Shaping 2 ###

# Split the columns into attribute (hierarchy) columns and measure columns
attribute_col_list = []
measure_col_list = []
for col_name in df.columns:
    if col_name.startswith(prefix_col):
        attribute_col_list.append(col_name)
    else:
        measure_col_list.append(col_name)

df = (
    # Remove rows that have no value in any measure column
    df.dropna(axis=0, subset=measure_col_list, how='all')
      # Unpivot the measure columns into one row per attribute and column name
      .melt(id_vars=attribute_col_list, value_vars=measure_col_list)
      # Rename the hierarchy columns and the two columns produced by melt
      .rename(columns={
          "Attribute_0": "MeasureName",
          "Attribute_1": "BankType",
          "variable": "MonthYear",
          "value": "M_Value",
      })
      # Add a constant column naming the transaction type
      .assign(TransType='e-Money')
)

In [6]:
# Save output to csv
OUTPUT_FILE = Path('../Clean/e-Money_Clean.csv')

# Create the destination folder when it is missing, so the export also works
# on a fresh checkout; to_csv itself does not create directories
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_FILE, index=False)
