In [2]:
import pandas as pd
import numpy as np

# Path of the source Excel workbook (relative to the notebook's working directory)
FILE_NAME = '../Sources/Data.xlsx'

# Open the workbook once. The context manager closes the underlying file
# handle when the block ends, and the same open workbook is reused for both
# listing the sheets and reading the selected one.
with pd.ExcelFile(FILE_NAME) as workbook:
    # Get all sheet names in the Excel file
    sheet_name_list = workbook.sheet_names

    # Show every sheet name together with its index number
    for index, sheet_name in enumerate(sheet_name_list):
        print(f"{index} : {sheet_name}")

    # Pick the sheet to process by its position in the list printed above
    SELECT_SHEET_NAME_1 = sheet_name_list[4]

    # Load the selected sheet into a dataframe, skipping the first 5 rows
    df = pd.read_excel(workbook, sheet_name=SELECT_SHEET_NAME_1, skiprows=5)

0 : Index
1 : Vol. of Plastic card usage
2 : Value. of Plastic card usage
3 : Payment Cards on ATM & POS 
4 : Mobile,internet banking
5 : e-Money
6 : promptpay


In [3]:
### Data Cleansing ###

# Name the label column, keep only rows that have a label, drop the unused
# leading column and turn the "n.a." placeholder into a real missing value.
df = (
    df.rename(columns={"Unnamed: 1": "Attribute"})
      .dropna(subset=["Attribute"])
      .drop(columns=["Unnamed: 0"])
      .replace("n.a.", np.nan)
)

# Cleanse column names: trim surrounding whitespace first ...
col_list_trim = [col_name.strip() for col_name in df.columns]

# ... then, for names made of three or more space-separated tokens, cut off
# the last token (the letter that follows the year). Names with at most two
# tokens are kept unchanged.
col_list_clean = [
    trimmed if trimmed.count(" ") < 2 else trimmed.rsplit(" ", 1)[0]
    for trimmed in col_list_trim
]

# Overwrite the existing column labels with the cleaned ones
df.columns = col_list_clean



In [4]:
### Data Cleansing 2 ###

# Remove the trailing "<number>/" token from 'Attribute' labels: when a label
# ends with "/", its last space-separated token is dropped.
text_clean_list = [
    " ".join(label.split(" ")[:-1]) if label.endswith("/") else label
    for label in df['Attribute']
]

# Labels that start with one of these prefixes (three leading spaces included)
# are replaced wholesale by a short measure name.
PREFIX_RELABEL = {
    "   No.": '   agreements',
    "   Volume": '   volume_k',
    "   Value": '   value_b',
}

for position, label in enumerate(text_clean_list):
    for prefix, replacement in PREFIX_RELABEL.items():
        if label.startswith(prefix):
            text_clean_list[position] = replacement
            break  # the prefixes are mutually exclusive

df['Attribute'] = text_clean_list

In [5]:
### Data Shaping1 ###
# Turn each attribute label into its full "parent ::: child" path

result_text_list = []
Parent = []

for label in text_clean_list:
    # Splitting on three spaces yields one extra piece per three-space
    # separator in the label; that count is used as the nesting depth.
    pieces = label.split("   ")
    depth = len(pieces) - 1

    # Keep the ancestors up to this depth and append the current label.
    # A depth of 0 starts a fresh top-level path.
    Parent = Parent[:depth] + [pieces[-1].strip()]
    result_text_list.append(" ::: ".join(Parent))

# Assign the path strings to the 'Attribute' column
df['Attribute'] = result_text_list

# Expand the path into one column per hierarchy level (Attribute_0, Attribute_1, ...)
prefix_col = 'Attribute_'
hierarchy_cols = df['Attribute'].str.split(" ::: ", expand=True).add_prefix(prefix_col)

# Put the level columns in front of the original data and drop the path column
df = hierarchy_cols.join(df).drop(columns=["Attribute"])

In [6]:
### Data Shaping 2 ###
# Remove head (total) rows

# Get the hierarchy-level columns
selected_col = [col for col in df if col.startswith('Attribute_')]

# Identify head (total) rows: a row whose level value differs from the row
# above it but equals the row below it, i.e. the first row of a run of
# repeated values.
#
# Values are read by POSITION, not by index label. Earlier cells drop rows
# without resetting the index, so label-based lookups such as df[col][j]
# would raise KeyError or compare the wrong rows whenever the labels are not
# exactly 0..n-1.
Row_to_Remove = []
for Attribute in selected_col:
    # Normalise every kind of missing marker (None / NaN / NA) to None so the
    # comparisons below are plain Python comparisons.
    level_values = [None if pd.isna(value) else value for value in df[Attribute].tolist()]
    last_position = len(level_values) - 1

    for j, cur_val in enumerate(level_values):
        if cur_val is None:
            continue  # no value at this level, cannot be a head row here

        # "" acts as a sentinel beyond the first and last row
        prev_val = level_values[j - 1] if j > 0 else ""
        next_val = level_values[j + 1] if j < last_position else ""

        if cur_val != prev_val and cur_val == next_val:
            Row_to_Remove.append(j)

# Remove the head (total) rows; positions are translated to index labels so
# the drop is correct for any index.
df.drop(index=df.index[sorted(set(Row_to_Remove))], inplace=True)

# Reset the row index
df.reset_index(inplace=True, drop=True)

In [7]:
### Data Shaping 3 ###

# Split the columns into hierarchy (attribute) columns and measure columns
attribute_col_list = [name for name in df.columns if name.startswith(prefix_col)]
measure_col_list = [name for name in df.columns if name not in attribute_col_list]

df = (
    df
    # Remove rows in which every measure column is blank
    .dropna(axis=0, subset=measure_col_list, how='all')
    # Unpivot the measure columns into (variable, value) pairs
    .melt(id_vars=attribute_col_list, value_vars=measure_col_list)
    # Give the hierarchy levels and the unpivoted pair their final names
    .rename(columns={
        "Attribute_0": "ChannelType",
        "Attribute_1": "MeasureName",
        "Attribute_2": "UsageType",
        "variable": "MonthYear",
        "value": "M_Value",
    })
    # Add the constant transaction-type column
    .assign(TransType='Online Banking')
)

In [8]:
# Write the reshaped dataframe to CSV, leaving out the row index
OUTPUT_CSV = '../Clean/Mobile_internet_banking_Clean.csv'
df.to_csv(OUTPUT_CSV, index=False)
