In [12]:
# Standard Libraries
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import colorcet as cc
import textwrap

# Date & Time
import datetime as dt
import calendar
from calendar import monthrange

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import HTML

# Statistics & Maths
import math
from scipy.stats import mannwhitneyu
from sklearn.preprocessing import MinMaxScaler
import ruptures as rpt

# Functions
import sys
import os

# Add the sibling ../SCRIPTS directory (resolved against the current working directory) to the import path
sys.path.append(os.path.abspath("../SCRIPTS"))

from DATA_ANALYSIS import DESCRIBE_VARIABLE, DAY_TYPE_LABELLING, TEMPORAL_AGGREGATION, RUN_STAT_TEST, MAP_STATION_COORDINATES
from REPORT_MANAGEMENT import REGISTER_REPORT_COMPONENT, SAVE_PLOT_AND_REGISTER, GENERATE_REPORT

In [13]:
# Load the raw cash-flow CSV into a DataFrame and preview its first five rows
CASH_FLOW = pd.read_csv(
    filepath_or_buffer='../DATA/RAW/CASH_FLOW_01082018_31072019.csv',
)
CASH_FLOW.head(n=5)

Unnamed: 0,Date,Income/Expenses,Category,Memo,Amount
0,2019-05-28,Expenses,Telephone,"Top Up Singtel Card @7-Eleven, Clementi",-10.0
1,2019-05-28,Expenses,Gift,,-15.3
2,2019-05-28,Expenses,Drinks,"Bubble Tea @Koi, Takashimaya, Orchard",-1.8
3,2019-05-28,Expenses,Gift,,-32.0
4,2019-05-28,Expenses,Food,"Korean BBQ @I Am Kim, Rochor",-15.0


In [14]:
# Column overview: dtypes and non-null counts (shows which columns have missing values)
CASH_FLOW.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622 entries, 0 to 621
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             622 non-null    object 
 1   Income/Expenses  622 non-null    object 
 2   Category         622 non-null    object 
 3   Memo             584 non-null    object 
 4   Amount           622 non-null    float64
dtypes: float64(1), object(4)
memory usage: 24.4+ KB


In [15]:
# Show how pandas stored the Date column after read_csv (no date parsing was requested)
print(CASH_FLOW.dtypes["Date"])

object


In [16]:
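# Optional clean-up, left disabled: drop rows whose Date is missing.
# Not needed for this file: CASH_FLOW.info() reports Date as 622 non-null out of 622 entries.
# CASH_FLOW = CASH_FLOW[CASH_FLOW["Date"].notna()]
# CASH_FLOW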

In [17]:
# Dataset Description
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments only numeric columns are summarised, which here is just Amount.
CASH_FLOW.describe()

Unnamed: 0,Amount
count,622.0
mean,1.656672
std,39.005284
min,-88.0
25%,-5.3
50%,-3.9
75%,-2.8
max,400.0


In [18]:
# Schema guard: stop at the first column the rest of the notebook relies on
# that is absent from the loaded frame.
EXPECTED_COLUMNS = [
    "Date",
    "Category",
    "Memo",
    "Income/Expenses",
    "Amount",
]
for col in EXPECTED_COLUMNS:
    if col in CASH_FLOW.columns:
        continue
    raise ValueError(f"Missing expected column: {col}")

In [19]:
# Split each Memo at the first '@' only (n=1): the text before it becomes Item,
# the text after it becomes Location. Any later '@' stays inside Location.
SPLIT_COLS = CASH_FLOW["Memo"].str.split('@', n=1, expand=True)

# Rows with a missing Memo keep a missing Item here.
CASH_FLOW["Item"] = SPLIT_COLS[0].str.strip()

# With expand=True the result only has a second column when at least one memo
# contains '@'. Guard against its absence instead of failing with KeyError.
if 1 in SPLIT_COLS.columns:
    # Memos without '@' (or missing memos) have no location part -> "Blank".
    CASH_FLOW["Location"] = SPLIT_COLS[1].str.strip().fillna("Blank")
else:
    CASH_FLOW["Location"] = "Blank"

In [20]:
# Preview the frame to confirm the Item and Location columns derived from Memo
CASH_FLOW.head()

Unnamed: 0,Date,Income/Expenses,Category,Memo,Amount,Item,Location
0,2019-05-28,Expenses,Telephone,"Top Up Singtel Card @7-Eleven, Clementi",-10.0,Top Up Singtel Card,"7-Eleven, Clementi"
1,2019-05-28,Expenses,Gift,,-15.3,,Blank
2,2019-05-28,Expenses,Drinks,"Bubble Tea @Koi, Takashimaya, Orchard",-1.8,Bubble Tea,"Koi, Takashimaya, Orchard"
3,2019-05-28,Expenses,Gift,,-32.0,,Blank
4,2019-05-28,Expenses,Food,"Korean BBQ @I Am Kim, Rochor",-15.0,Korean BBQ,"I Am Kim, Rochor"


In [21]:
# Location Check = number of comma-separated parts in Location (commas + 1).
# Rows without a location always count as a single part, whether they still
# hold the "Blank" placeholder or were already expanded to
# "Blank, Blank, Blank" by an earlier run of this cell, so re-running the cell
# gives the same result.
BLANK_LOCATIONS = ["Blank", "Blank, Blank, Blank"]
LOCATION_CHECK = (
    (CASH_FLOW["Location"].astype(str).str.count(",") + 1)
    .mask(CASH_FLOW["Location"].isin(BLANK_LOCATIONS), 1)
    .tolist()
)

# Plain assignment appends the column at the end on the first run and
# overwrites it on later runs. The previous insert(..., allow_duplicates=True)
# silently added a second "Location Check" column every time the cell re-ran.
CASH_FLOW["Location Check"] = LOCATION_CHECK

# Fill Empty Location With "Blank, Blank, Blank"
# fillna applies to every column, so any remaining missing value (e.g. Memo
# and the Item derived from it) becomes "Blank" as well.
CASH_FLOW = CASH_FLOW.fillna("Blank")
CASH_FLOW.loc[CASH_FLOW["Location"] == "Blank", "Location"] = "Blank, Blank, Blank"