In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the customer and transaction extracts into DataFrames.
# The paths are relative to the notebook's working directory. Forward slashes
# are used because they resolve on Windows, macOS and Linux alike, whereas
# backslash separators are only understood on Windows.
customer_file = '../data/customers_tm1_e.csv'
transaction_file = '../data/transactions_tm1_e.csv'
cust_df = pd.read_csv(customer_file)
trans_df = pd.read_csv(transaction_file)

In [None]:
# Preview the first rows of the customer table.
cust_df.head()

In [None]:
# Preview the first rows of the transaction table.
trans_df.head()

In [None]:
# Print the DataFrame.info() summary of the customer table.
cust_df.info()

In [None]:
# Print the DataFrame.info() summary of the transaction table.
trans_df.info()

In [None]:
# Boolean mask of the missing (NaN) cells in the transaction table.
is_missing = trans_df.isna()

# Summing the mask down the rows gives the number of missing values per column.
missing_per_column = is_missing.sum(axis="index")
print(missing_per_column)

In [None]:
# Customers without a recorded start balance are treated as starting from 0.
# TODO: derive the balance from the transaction data if the information is there.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection is chained assignment, which pandas warns about and
# which does not modify cust_df once Copy-on-Write is enabled.
cust_df['start_balance'] = cust_df['start_balance'].fillna(0)

In [None]:
# List the column names of both tables for quick reference.
print(f"Customer columns:\n{cust_df.columns.tolist()}\n")
print(f"Transactions columns:\n{trans_df.columns.tolist()}")

In [None]:
# Sanity check: does every `amount` equal deposit + withdrawal?
# Series.equals is strict, so a single differing row makes the result False.
deposit_plus_withdrawal = trans_df["deposit"] + trans_df["withdrawal"]
trans_df['amount'].equals(deposit_plus_withdrawal)

In [None]:
# The check above came back False, so investigate on an independent copy and
# leave trans_df itself untouched.
ttemp_df = trans_df.copy(deep=True)

In [None]:
# transaction_sum: what `amount` should be if it were withdrawal + deposit.
ttemp_df["transaction_sum"] = ttemp_df["withdrawal"].add(ttemp_df["deposit"])
# transaction_diff: how far the recorded amount is from that sum.
ttemp_df["transaction_diff"] = ttemp_df["amount"].sub(ttemp_df["transaction_sum"])
# Keep only the rows where the two disagree.
filter1 = ttemp_df.loc[ttemp_df["transaction_diff"].ne(0)]

In [None]:
# 18 rows have amounts that don't equal the sum of the withdrawal and deposit
# (count() reports the number of non-null values in each column of filter1).
filter1.count()

In [None]:
# Show the mismatching rows side by side. In these rows the recorded amount is
# significantly different from withdrawal + deposit.
mismatch_columns = ['customer_id', 'transaction_date', 'amount', 'transaction_sum', 'transaction_diff']
display(filter1[mismatch_columns])

In [None]:
# Option 1: trust withdrawal + deposit and overwrite `amount` wherever the two disagree.
amount_is_off = ttemp_df["transaction_diff"] != 0
ttemp_df["amount"] = np.where(amount_is_off, ttemp_df["transaction_sum"], ttemp_df["amount"])

In [None]:
# Recompute the difference with the current `amount` values and list the rows
# that still disagree; count() gives the non-null count per column of what is left.
ttemp_df["transaction_diff"] = ttemp_df["amount"].sub(ttemp_df["transaction_sum"])
filter1 = ttemp_df.loc[ttemp_df["transaction_diff"].ne(0)]
filter1.count()

In [None]:
# The other option would be to change the withdrawal and deposit values so that they match the amount.

In [None]:
# Running total of transaction amounts per (customer_id, account_id), in date order.
# transaction_date has not been cast to datetime at this point in the notebook,
# so the sort key is parsed here. Sorting the raw values would order text
# lexicographically, which is not chronological for formats such as M/D/YYYY.
# NOTE: this total covers transaction amounts only; no start balance is added.
txn_order = pd.to_datetime(trans_df['transaction_date'])
chronological = (
    trans_df.assign(transaction_date=txn_order)
            .sort_values(['customer_id', 'transaction_date'], ascending=True)
)
# cumsum keeps the original row labels, so the assignment aligns back by index
# and trans_df's own row order is unchanged.
trans_df['account_total'] = chronological.groupby(['customer_id', 'account_id'])['amount'].cumsum()

In [None]:
# Parse both date columns into pandas datetimes so the .dt accessor and
# date-based grouping can be used on them.
for date_col in ('date', 'transaction_date'):
    trans_df[date_col] = pd.to_datetime(trans_df[date_col])

In [None]:
# Total transaction amount per customer per calendar month.
# "M" is the documented month alias and matches the to_period("M") call used
# further down in this notebook; lowercase "m" is not a documented period alias
# and means minutes in other pandas APIs (e.g. Timedelta units).
month_trans = trans_df.groupby(['customer_id', trans_df.transaction_date.dt.to_period("M")])["amount"].sum()

In [None]:
# Attach the customer attributes to every transaction row. The left join keeps
# all transactions, including any whose customer_id has no match in cust_df.
transaction_columns = ['account_id', 'customer_id', 'transaction_date', 'amount', 'account_total']
merge_df = trans_df[transaction_columns].merge(cust_df, how='left', on='customer_id')

In [None]:
# Preview the first rows of the merged transaction/customer table.
display(merge_df.head())


In [None]:
# Row-wise sum of each transaction's amount and the customer's start balance
# (DataFrame.sum skips NaN by default, so a missing value counts as 0).
sum_df = merge_df.loc[:, ['amount', 'start_balance']].sum(axis="columns")

In [None]:
# Monthly roll-up of the transactions per customer and account.
# pd.Grouper buckets transaction_date by month while keeping it a datetime
# column, so the .dt accessor can still be used on it downstream.
# The aggregations are given as the string 'sum': passing the Python builtin
# `sum` to .agg() raises a FutureWarning in recent pandas, and the string form
# is the documented way to keep the current groupby-sum behaviour.
monthly_keys = ['customer_id', 'account_id',
                pd.Grouper(key='transaction_date', freq="M")]
temp_df = trans_df.groupby(monthly_keys, as_index=False).agg({'amount': 'sum',
                                                              'deposit': 'sum',
                                                              'withdrawal': 'sum'})

In [None]:
# Show the monthly roll-up using the notebook's rich table rendering (as the
# other cells in this notebook do) rather than a plain-text print.
display(temp_df)

In [None]:
# Print the DataFrame.info() summary of the monthly roll-up.
temp_df.info()

In [None]:
# One row per customer per account-month. The left join starts from cust_df,
# so customers without any rows in temp_df are kept as well.
tmer_df = cust_df.merge(temp_df, how='left', on='customer_id')

In [None]:
# Cumulative monthly transaction amount per (customer_id, account_id), in date order.
# The cumsum result keeps the original row labels, so assigning it back aligns
# by index and tmer_df's own row order is unchanged.
monthly_sorted = tmer_df.sort_values(by=['customer_id', 'transaction_date'])
tmer_df['transaction_total'] = monthly_sorted.groupby(['customer_id', 'account_id'])['amount'].cumsum()

In [None]:
# Account balance = cumulative transactions plus the customer's start balance.
tmer_df['account_total'] = tmer_df['transaction_total'].add(tmer_df['start_balance'])

In [None]:
# Display the account_total column.
tmer_df['account_total']

In [None]:
# Work on an independent copy of the merged monthly table so the clean-up
# steps do not alter tmer_df.
monthly_look_df = tmer_df.copy(deep=True)

In [None]:
# Print the DataFrame.info() summary of the monthly view.
monthly_look_df.info()

In [None]:
# Trim the monthly table down to account balance and transaction information.
# The table is rebuilt from tmer_df (the frame monthly_look_df is copied from)
# in a single chain where every step returns a new frame, so the cell can be
# re-run without a KeyError from columns that were already dropped or renamed.
monthly_look_df = (
    tmer_df
    .drop(columns=['dob', 'state', 'withdrawal', 'deposit'])
    # Represent each bucket as a monthly Period.
    .assign(transaction_date=lambda frame: frame['transaction_date'].dt.to_period("M"))
    .rename(columns={'transaction_date': "transaction_month", "amount": "transaction_amount"})
)

In [None]:
# Preview the first rows of the cleaned-up monthly view.
monthly_look_df.head()