In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [133]:
# import dataset and load in a dataframe
customer_file = r'..\data\customers_tm1_e.csv'
transaction_file = r'..\data\transactions_tm1_e.csv'
cust_df = pd.read_csv(customer_file)
trans_df = pd.read_csv(transaction_file)

In [5]:
cust_df.head()

Unnamed: 0,customer_id,dob,state,start_balance,creation_date
0,91,1993-07-01,California,10180.56,2007-01-31
1,92,1985-12-05,New York,4757.68,2007-01-31
2,93,1987-11-19,Minnesota,6796.72,2007-01-31
3,94,1981-03-23,Minnesota,9870.48,2007-01-31
4,95,1970-04-06,California,12500.72,2007-01-31


In [32]:
trans_df.head()

Unnamed: 0,date,account_id,customer_id,amount,transaction_date,deposit,withdrawal,account_total
0,2007-01-31,24137947,91,3034.26,2007-01-31,3034.26,0.0,-2260.92
1,2007-01-31,24137947,91,-5295.18,2007-01-16,0.0,-5295.18,-5295.18
2,2007-02-28,24137947,91,0.0,2007-02-28,0.0,0.0,-2260.92
3,2007-03-31,24137947,91,-0.0,2007-03-30,0.0,-0.0,-2260.92
4,2007-03-31,24137947,91,-0.0,2007-03-11,0.0,-0.0,-2260.92


In [135]:
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116016 entries, 0 to 116015
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   customer_id    116016 non-null  int64  
 1   dob            116016 non-null  object 
 2   state          116016 non-null  object 
 3   start_balance  116013 non-null  float64
 4   creation_date  116016 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 4.4+ MB


In [134]:
trans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4977972 entries, 0 to 4977971
Data columns (total 7 columns):
 #   Column            Dtype  
---  ------            -----  
 0   date              object 
 1   account_id        int64  
 2   customer_id       int64  
 3   amount            float64
 4   transaction_date  object 
 5   deposit           float64
 6   withdrawal        float64
dtypes: float64(3), int64(2), object(2)
memory usage: 265.9+ MB


In [136]:
is_missing = trans_df.isna()

missing_per_column = is_missing.sum(axis = 0) 
print(missing_per_column)              

date                0
account_id          0
customer_id         0
amount              0
transaction_date    0
deposit             0
withdrawal          0
dtype: int64


##### Replacing the Na in starting balance with 0

In [137]:
# replacing empty start balances with 0, but this should be edited with transaction data if the information is there.
cust_df['start_balance'].fillna(0, inplace=True)

In [138]:
print(f"Customer columns:\n{list(cust_df.columns)}\n")
print(f"Transactions columns:\n{list(trans_df.columns)}")

Customer columns:
['customer_id', 'dob', 'state', 'start_balance', 'creation_date']

Transactions columns:
['date', 'account_id', 'customer_id', 'amount', 'transaction_date', 'deposit', 'withdrawal']


##### Checking if amount is equal to the sum of deposit and withdrawal

In [139]:
trans_df['amount'].equals(trans_df["deposit"] + trans_df["withdrawal"])

False

In [171]:
#Since it doesn't match, I am going to copy the table and see if I can find anything interesting about it
ttemp_df = trans_df.copy()

In [172]:
ttemp_df["transaction_sum"] = ttemp_df["withdrawal"] + ttemp_df["deposit"]
ttemp_df["transaction_diff"] = ttemp_df["amount"] - ttemp_df["transaction_sum"]
filter1 = ttemp_df[ttemp_df["transaction_diff"] != 0]

In [173]:
# 18 rows have aounts that don't equal the sum of the withdrawal and deposit
filter1.count()

date                18
account_id          18
customer_id         18
amount              18
transaction_date    18
deposit             18
withdrawal          18
transaction_sum     18
transaction_diff    18
dtype: int64

In [174]:
# The rows with a difference between the amount and transaction sum, 
# Have amounts that are significantly different from the transaction sum
display(filter1[['customer_id', 'transaction_date', 'amount', 'transaction_sum', 'transaction_diff']])

Unnamed: 0,customer_id,transaction_date,amount,transaction_sum,transaction_diff
556196,9822,2007-08-31,-10000000000.0,1703.45,-10000000000.0
556197,9822,2007-08-12,-10000000000.0,-73.91,-10000000000.0
556198,9822,2007-08-27,-10000000000.0,-898.84,-9999999000.0
556199,9822,2007-08-29,-10000000000.0,-102.68,-10000000000.0
556200,9822,2007-08-10,-10000000000.0,-291.17,-10000000000.0
556201,9822,2007-08-17,-10000000000.0,-463.53,-10000000000.0
556202,9822,2007-08-12,-10000000000.0,-749.91,-9999999000.0
1419211,30441,2011-10-31,-10000000000.0,1577.41,-10000000000.0
1419212,30441,2011-10-24,-10000000000.0,-1005.71,-9999999000.0
1419213,30441,2011-10-04,-10000000000.0,-685.87,-9999999000.0


In [175]:
# Option 1 is to adjust amount so that it matches the transaction sum
ttemp_df["amount"] = np.where(ttemp_df["transaction_diff"] != 0, ttemp_df["transaction_sum"], ttemp_df["amount"])

In [177]:
ttemp_df["transaction_diff"] = ttemp_df["amount"] - ttemp_df["transaction_sum"]
filter1 = ttemp_df[ttemp_df["transaction_diff"] != 0]
filter1.count()

date                0
account_id          0
customer_id         0
amount              0
transaction_date    0
deposit             0
withdrawal          0
transaction_sum     0
transaction_diff    0
dtype: int64

In [None]:
# The other option would be be to change the withdrawal and deposits to match

Fixing the amount

making it equal the sum of withdrawal and deposit

In [178]:
trans_df["amount"] = np.where(trans_df["amount"] != trans_df["withdrawal"] + trans_df["deposit"], trans_df["withdrawal"] + trans_df["deposit"], trans_df["amount"])

In [179]:
trans_df['amount'].equals(trans_df["deposit"] + trans_df["withdrawal"])


True

#### Adding an account_total column: 

By grouping by customer_id and sorting by transaction_date

In [181]:
trans_df['account_total'] = trans_df.sort_values(['customer_id', 'transaction_date'], ascending=True).groupby(['customer_id', 'account_id'])['amount'].cumsum()

#### Casting to date type

In [182]:
trans_df['date'] = pd.to_datetime(trans_df['date'])
trans_df['transaction_date'] = pd.to_datetime(trans_df['transaction_date'])

#### Testing Merge options

In [183]:
month_trans = trans_df.groupby(['customer_id', trans_df.transaction_date.dt.to_period("m")])["amount"].sum()

In [184]:
merge_df = pd.merge(trans_df[['account_id', 'customer_id', 'transaction_date', 'amount', 'account_total']], cust_df, on = 'customer_id', how = 'left')

In [185]:
display(merge_df.head())


Unnamed: 0,account_id,customer_id,transaction_date,amount,account_total,dob,state,start_balance,creation_date
0,24137947,91,2007-01-31,3034.26,-2260.92,1993-07-01,California,10180.56,2007-01-31
1,24137947,91,2007-01-16,-5295.18,-5295.18,1993-07-01,California,10180.56,2007-01-31
2,24137947,91,2007-02-28,0.0,-2260.92,1993-07-01,California,10180.56,2007-01-31
3,24137947,91,2007-03-30,-0.0,-2260.92,1993-07-01,California,10180.56,2007-01-31
4,24137947,91,2007-03-11,-0.0,-2260.92,1993-07-01,California,10180.56,2007-01-31


In [186]:
sum_df = merge_df[['amount', 'start_balance']].sum(axis=1)

#### Creating temp table that is grouped by month

In [187]:
# temp_df = trans_df.groupby(['customer_id', 'account_id',
#                            trans_df.transaction_date.dt.to_period("M")], as_index = False).agg({'amount': sum, 'deposit': sum, 'withdrawal': sum})

# Use temp table below
temp_df = trans_df.groupby(['customer_id', 'account_id',
                            pd.Grouper(key = 'transaction_date', freq = "M")], 
                            as_index= False).agg({'amount': sum, 
                                                  'deposit': sum, 
                                                  'withdrawal': sum})

In [188]:
print(temp_df)

         customer_id  account_id transaction_date   amount  deposit  \
0                 91    24137947       2007-01-31 -2260.92  3034.26   
1                 91    24137947       2007-02-28     0.00     0.00   
2                 91    24137947       2007-03-31     0.00     0.00   
3                 92    24137948       2007-01-31     0.00     0.00   
4                 92    24137948       2007-02-28  1164.90  1164.90   
...              ...         ...              ...      ...      ...   
2129116       116102    24253958       2020-05-31  -350.95   872.99   
2129117       116103    24253959       2020-05-31 -1003.22  3354.40   
2129118       116104    24253960       2020-05-31 -1102.10  2900.20   
2129119       116105    24253961       2020-05-31  2246.93  2246.93   
2129120       116106    24253962       2020-05-31    56.99    56.99   

         withdrawal  
0          -5295.18  
1              0.00  
2              0.00  
3              0.00  
4              0.00  
...            

In [189]:
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129121 entries, 0 to 2129120
Data columns (total 6 columns):
 #   Column            Dtype         
---  ------            -----         
 0   customer_id       int64         
 1   account_id        int64         
 2   transaction_date  datetime64[ns]
 3   amount            float64       
 4   deposit           float64       
 5   withdrawal        float64       
dtypes: datetime64[ns](1), float64(3), int64(2)
memory usage: 97.5 MB


#### Merging data to look at each customer by month

In [190]:
tmer_df = pd.merge(cust_df, temp_df, on = 'customer_id', how = 'left')

In [191]:
tmer_df['transaction_total'] = tmer_df.sort_values(['customer_id', 'transaction_date'], ascending=True).groupby(['customer_id', 'account_id'])['amount'].cumsum()

In [192]:
tmer_df['account_total'] = tmer_df['transaction_total'] + tmer_df['start_balance']

In [193]:
tmer_df['account_total']

0           7919.64
1           7919.64
2           7919.64
3           4757.68
4           5922.58
             ...   
2129116     2760.01
2129117    11060.70
2129118     9909.26
2129119    11108.01
2129120      258.31
Name: account_total, Length: 2129121, dtype: float64

##### Creating a copy of the temp merge df to clean up a bit

In [194]:
monthly_look_df = tmer_df.copy()

In [195]:
monthly_look_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129121 entries, 0 to 2129120
Data columns (total 12 columns):
 #   Column             Dtype         
---  ------             -----         
 0   customer_id        int64         
 1   dob                object        
 2   state              object        
 3   start_balance      float64       
 4   creation_date      object        
 5   account_id         int64         
 6   transaction_date   datetime64[ns]
 7   amount             float64       
 8   deposit            float64       
 9   withdrawal         float64       
 10  transaction_total  float64       
 11  account_total      float64       
dtypes: datetime64[ns](1), float64(6), int64(2), object(3)
memory usage: 194.9+ MB


##### Cleaning up this temp table 

so it just tells us about the account balance and transactions

In [196]:
monthly_look_df.drop(columns= ['dob', 'state', 'withdrawal', 'deposit'], inplace= True)
monthly_look_df["transaction_date"] = monthly_look_df.transaction_date.dt.to_period("M")
monthly_look_df = monthly_look_df.rename(columns={'transaction_date': "transaction_month", "amount": "transaction_amount"})

In [197]:
monthly_look_df.head()

Unnamed: 0,customer_id,start_balance,creation_date,account_id,transaction_month,transaction_amount,transaction_total,account_total
0,91,10180.56,2007-01-31,24137947,2007-01,-2260.92,-2260.92,7919.64
1,91,10180.56,2007-01-31,24137947,2007-02,0.0,-2260.92,7919.64
2,91,10180.56,2007-01-31,24137947,2007-03,0.0,-2260.92,7919.64
3,92,4757.68,2007-01-31,24137948,2007-01,0.0,0.0,4757.68
4,92,4757.68,2007-01-31,24137948,2007-02,1164.9,1164.9,5922.58
