In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load the customer and transaction datasets into DataFrames.
# Forward slashes are used so the relative paths resolve on Windows, macOS and
# Linux alike (a backslash is an ordinary filename character on POSIX systems).
customer_file = '../data/customers_tm1_e.csv'
transaction_file = '../data/transactions_tm1_e.csv'
cust_df = pd.read_csv(customer_file)
trans_df = pd.read_csv(transaction_file)

In [5]:
# Preview the first rows of the customer table.
cust_df.head()

Unnamed: 0,customer_id,dob,state,start_balance,creation_date
0,91,1993-07-01,California,10180.56,2007-01-31
1,92,1985-12-05,New York,4757.68,2007-01-31
2,93,1987-11-19,Minnesota,6796.72,2007-01-31
3,94,1981-03-23,Minnesota,9870.48,2007-01-31
4,95,1970-04-06,California,12500.72,2007-01-31


In [32]:
# Preview the first rows of the transactions table.
trans_df.head()

Unnamed: 0,date,account_id,customer_id,amount,transaction_date,deposit,withdrawal,account_total
0,2007-01-31,24137947,91,3034.26,2007-01-31,3034.26,0.0,-2260.92
1,2007-01-31,24137947,91,-5295.18,2007-01-16,0.0,-5295.18,-5295.18
2,2007-02-28,24137947,91,0.0,2007-02-28,0.0,0.0,-2260.92
3,2007-03-31,24137947,91,-0.0,2007-03-30,0.0,-0.0,-2260.92
4,2007-03-31,24137947,91,-0.0,2007-03-11,0.0,-0.0,-2260.92


In [12]:
# Summarise the customer frame: column dtypes, non-null counts and memory usage.
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116016 entries, 0 to 116015
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   customer_id    116016 non-null  int64  
 1   dob            116016 non-null  object 
 2   state          116016 non-null  object 
 3   start_balance  116016 non-null  float64
 4   creation_date  116016 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 4.4+ MB


In [16]:
# Summarise the transactions frame: column dtypes and memory usage.
trans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4977972 entries, 0 to 4977971
Data columns (total 8 columns):
 #   Column            Dtype         
---  ------            -----         
 0   date              datetime64[ns]
 1   account_id        int64         
 2   customer_id       int64         
 3   amount            float64       
 4   transaction_date  datetime64[ns]
 5   deposit           float64       
 6   withdrawal        float64       
 7   account_total     float64       
dtypes: datetime64[ns](2), float64(4), int64(2)
memory usage: 303.8 MB


In [9]:
# Count the missing values in each column of the transactions frame.
is_missing = trans_df.isnull()

# Summing the boolean mask down the rows gives one count per column.
missing_per_column = is_missing.sum()
print(missing_per_column)

date                0
account_id          0
customer_id         0
amount              0
transaction_date    0
deposit             0
withdrawal          0
dtype: int64


In [11]:
# Replace missing start balances with 0.
# TODO: where the information exists, derive the balance from transaction data instead.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object, triggers
# a pandas warning, and does not update cust_df under Copy-on-Write.
cust_df['start_balance'] = cust_df['start_balance'].fillna(0)

In [33]:
# Print the column names of the customer and transaction frames for reference.
print(f"Customer columns:\n{list(cust_df.columns)}\n")
print(f"Transactions columns:\n{list(trans_df.columns)}")

Customer columns:
['customer_id', 'dob', 'state', 'start_balance', 'creation_date']

Transactions columns:
['date', 'account_id', 'customer_id', 'amount', 'transaction_date', 'deposit', 'withdrawal', 'account_total']


In [14]:
# Running total of `amount` within each (customer_id, account_id) pair, accumulated
# after ordering rows by customer_id and transaction_date. The cumulative sum keeps
# the original row labels, so the assignment aligns with the unsorted rows of trans_df.
trans_df['account_total'] = (
    trans_df
    .sort_values(by=['customer_id', 'transaction_date'])
    .groupby(['customer_id', 'account_id'])['amount']
    .cumsum()
)

In [15]:
# Convert both date columns of the transactions frame to pandas datetimes.
for date_col in ('date', 'transaction_date'):
    trans_df[date_col] = pd.to_datetime(trans_df[date_col])

In [51]:
# Net transaction amount per customer per calendar month.
# Uses the documented upper-case "M" period alias, matching the other monthly
# conversions in this notebook, instead of the lower-case spelling.
month_trans = (
    trans_df
    .groupby(['customer_id', trans_df.transaction_date.dt.to_period("M")])["amount"]
    .sum()
)

In [52]:
# Show the monthly per-customer totals (pandas truncates the long Series when printing).
print(month_trans)

customer_id  transaction_date
91           2007-01            -2260.92
             2007-02                0.00
             2007-03                0.00
92           2007-01                0.00
             2007-02             1164.90
                                  ...   
116102       2020-05             -350.95
116103       2020-05            -1003.22
116104       2020-05            -1102.10
116105       2020-05             2246.93
116106       2020-05               56.99
Name: amount, Length: 2129121, dtype: float64


In [53]:
# Attach each customer's attributes to their transactions.
# A left join keeps every transaction row, matched on customer_id.
merge_df = trans_df[
    ['account_id', 'customer_id', 'transaction_date', 'amount', 'account_total']
].merge(cust_df, how='left', on='customer_id')

In [54]:
# Preview the first rows of the merged transactions + customer frame.
display(merge_df.head())


Unnamed: 0,account_id,customer_id,transaction_date,amount,account_total,dob,state,start_balance,creation_date
0,24137947,91,2007-01-31,3034.26,-2260.92,1993-07-01,California,10180.56,2007-01-31
1,24137947,91,2007-01-16,-5295.18,-5295.18,1993-07-01,California,10180.56,2007-01-31
2,24137947,91,2007-02-28,0.0,-2260.92,1993-07-01,California,10180.56,2007-01-31
3,24137947,91,2007-03-30,-0.0,-2260.92,1993-07-01,California,10180.56,2007-01-31
4,24137947,91,2007-03-11,-0.0,-2260.92,1993-07-01,California,10180.56,2007-01-31


In [55]:
# Row-wise sum of each transaction amount and the customer's start balance.
sum_df = merge_df.loc[:, ['amount', 'start_balance']].sum(axis='columns')

In [56]:
# Preview the first few row-wise sums.
display(sum_df.head())

0    13214.82
1     4885.38
2    10180.56
3    10180.56
4    10180.56
dtype: float64

In [75]:
# Monthly totals of amount, deposit and withdrawal per customer and account,
# keyed by the calendar-month period of transaction_date.
# The aggregations are named with the string "sum" rather than the Python builtin:
# passing the builtin callable to .agg() is deprecated in recent pandas.
# NOTE(review): the following cell reassigns temp_df using pd.Grouper — confirm
# whether this period-based version is still needed.
temp_df = (
    trans_df
    .groupby(
        ['customer_id', 'account_id', trans_df.transaction_date.dt.to_period("M")],
        as_index=False,
    )
    .agg({'amount': 'sum', 'deposit': 'sum', 'withdrawal': 'sum'})
)

In [79]:
# Monthly totals of amount, deposit and withdrawal per customer and account.
# pd.Grouper buckets transaction_date by month, so the key stays a datetime column.
# The aggregations are named with the string "sum" rather than the Python builtin:
# passing the builtin callable to .agg() is deprecated in recent pandas.
# NOTE(review): newer pandas spells the month-end offset alias "ME"; "M" is kept
# here to match the pandas version this notebook was written against.
temp_df = (
    trans_df
    .groupby(
        ['customer_id', 'account_id', pd.Grouper(key='transaction_date', freq="M")],
        as_index=False,
    )
    .agg({'amount': 'sum', 'deposit': 'sum', 'withdrawal': 'sum'})
)

In [80]:
# Show the monthly aggregates (pandas truncates the printed frame).
# NOTE(review): consider `temp_df.head()` for a compact, rich-rendered preview.
print(temp_df)

         customer_id  account_id transaction_date   amount  deposit  \
0                 91    24137947       2007-01-31 -2260.92  3034.26   
1                 91    24137947       2007-02-28     0.00     0.00   
2                 91    24137947       2007-03-31     0.00     0.00   
3                 92    24137948       2007-01-31     0.00     0.00   
4                 92    24137948       2007-02-28  1164.90  1164.90   
...              ...         ...              ...      ...      ...   
2129116       116102    24253958       2020-05-31  -350.95   872.99   
2129117       116103    24253959       2020-05-31 -1003.22  3354.40   
2129118       116104    24253960       2020-05-31 -1102.10  2900.20   
2129119       116105    24253961       2020-05-31  2246.93  2246.93   
2129120       116106    24253962       2020-05-31    56.99    56.99   

         withdrawal  
0          -5295.18  
1              0.00  
2              0.00  
3              0.00  
4              0.00  
...            

In [81]:
# Summarise the monthly aggregate frame: column dtypes and memory usage.
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129121 entries, 0 to 2129120
Data columns (total 6 columns):
 #   Column            Dtype         
---  ------            -----         
 0   customer_id       int64         
 1   account_id        int64         
 2   transaction_date  datetime64[ns]
 3   amount            float64       
 4   deposit           float64       
 5   withdrawal        float64       
dtypes: datetime64[ns](1), float64(3), int64(2)
memory usage: 97.5 MB


In [84]:
# Left-join the monthly aggregates onto the customer table, matched on customer_id,
# so every customer row is kept.
tmer_df = cust_df.merge(temp_df, how='left', on='customer_id')

In [87]:
# Show the merged customer + monthly aggregate frame (pandas truncates the printed frame).
# NOTE(review): consider `tmer_df.head()` for a compact, rich-rendered preview.
print(tmer_df)

         customer_id         dob       state  start_balance creation_date  \
0                 91  1993-07-01  California       10180.56    2007-01-31   
1                 91  1993-07-01  California       10180.56    2007-01-31   
2                 91  1993-07-01  California       10180.56    2007-01-31   
3                 92  1985-12-05    New York        4757.68    2007-01-31   
4                 92  1985-12-05    New York        4757.68    2007-01-31   
...              ...         ...         ...            ...           ...   
2129116       116102  1969-03-02     Georgia        3110.96    2020-05-31   
2129117       116103  1966-10-25    New York       12063.92    2020-05-31   
2129118       116104  1986-04-30    Oklahoma       11011.36    2020-05-31   
2129119       116105  1983-08-25  New Mexico        8861.08    2020-05-31   
2129120       116106  1963-05-06        Ohio         201.32    2020-05-31   

         account_id transaction_date   amount  deposit  withdrawal  \
0    

In [92]:
# Cumulative sum of `amount` within each (customer_id, account_id) pair, taken after
# ordering rows by customer_id and transaction_date. The result keeps the original
# row labels, so it aligns with tmer_df on assignment.
tmer_df['transaction_total'] = (
    tmer_df
    .sort_values(by=['customer_id', 'transaction_date'])
    .groupby(['customer_id', 'account_id'])['amount']
    .cumsum()
)

In [93]:
# Account total = cumulative transactions plus the customer's start balance.
tmer_df['account_total'] = tmer_df['transaction_total'].add(tmer_df['start_balance'])

In [94]:
# Display the computed account totals.
tmer_df['account_total']

0           7919.64
1           7919.64
2           7919.64
3           4757.68
4           5922.58
             ...   
2129116     2760.01
2129117    11060.70
2129118     9909.26
2129119    11108.01
2129120      258.31
Name: account_total, Length: 2129121, dtype: float64

In [108]:
# Work on an independent (deep) copy so later edits do not touch tmer_df.
monthly_look_df = tmer_df.copy(deep=True)

In [107]:
# Summarise the monthly view: column dtypes and memory usage.
monthly_look_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129121 entries, 0 to 2129120
Data columns (total 8 columns):
 #   Column             Dtype    
---  ------             -----    
 0   customer_id        int64    
 1   start_balance      float64  
 2   creation_date      object   
 3   account_id         int64    
 4   transaction_date   period[M]
 5   amount             float64  
 6   account_total      float64  
 7   transaction_total  float64  
dtypes: float64(4), int64(2), object(1), period[M](1)
memory usage: 130.0+ MB


In [109]:
# Build the monthly view from tmer_df in a single chain so this cell can be re-run
# safely. The previous in-place drop followed by a rename failed on a second run,
# because the dropped and renamed columns no longer existed.
monthly_look_df = (
    tmer_df
    # Keep only the columns needed for the monthly balance view.
    .drop(columns=['dob', 'state', 'withdrawal', 'deposit'])
    # Collapse the timestamp to its calendar-month period.
    .assign(transaction_date=lambda frame: frame['transaction_date'].dt.to_period("M"))
    .rename(columns={'transaction_date': "transaction_month", "amount": "transaction_amount"})
)

In [110]:
# Preview the first rows of the monthly view.
monthly_look_df.head()

Unnamed: 0,customer_id,start_balance,creation_date,account_id,transaction_month,transaction_amount,account_total,transaction_total
0,91,10180.56,2007-01-31,24137947,2007-01,-2260.92,7919.64,-2260.92
1,91,10180.56,2007-01-31,24137947,2007-02,0.0,7919.64,-2260.92
2,91,10180.56,2007-01-31,24137947,2007-03,0.0,7919.64,-2260.92
3,92,4757.68,2007-01-31,24137948,2007-01,0.0,4757.68,0.0
4,92,4757.68,2007-01-31,24137948,2007-02,1164.9,5922.58,1164.9
