# CP3403 Data Mining
## Report: Credit Card Fraud

### Group: Matthew Marsh, Dannielle Jones and Callum Gracie

This data mining explores: A time series to see if there is a relationship between city population to fraud cases over time.

# Import Packages and Get Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#from datetime import datetime, date
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, pacf

In [2]:
data = pd.read_csv('data/fraudTrain.csv')  # Read
#pd.set_option('display.float_format', lambda x:'%f'%x)  # Format

In [3]:
#print('Number of Records: {}'.format(len(data)))
print('Dataset rows: {} columns: {}'.format(data.shape[0], data.shape[1]))
data.head()

Dataset rows: 1296675 columns: 23


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


# Pre-Processing: NaN Data and Missing Data

In [4]:
# Check dataset for missing or NaN values
print('Dataset rows: {} columns: {}'.format(data.shape[0], data.shape[1]))
missing_values_count = data.isna().sum()
print(missing_values_count)

Dataset rows: 1296675 columns: 23
Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


# Pre-Processing: Convert and Format Data

In [5]:
# View the unnamed column, then get and rename
print(data.iloc[:,0])
data = data.rename(columns={data.columns[0]: "column_id"})
data.set_index('column_id', inplace=True)
data.head()

0                0
1                1
2                2
3                3
4                4
            ...   
1296670    1296670
1296671    1296671
1296672    1296672
1296673    1296673
1296674    1296674
Name: Unnamed: 0, Length: 1296675, dtype: int64


Unnamed: 0_level_0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
column_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [6]:
# Convert to numeric
data['amt'] = pd.to_numeric(data['amt'], errors='coerce')
data['zip'] = pd.to_numeric(data['zip'], errors='coerce')
data['lat'] = pd.to_numeric(data['lat'], errors='coerce')
data['long'] = pd.to_numeric(data['long'], errors='coerce')
data['city_pop'] = pd.to_numeric(data['city_pop'], errors='coerce')
data['merch_lat'] = pd.to_numeric(data['merch_long'], errors='coerce')
data['is_fraud'] = pd.to_numeric(data['is_fraud'], errors='coerce')

In [7]:
# Processing date of birth
data['dob'] = pd.to_datetime(data['dob'])  # convert to datetime object
data['year_of_birth'] = data['dob'].dt.year  # extract year
data['month_of_birth'] = data['dob'].dt.month  # extract month
data['day_of_birth'] = data['dob'].dt.day  # extract day
data.head()

Unnamed: 0_level_0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,year_of_birth,month_of_birth,day_of_birth
column_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,-82.048315,-82.048315,0,1988,3,9
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,-118.186462,-118.186462,0,1978,6,21
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,-112.154481,-112.154481,0,1962,1,19
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,-112.561071,-112.561071,0,1967,1,12
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,-78.632459,-78.632459,0,1986,3,28


In [8]:
# Processing transaction date and time
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])  # convert to datetime object
data['year_of_trans'] = data['trans_date_trans_time'].dt.year  # extract year
data['month_of_trans'] = data['trans_date_trans_time'].dt.month  # extract month
data['day_of_trans'] = data['trans_date_trans_time'].dt.day  # extract day
data['time_of_trans'] = data['trans_date_trans_time'].dt.time  # extract time
data.head()

Unnamed: 0_level_0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,merch_lat,merch_long,is_fraud,year_of_birth,month_of_birth,day_of_birth,year_of_trans,month_of_trans,day_of_trans,time_of_trans
column_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,-82.048315,-82.048315,0,1988,3,9,2019,1,1,00:00:18
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,-118.186462,-118.186462,0,1978,6,21,2019,1,1,00:00:44
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,-112.154481,-112.154481,0,1962,1,19,2019,1,1,00:00:51
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,-112.561071,-112.561071,0,1967,1,12,2019,1,1,00:01:16
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,-78.632459,-78.632459,0,1986,3,28,2019,1,1,00:03:06


In [9]:
# Processing Gender into binary
gender_count = data['gender'].value_counts()
data['is_female'] = data['gender'].apply(lambda x: 1 if x.upper() == "F" else 0)
is_female_count = data['is_female'].value_counts()
print("Gender Count: \n{}".format(gender_count))
print("is_female Count: \n{}".format(is_female_count))
data.head()

Gender Count: 
F    709863
M    586812
Name: gender, dtype: int64
is_female Count: 
1    709863
0    586812
Name: is_female, dtype: int64


Unnamed: 0_level_0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,merch_long,is_fraud,year_of_birth,month_of_birth,day_of_birth,year_of_trans,month_of_trans,day_of_trans,time_of_trans,is_female
column_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,-82.048315,0,1988,3,9,2019,1,1,00:00:18,1
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,-118.186462,0,1978,6,21,2019,1,1,00:00:44,1
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,-112.154481,0,1962,1,19,2019,1,1,00:00:51,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,-112.561071,0,1967,1,12,2019,1,1,00:01:16,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,-78.632459,0,1986,3,28,2019,1,1,00:03:06,0


In [None]:
sub1 = data.copy()

# Pre-Processing: Create Data Sub-Set

In [10]:
# Reduce sample size for computation: Get random sub-sample
np.random.seed(42)
sub_fraction = 0.01

random_fraction_sub = sub1.sample(frac=sub_fraction, random_state=42)
print(f"Current size of data: {len(sub1)} \n")
print(f"Records Count: {len(random_fraction_sub)}")

sub1 = random_fraction_sub
print('Subset rows: {} columns: {}'.format(sub1.shape[0], sub1.shape[1]))
sub1.head()

Subset rows: 20063 columns: 30


Unnamed: 0_level_0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,merch_long,is_fraud,year_of_birth,month_of_birth,day_of_birth,year_of_trans,month_of_trans,day_of_trans,time_of_trans,is_female
column_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
161,2019-01-01 01:59:23,4646845581490336108,"fraud_Monahan, Bogisich and Ledner",misc_pos,10.26,Julia,Bell,F,576 House Crossroad,West Sayville,...,-73.523504,0,1990,6,25,2019,1,1,01:59:23,1
293,2019-01-01 03:49:56,4646845581490336108,fraud_Block Group,misc_pos,1.1,Julia,Bell,F,576 House Crossroad,West Sayville,...,-72.974367,0,1990,6,25,2019,1,1,03:49:56,1
360,2019-01-01 04:42:23,38588538868506,"fraud_Schaefer, Maggio and Daugherty",gas_transport,60.2,Jacqueline,Curry,F,3047 Jeff Place,Marathon,...,-103.852785,0,1990,11,23,2019,1,1,04:42:23,1
381,2019-01-01 05:01:34,4939976756738216,"fraud_Bernier, Streich and Jewess",grocery_net,49.54,Michelle,Johnston,F,3531 Hamilton Highway,Roma,...,-99.814054,0,1990,11,7,2019,1,1,05:01:34,1
404,2019-01-01 05:19:42,4464457352619,fraud_Vandervort-Funk,grocery_pos,124.33,Breanna,Rodriguez,F,118 Cabrera Springs Apt. 105,Lanark Village,...,-83.837856,0,1990,1,24,2019,1,1,05:19:42,1


In [11]:
# Check how many cases are fraud
is_fraud_count = sub1[(sub1['is_fraud'] == 1)]
print('Fraud count: {}'.format(len(is_fraud_count)))
is_not_fraud_count = sub1[(sub1['is_fraud'] != 1)]
print('Non-Fraud count: {}'.format(len(is_not_fraud_count)))
is_fraud_count.head()

Fraud count: 88


Unnamed: 0_level_0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,merch_long,is_fraud,year_of_birth,month_of_birth,day_of_birth,year_of_trans,month_of_trans,day_of_trans,time_of_trans,is_female
column_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12246,2019-01-08 01:50:29,4859525594182537,fraud_Howe Ltd,misc_pos,664.73,Rebecca,Farley,F,383 Long Islands,Downsville,...,-75.954718,1,1990,2,25,2019,1,8,01:50:29,1
12329,2019-01-08 03:00:11,4859525594182537,fraud_Cormier LLC,shopping_net,1127.56,Rebecca,Farley,F,383 Long Islands,Downsville,...,-75.497389,1,1990,2,25,2019,1,8,03:00:11,1
12386,2019-01-08 03:37:18,4859525594182537,fraud_Durgan-Auer,misc_net,722.92,Rebecca,Farley,F,383 Long Islands,Downsville,...,-75.411206,1,1990,2,25,2019,1,8,03:37:18,1
12448,2019-01-08 04:26:13,4859525594182537,fraud_Murray-Smitham,grocery_pos,333.09,Rebecca,Farley,F,383 Long Islands,Downsville,...,-75.716922,1,1990,2,25,2019,1,8,04:26:13,1
13481,2019-01-08 16:07:05,4859525594182537,fraud_Douglas-White,entertainment,564.57,Rebecca,Farley,F,383 Long Islands,Downsville,...,-74.85479,1,1990,2,25,2019,1,8,16:07:05,1


In [12]:
sub2 = sub1.copy()

# Data Mining Technique/Method: Time Series
## Visualisation: Pre-Processing

In [None]:
# Set date/time as index and sub-sample relevant columns
sub2['trans_date'] = sub2['trans_date_trans_time'].dt.date  # extract date
sub2.set_index('trans_date', inplace=True)
sub2 = [['trans_date', 'city_pop', 'is_fraud']]

In [None]:
sub2.head(2)  # View start date

In [None]:
sub2.tail(2)  # View end date

In [None]:
# Check city population range
print(sub2['city_pop'].describe())

In [None]:
# Slice specific time frame
date_range_from = '1990-01-01'
date_range_to = '2020-01-01'
select_year = '1972'

transaction_all_data = sub2.copy()  # All dates
transaction_all_data['Year'] = transaction_all_data.index.year
transaction_all_data.head()

In [None]:
transaction_range_data = sub2[date_range_from:date_range_to]  # Range of date
transaction_range_data['Year'] = transaction_range_data.index.year
transaction_range_data.head()

In [None]:
transaction_year_data = sub2.loc[select_year]  # Specific year
transaction_year_data['Month'] = transaction_year_data.index.month  # Specific year indexed by month
transaction_year_data.head()

## Visualisation: Plots/Graphs

In [None]:
%matplotlib inline

# Plot of transaction data - all
plt.title("Time Series Plot Transactions to City Population")
plt.plot(transaction_all_data)

In [None]:
plt.title("Box Plot Transactions to City Population")
ax = sns.boxplot(data=transaction_all_data, x='Year', y='city_pop')

In [None]:
# Plot of transaction data - date range
plt.title(f"Time Series Plot Transactions to City Population Between {date_range_from} to {date_range_to}")
plt.plot(transaction_year_data)

In [None]:
plt.title("Box Plot Transactions to City Population")
ax3 = sns.boxplot(data=transaction_range_data, x='Year', y='city_pop')

In [None]:
# Plot of transaction data - year
plt.title(f"Time Series Plot Transactions to City Population in Year {select_year}")
plt.plot(transaction_year_data)

In [None]:
plt.title("Box Plot Transactions to City Population")
ax2 = sns.boxplot(data=transaction_year_data, x='Month', y='city_pop')

### Stationary Check

In [None]:
# Perform Stationary Test
def test_stationary(timeseries, number_of_months):

    #Determing rolling statistics
    rolling_mean = timeseries.rolling(window=number_of_months).mean()
    rolling_std = timeseries.rolling(window=number_of_months).std()

    #Plot rolling statistics:
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(rolling_mean, color='red', label='Rolling Mean')
    std = plt.plot(rolling_std, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

In [None]:
print("Stationary Test - ALl Data")
test_stationary(transaction_all_data['city_pop'], 12)
print()

print(f"Stationary Test - Range {date_range_from} to {date_range_to}")
test_stationary(transaction_range_data['city_pop'], 12)
print()

print(f"Stationary Test - Year {select_year}")
test_stationary(transaction_year_data['city_pop'], 12)
print()

In [None]:
# Perform Dickey-Fuller Test
def test_dickey_fuller(timeseries):
    print('Results of Dickey-Fuller Test:')
    df_test = adfuller(timeseries, autolag='AIC')

    df_output = pd.Series(df_test[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in df_test[4].items():
        df_output['Critical Value (%s)'%key] = value
    print (df_output)

In [None]:
print("Dickey Fuller Test - ALl Data")
test_dickey_fuller(transaction_all_data['city_pop'])
print()

print(f"Dickey Fuller Test - Range {date_range_from} to {date_range_to}")
test_dickey_fuller(transaction_range_data['city_pop'])
print()

print(f"Dickey Fuller Test - Year {select_year}")
test_dickey_fuller(transaction_year_data['city_pop'])
print()

#### Conclusions:


### Auto-correlation

In [None]:
# Perform auto-correlation lag
def auto_correlation_lag(dataframe, column_name, number_of_months):
    """
    Perform auto correlation lag
    :param dataframe:
    :param column_name:
    :param number_of_months:
    :return: auto lag correlation by number of months
    """
    return dataframe[column_name].autocorr(lag=number_of_months)

In [None]:
lag_month_number = 1

print(f"All Data - {lag_month_number} Month Lag: {auto_correlation_lag(transaction_all_data, 'city_pop', lag_month_number)}")
print(f"Range {date_range_from} to {date_range_to} Data - {lag_month_number} Month Lag: {auto_correlation_lag(transaction_all_data, 'city_pop', lag_month_number)}")
print(f"Year {select_year } Data - {lag_month_number} Month Lag: {auto_correlation_lag(transaction_all_data, 'city_pop', lag_month_number)}")

### Decomposition

In [None]:
# Get log
def get_log(dataframe, column_name):
    """
    Find log by dataframe and column name
    :param dataframe:
    :param column_name:
    :return: log
    """
    return np.log(dataframe[column_name])

In [None]:
ts_log_all = get_log(transaction_all_data, 'city_pop')
ts_log_range = get_log(transaction_range_data, 'city_pop')
ts_log_year = get_log(transaction_year_data, 'city_pop')

In [None]:
plt.plot(ts_log_all)

In [None]:
plt.plot(ts_log_range)

In [None]:
plt.plot(ts_log_year)

In [None]:
decompose_all = seasonal_decompose(transaction_all_data['city_pop'], model='additive', period=12)
decompose_all.plot()
plt.title("Decompose of Transactions to City Population")
plt.show()

In [None]:
decompose_range = seasonal_decompose(transaction_range_data['city_pop'], model='additive', period=12)
decompose_range.plot()
plt.title(f"Decompose of Transactions to City Population Between {date_range_from} to {date_range_to}")
plt.show()

In [None]:
decompose_year = seasonal_decompose(transaction_year_data['city_pop'], model='additive', period=12)
decompose_year.plot()
plt.title(f"Decompose of Transactions to City Population in Year {select_year}")
plt.show()

### Forecasting

In [None]:
# Split into train and test data
train_all = transaction_all_data[transaction_all_data['Year'] < pd.to_datetime('1990-01', format='%Y-%m')]
train_all['train'] = train_all['city_pop']
train_all.head()

## Visualisation: Results and Data