# Analysis of historical merchant transactions

_Note: before committing any changes to this document, please strip all output from the notebook (Cell > All Output > Clear, or set up [nbstripout](https://github.com/kynan/nbstripout) as a git filter). Thanks!_


## Import Libraries

Next we import the Python libraries we'll need. If any of these are missing for you, you can install them with e.g. `pip3 install pandas` on the command line.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Load Data

Load the historical transactions into a pandas data frame and look at its structure.

The first thing we'll do with the training data is split it into a training set and a validation set. (The given test set is what we'll later make our predictions on and upload, but only after we are fully satisfied with our model.)

In [None]:
# Read the historical transactions CSV; purchase_date is parsed into datetimes on load.
hist_trans_df = pd.read_csv(
    'data/unzipped/historical_transactions.csv',
    parse_dates=['purchase_date'],
)

In [None]:
# Preview the first five rows of the loaded frame.
hist_trans_df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
np.shape(hist_trans_df)

## Data analysis

Here we want to see whether there is a way to get rid of outliers and possibly group the data into something meaningful.

In [None]:
# How many distinct purchase amounts occur (missing values are not counted).
hist_trans_df.loc[:, 'purchase_amount'].nunique(dropna=True)

In [None]:
# DataFrame.count() yields one non-null count per column (a Series),
# not a single row total, despite the variable's name.
numOfRows = hist_trans_df.count(axis='index')
print(numOfRows)

In [None]:
# Smallest purchase amount in the data set.
hist_trans_df['purchase_amount'].min()

In [None]:
# Largest purchase amount in the data set.
hist_trans_df['purchase_amount'].max()

In [None]:
# Population variance of the purchase amounts.
# ddof=0 is spelled out because Series.var() defaults to the sample variance (ddof=1).
hist_trans_df['purchase_amount'].var(ddof=0)

In [None]:
# TODO: add a new feature based on purchase_amount ranges, then plot that feature.
# Disabled draft of a histogram of the raw amounts, kept for reference:
#plt.hist(hist_trans_df['purchase_amount'], bins="scott")
#plt.xlabel('purchase amount')
#plt.ylabel('number of occurrences')

In [None]:
# Number of non-null purchase amounts.
hist_trans_df['purchase_amount'].count()

In [None]:
# Frequency of each distinct purchase amount, most common first.
valueCounts = hist_trans_df['purchase_amount'].value_counts(sort=True, ascending=False)

In [None]:
# The five most frequent purchase amounts.
valueCounts.head(n=5)

In [None]:
# Show every transaction whose purchase_amount also appears on at least one other row.
# Series.duplicated(keep=False) flags all rows of a repeated value in one vectorised
# pass. The previous groupby('purchase_amount').pid.transform(len) called Python's
# len once per distinct amount and required a `pid` column that nothing else in the
# visible part of this notebook references.
# NOTE(review): rows with a missing purchase_amount are treated as sharing one value
# here, so they are shown when more than one exists - confirm that is acceptable.
hist_trans_df[hist_trans_df['purchase_amount'].duplicated(keep=False)]

In [None]:
# Number of non-null purchase amounts (same figure as computed in an earlier cell).
hist_trans_df['purchase_amount'].count()

In [None]:
# Transactions with purchase_amount in [0, 1).
(hist_trans_df['purchase_amount'].ge(0) & hist_trans_df['purchase_amount'].lt(1)).sum()

In [None]:
# Transactions with purchase_amount in [-1, 0).
(hist_trans_df['purchase_amount'].ge(-1) & hist_trans_df['purchase_amount'].lt(0)).sum()

In [None]:
# Transactions with purchase_amount in [-1, -0.5).
(hist_trans_df['purchase_amount'].ge(-1) & hist_trans_df['purchase_amount'].lt(-0.5)).sum()

In [None]:
# Transactions with purchase_amount in [-0.6, -0.5).
(hist_trans_df['purchase_amount'].ge(-0.6) & hist_trans_df['purchase_amount'].lt(-0.5)).sum()

In [None]:
# Transactions with purchase_amount in [-0.7, -0.6).
(hist_trans_df['purchase_amount'].ge(-0.7) & hist_trans_df['purchase_amount'].lt(-0.6)).sum()

In [None]:
# Transactions with purchase_amount in [-0.8, -0.7).
(hist_trans_df['purchase_amount'].ge(-0.8) & hist_trans_df['purchase_amount'].lt(-0.7)).sum()

In [None]:
# Transactions with purchase_amount in [-0.9, -0.8).
(hist_trans_df['purchase_amount'].ge(-0.9) & hist_trans_df['purchase_amount'].lt(-0.8)).sum()

In [None]:
# Transactions with purchase_amount in [-1, -0.9).
(hist_trans_df['purchase_amount'].ge(-1) & hist_trans_df['purchase_amount'].lt(-0.9)).sum()

In [None]:
# Transactions with purchase_amount in [-0.7, -0.6).
# NOTE: this is the same range that an earlier cell already counts.
(hist_trans_df['purchase_amount'].ge(-0.7) & hist_trans_df['purchase_amount'].lt(-0.6)).sum()

In [None]:
# Transactions with purchase_amount strictly below -1.
hist_trans_df['purchase_amount'].lt(-1).sum()

In [None]:
# Transactions with purchase_amount strictly above 1
# (an amount of exactly 1 is not included in this count).
hist_trans_df['purchase_amount'].gt(1).sum()

In [None]:
# Bucket purchase_amount into a categorical 'amount_class' feature:
#   'LO'         : purchase_amount <  -0.80
#   '7J' .. '7A' : ten 0.01-wide bins covering [-0.80, -0.70)
#   '6J' .. '6A' : ten 0.01-wide bins covering [-0.70, -0.60)
#   '5J' .. '5A' : ten 0.01-wide bins covering [-0.60, -0.50)
#   'HO'         : purchase_amount >= -0.50
# Every bin is closed on the left and open on the right, hence right=False.
# Rows with a missing purchase_amount get a missing amount_class.
#
# A single pd.cut replaces the previous chain of np.where calls, which could not
# run: np.where was given a condition and only one value (it needs both or
# neither), `&` was evaluated before the `>=` / `<` comparisons, and every
# statement overwrote the column written by the one before it.

# Edges -0.80, -0.79, ..., -0.50 are built from integers so they equal the
# literal bounds exactly; the infinities catch everything outside that range.
amount_bin_edges = np.concatenate(([-np.inf], np.arange(-80, -49) / 100, [np.inf]))

# Labels in ascending order of amount: within each decade the letter runs J -> A.
amount_bin_labels = (
    ['LO']
    + [digit + letter for digit in '765' for letter in 'JIHGFEDCBA']
    + ['HO']
)

hist_trans_df['amount_class'] = pd.cut(
    hist_trans_df['purchase_amount'],
    bins=amount_bin_edges,
    labels=amount_bin_labels,
    right=False,
)
