In [17]:
import warnings
warnings.filterwarnings('ignore')

In [32]:
#import useful libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
from random import randint

In [18]:
data = pd.read_csv('bank_transactions.csv', delimiter=',')

In [3]:
data.head()

Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,T1,C5841053,10/1/94,F,JAMSHEDPUR,17819.05,2/8/16,143207,25.0
1,T2,C2142763,4/4/57,M,JHAJJAR,2270.69,2/8/16,141858,27999.0
2,T3,C4417068,26/11/96,F,MUMBAI,17874.44,2/8/16,142712,459.0
3,T4,C5342380,14/9/73,F,MUMBAI,866503.21,2/8/16,142714,2060.0
4,T5,C9031234,24/3/88,F,NAVI MUMBAI,6714.43,2/8/16,181156,1762.5


# Finding Similar Customers

## 1.1 Set up the date

clean the dataset

In [19]:
#check for any NA values
any(data.isna())

True

In [20]:
data.dropna(inplace=True)

convert string to datetime

In [21]:
data.CustomerDOB = pd.to_datetime(data.CustomerDOB)
data.TransactionDate = pd.to_datetime(data.TransactionDate)

data.drop(data[data.CustomerDOB.dt.year > 1998].index, axis = 0, inplace = True)

data.drop(data[data.CustomerDOB == 1800].index, axis = 0, inplace=True)

data = data.reset_index()

In [6]:
data

Unnamed: 0,index,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,0,T1,C5841053,1994-10-01,F,JAMSHEDPUR,17819.05,2016-02-08,143207,25.0
1,2,T3,C4417068,1996-11-26,F,MUMBAI,17874.44,2016-02-08,142712,459.0
2,3,T4,C5342380,1973-09-14,F,MUMBAI,866503.21,2016-02-08,142714,2060.0
3,4,T5,C9031234,1988-03-24,F,NAVI MUMBAI,6714.43,2016-02-08,181156,1762.5
4,5,T6,C1536588,1972-08-10,F,ITANAGAR,53609.20,2016-02-08,173940,676.0
...,...,...,...,...,...,...,...,...,...,...
965271,1048562,T1048563,C8020229,1990-08-04,M,NEW DELHI,7635.19,2016-09-18,184824,799.0
965272,1048563,T1048564,C6459278,1992-02-20,M,NASHIK,27311.42,2016-09-18,183734,460.0
965273,1048564,T1048565,C6412354,1989-05-18,M,HYDERABAD,221757.06,2016-09-18,183313,770.0
965274,1048565,T1048566,C6420483,1978-08-30,M,VISAKHAPATNAM,10117.87,2016-09-18,184706,1000.0


We are going to keep only the rows We are going to use later in the query. Thus, we are going to get rid of *TransactionID* and *CustomerID*

In [22]:
data = data[['CustomerDOB','CustGender','CustLocation','CustAccountBalance','TransactionDate','TransactionTime','TransactionAmount (INR)']]

First thing to do is convert our values in int32 dtype in order to apply the hash function

In [24]:
#let's define some functions
def float_to_int(value):
    integer,decimal  = str(value).split('.')
    return int(integer+decimal)

def date_to_int(value):
    return int(pd.Timestamp(value).timestamp())

def string_to_int(value):
    return sum(ord(x) for x in value)

In [25]:
data.CustomerDOB = data['CustomerDOB'].apply(date_to_int)
data.CustGender = data['CustGender'].apply(string_to_int)
data.CustLocation = data['CustLocation'].apply(string_to_int)
data.CustAccountBalance = data['CustAccountBalance'].apply(float_to_int)
data.TransactionDate = data['TransactionDate'].apply(date_to_int)
data['TransactionAmount (INR)'] = data['TransactionAmount (INR)'].apply(float_to_int)


In [26]:
data

Unnamed: 0,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,780969600,70,755,1781905,1454889600,143207,250
1,848966400,70,443,1787444,1454889600,142712,4590
2,116812800,70,443,86650321,1454889600,142714,20600
3,575164800,70,777,671443,1454889600,181156,17625
4,82252800,70,583,536092,1454889600,173940,6760
...,...,...,...,...,...,...,...
965271,649728000,77,624,763519,1474156800,184824,7990
965272,698544000,77,446,2731142,1474156800,183734,4600
965273,611452800,77,644,22175706,1474156800,183313,7700
965274,273283200,77,968,1011787,1474156800,184706,10000


In [27]:
#obtain max value for x
data.max()

CustomerDOB                 913507200
CustGender                         84
CustLocation                     2609
CustAccountBalance         5536968816
TransactionDate            1481241600
TransactionTime                235959
TransactionAmount (INR)     156003499
dtype: int64

In [34]:
#let's get the next prime number bigger than x

from sympy import nextprime

max_value_of_x = nextprime(5536968816)

## 1.2 Fingerprint Hashing

The hash function we want to implement is in the following form: <br>
$\begin{equation} h(x)=(ax+b)\mod{5}\end{equation}$ <br>
where: <br>
- *x*: input value
- *a*, *b*: randomly choosen integers less than the maximum value of *x*
- *c*: the next prime number bigger than the maximum value of *x*

In [29]:
def hash(val, a, b, c):
    return (a*val+b)%c

Now it's time to implement the MinHash

In [40]:
def min_hash(info,c ,list_of_tuples):

    signatures = []

    for idx, x in enumerate(info):
        hash_value = []
        for values in list_of_tuples:
            a,b = values
            ans = hash(x,a,b,c,)
            hash_value.append(ans)
        signatures.append(int(min(hash_value)))
        
    return signatures

In [38]:
def random_coef(n_hash, M):

    a = []
    b = []

    for _ in range(n_hash):
        a_i = randint(0,M)
        a.append(a_i)
        b_i = randint(0,M)
        b.append(b_i)
    return a,b

In [41]:
c = max_value_of_x
M = 2**32 -1
n_hash = 10

a,b = random_coef(n_hash, M)

list_of_tuples = list(
    map(
        lambda x, y: (x, y),
        a,
        b
    )
)

data['min-hash']  = [min_hash([*x[1]],c,list_of_tuples) for x in (data.iterrows())]

In [42]:
data

Unnamed: 0,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR),min-hash
0,780969600,70,755,1781905,1454889600,143207,250,"[86213410, 230466457, 204725688, 37511749, 488..."
1,848966400,70,443,1787444,1454889600,142712,4590,"[105868537, 230466457, 62637605, 12179878, 488..."
2,116812800,70,443,86650321,1454889600,142714,20600,"[1969221609, 230466457, 62637605, 182630827, 4..."
3,575164800,70,777,671443,1454889600,181156,17625,"[1086440428, 230466457, 1281134211, 532037098,..."
4,82252800,70,583,536092,1454889600,173940,6760,"[344004487, 230466457, 72927329, 1782436477, 4..."
...,...,...,...,...,...,...,...,...
965271,649728000,77,624,763519,1474156800,184824,7990,"[182023318, 399589135, 655293547, 594457950, 1..."
965272,698544000,77,446,2731142,1474156800,183734,4600,"[610201319, 399589135, 507016186, 389229926, 1..."
965273,611452800,77,644,22175706,1474156800,183313,7700,"[618434860, 399589135, 7119872, 45664866, 1348..."
965274,273283200,77,968,1011787,1474156800,184706,10000,"[769227346, 399589135, 256471850, 56146163, 13..."


## 1.3 Locality Sensitive Hashing (LSH)