In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
#import useful libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
from random import randint
from datetime import datetime

In [3]:
data = pd.read_csv('bank_transactions.csv', delimiter=',')

In [4]:
data.head()

Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,T1,C5841053,10/1/94,F,JAMSHEDPUR,17819.05,2/8/16,143207,25.0
1,T2,C2142763,4/4/57,M,JHAJJAR,2270.69,2/8/16,141858,27999.0
2,T3,C4417068,26/11/96,F,MUMBAI,17874.44,2/8/16,142712,459.0
3,T4,C5342380,14/9/73,F,MUMBAI,866503.21,2/8/16,142714,2060.0
4,T5,C9031234,24/3/88,F,NAVI MUMBAI,6714.43,2/8/16,181156,1762.5


# Finding Similar Customers

## 1.1 Set up the date

clean the dataset

In [5]:
#check for any NA values
any(data.isna())

True

In [6]:
data.dropna(inplace=True)

convert string to datetime

In [7]:
data.CustomerDOB = pd.to_datetime(data.CustomerDOB)
data.TransactionDate = pd.to_datetime(data.TransactionDate)

data.drop(data[data.CustomerDOB.dt.year > 1998].index, axis = 0, inplace = True)

data.drop(data[data.CustomerDOB == 1800].index, axis = 0, inplace=True)

data = data.reset_index()

In [8]:
data

Unnamed: 0,index,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,0,T1,C5841053,1994-10-01,F,JAMSHEDPUR,17819.05,2016-02-08,143207,25.0
1,2,T3,C4417068,1996-11-26,F,MUMBAI,17874.44,2016-02-08,142712,459.0
2,3,T4,C5342380,1973-09-14,F,MUMBAI,866503.21,2016-02-08,142714,2060.0
3,4,T5,C9031234,1988-03-24,F,NAVI MUMBAI,6714.43,2016-02-08,181156,1762.5
4,5,T6,C1536588,1972-08-10,F,ITANAGAR,53609.20,2016-02-08,173940,676.0
...,...,...,...,...,...,...,...,...,...,...
965271,1048562,T1048563,C8020229,1990-08-04,M,NEW DELHI,7635.19,2016-09-18,184824,799.0
965272,1048563,T1048564,C6459278,1992-02-20,M,NASHIK,27311.42,2016-09-18,183734,460.0
965273,1048564,T1048565,C6412354,1989-05-18,M,HYDERABAD,221757.06,2016-09-18,183313,770.0
965274,1048565,T1048566,C6420483,1978-08-30,M,VISAKHAPATNAM,10117.87,2016-09-18,184706,1000.0


We are going to keep only the rows We are going to use later in the query. Thus, we are going to get rid of *TransactionID* and *CustomerID*

In [9]:
data = data[['CustomerDOB','CustGender','CustLocation','CustAccountBalance','TransactionDate','TransactionTime','TransactionAmount (INR)']]

In [10]:
def DOB_to_age(df):
    now = int(datetime.strftime(datetime.today().date()).split('-')[0])
    age = df.apply(lambda x: now - x)
    return age 


In [11]:
data['CustomerDOB'] = data['CustomerDOB'].apply(lambda x: int(datetime.strftime(x, "%Y-%m-%w").split('-')[0]))

In [12]:
today = int(datetime.strftime(datetime.today().date(), "%Y-%m-%w").split('-')[0])

In [13]:
data['Age'] = data['CustomerDOB'].apply(lambda x: today - x)

In [14]:
data.drop(['CustomerDOB'],axis = 1)

Unnamed: 0,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR),Age
0,F,JAMSHEDPUR,17819.05,2016-02-08,143207,25.0,28
1,F,MUMBAI,17874.44,2016-02-08,142712,459.0,26
2,F,MUMBAI,866503.21,2016-02-08,142714,2060.0,49
3,F,NAVI MUMBAI,6714.43,2016-02-08,181156,1762.5,34
4,F,ITANAGAR,53609.20,2016-02-08,173940,676.0,50
...,...,...,...,...,...,...,...
965271,M,NEW DELHI,7635.19,2016-09-18,184824,799.0,32
965272,M,NASHIK,27311.42,2016-09-18,183734,460.0,30
965273,M,HYDERABAD,221757.06,2016-09-18,183313,770.0,33
965274,M,VISAKHAPATNAM,10117.87,2016-09-18,184706,1000.0,44


55369688.16

First thing to do is convert our values in int32 dtype in order to apply the hash function

In [23]:
#let's define some functions
def float_to_int(value):
    return int(value)

def date_to_int(value):
    return int(pd.Timestamp(value).timestamp())

def string_to_int(value):
    return sum(ord(x) for x in value)

In [24]:
data.CustomerDOB = data['CustomerDOB'].apply(date_to_int)
data.CustGender = data['CustGender'].apply(string_to_int)
data.CustLocation = data['CustLocation'].apply(string_to_int)
data.CustAccountBalance = data['CustAccountBalance'].apply(float_to_int)
data.TransactionDate = data['TransactionDate'].apply(date_to_int)
data['TransactionAmount (INR)'] = data['TransactionAmount (INR)'].apply(float_to_int)


In [26]:
data.drop(['CustomerDOB'], axis = 1)

Unnamed: 0,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR),Age
0,70,755,17819,1454889600,143207,25,28
1,70,443,17874,1454889600,142712,459,26
2,70,443,866503,1454889600,142714,2060,49
3,70,777,6714,1454889600,181156,1762,34
4,70,583,53609,1454889600,173940,676,50
...,...,...,...,...,...,...,...
965271,77,624,7635,1474156800,184824,799,32
965272,77,446,27311,1474156800,183734,460,30
965273,77,644,221757,1474156800,183313,770,33
965274,77,968,10117,1474156800,184706,1000,44


In [13]:
#obtain max value for x
data.max()

CustomerDOB                 913507200
CustGender                         84
CustLocation                     2609
CustAccountBalance         5536968816
TransactionDate            1481241600
TransactionTime                235959
TransactionAmount (INR)     156003499
dtype: int64

In [18]:
#let's get the next prime number bigger than x

from sympy import nextprime

max_value_of_x = nextprime(5536968816)

## 1.2 Fingerprint Hashing

The hash function we want to implement is in the following form: <br>
$\begin{equation} h(x)=(ax+b)\mod{c}\end{equation}$ <br>
where: <br>
- *x*: input value
- *a*, *b*: randomly choosen integers less than the maximum value of *x*
- *c*: the next prime number bigger than the maximum value of *x*

In [14]:
def hash(val, a, b, c):
    return (a*val+b)%c

Now it's time to implement the MinHash

In [15]:
def min_hash(info,c ,list_of_tuples):

    signatures = []

    for idx, x in enumerate(info):
        hash_value = []
        for values in list_of_tuples:
            a,b = values
            ans = hash(x,a,b,c,)
            hash_value.append(ans)
        signatures.append(int(min(hash_value)))
        
    return signatures

In [16]:
def random_coef(n_hash, M):

    a = []
    b = []

    for _ in range(n_hash):
        a_i = randint(0,M)
        a.append(a_i)
        b_i = randint(0,M)
        b.append(b_i)
    return a,b

In [19]:
c = max_value_of_x
M = 2**32 -1
n_hash = 10

a,b = random_coef(n_hash, M)

list_of_tuples = list(
    map(
        lambda x, y: (x, y),
        a,
        b
    )
)

data['min-hash']  = [min_hash([*x[1]],c,list_of_tuples) for x in (data.iterrows())]

In [20]:
data

Unnamed: 0,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR),min-hash
0,780969600,70,755,1781905,1454889600,143207,250,"[349405143, 138083060, 332039731, 113786715, 3..."
1,848966400,70,443,1787444,1454889600,142712,4590,"[134267795, 138083060, 105465929, 1139377540, ..."
2,116812800,70,443,86650321,1454889600,142714,20600,"[366748437, 138083060, 105465929, 212807710, 3..."
3,575164800,70,777,671443,1454889600,181156,17625,"[1310188223, 138083060, 266710262, 153709876, ..."
4,82252800,70,583,536092,1454889600,173940,6760,"[137471242, 138083060, 506315829, 1878982801, ..."
...,...,...,...,...,...,...,...,...
965271,649728000,77,624,763519,1474156800,184824,7990,"[171530567, 421198851, 470886692, 1168301486, ..."
965272,698544000,77,446,2731142,1474156800,183734,4600,"[355583922, 421198851, 34228859, 821493571, 36..."
965273,611452800,77,644,22175706,1474156800,183313,7700,"[2551649, 421198851, 115017877, 399770915, 368..."
965274,273283200,77,968,1011787,1474156800,184706,10000,"[50999260, 421198851, 899894843, 540661260, 36..."


## 1.3 Locality Sensitive Hashing (LSH)

Now let's apply the same idea on the query, and then apply the Jaccard similarity since MinHash is stricly related to it

In [21]:
query = pd.read_csv('query_users.csv')

In [22]:
query.head()

Unnamed: 0,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,27/7/78,M,DELHI,94695.61,2/9/16,140310,65.0
1,6/11/92,M,PANCHKULA,7584.09,2/9/16,120214,6025.0
2,14/8/91,M,PATNA,7180.6,10/8/16,221732,541.5
3,3/1/87,M,CHENNAI,56847.75,29/8/16,144138,1000.0
4,4/1/95,M,GURGAON,84950.13,25/9/16,233309,80.0


pre-processing of the query

In [None]:
#convert
query.CustomerDOB = pd.to_datetime(data.CustomerDOB)
query.TransactionDate = pd.to_datetime(data.TransactionDate)

In [24]:
query.CustomerDOB = query['CustomerDOB'].apply(date_to_int)
query.CustGender = query['CustGender'].apply(string_to_int)
query.CustLocation = query['CustLocation'].apply(string_to_int)
query.CustAccountBalance = query['CustAccountBalance'].apply(float_to_int)
query.TransactionDate = query['TransactionDate'].apply(date_to_int)
query['TransactionAmount (INR)'] = query['TransactionAmount (INR)'].apply(float_to_int)

In [25]:
query.head()

Unnamed: 0,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,0,77,358,9469561,1454976000,140310,650
1,0,77,663,758409,1454976000,120214,60250
2,0,77,372,71806,1475884800,221732,5415
3,0,77,502,5684775,1472428800,144138,10000
4,0,77,531,8495013,1474761600,233309,800


In [26]:
query['min-hash']  = [min_hash([*x[1]],c,list_of_tuples) for x in (query.iterrows())]

In [28]:
query.head()

Unnamed: 0,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR),min-hash
0,0,77,358,9469561,1454976000,140310,650,"[195465954, 390010991, 421929114, 361433883, 1..."
1,0,77,663,758409,1454976000,120214,60250,"[195465954, 390010991, 678690419, 1775105, 128..."
2,0,77,372,71806,1475884800,221732,5415,"[195465954, 390010991, 137301131, 164726101, 6..."
3,0,77,502,5684775,1472428800,144138,10000,"[195465954, 390010991, 1544020780, 108320732, ..."
4,0,77,531,8495013,1474761600,233309,800,"[195465954, 390010991, 963394046, 148081170, 1..."


Now let's compute Jaccard's similarity. <br>
Jaccard's similarity is defined as:
$J(A,B)=\dfrac{|A \cap B|}{|A \cup B|}$

In [45]:
def jaccard(x,y):
    A = set([i for i in x])
    B = set([i for i in y])

    return len(A.intersection(B))/len(A.union(B))

In [67]:
out = jaccard(data['min-hash'].values[0], query['min-hash'].values[5])
print(out)

0.0


0.0