In [1]:
import numpy as np
import pandas as pd

# 1.1 Set up the data

To start working download the banking dataset on Kaggle.

In [2]:
data = pd.read_csv("bank_transactions.csv")

In [3]:
data  #to visualize the dataset as it is provided.

Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,T1,C5841053,10/1/94,F,JAMSHEDPUR,17819.05,2/8/16,143207,25.0
1,T2,C2142763,4/4/57,M,JHAJJAR,2270.69,2/8/16,141858,27999.0
2,T3,C4417068,26/11/96,F,MUMBAI,17874.44,2/8/16,142712,459.0
3,T4,C5342380,14/9/73,F,MUMBAI,866503.21,2/8/16,142714,2060.0
4,T5,C9031234,24/3/88,F,NAVI MUMBAI,6714.43,2/8/16,181156,1762.5
...,...,...,...,...,...,...,...,...,...
1048562,T1048563,C8020229,8/4/90,M,NEW DELHI,7635.19,18/9/16,184824,799.0
1048563,T1048564,C6459278,20/2/92,M,NASHIK,27311.42,18/9/16,183734,460.0
1048564,T1048565,C6412354,18/5/89,M,HYDERABAD,221757.06,18/9/16,183313,770.0
1048565,T1048566,C6420483,30/8/78,M,VISAKHAPATNAM,10117.87,18/9/16,184706,1000.0


For the sake of this first part, not all columns are necessary since comparing each field single handedly can be quite time-expensive. Then, carefully read the linked guide above and try to understand which features will be appropriate for this task (An heads up: some users have more than one transaction record, make sure to handle them all). Once you have finished, project a version of the dataset to work with.

In [4]:
#In order to compare the data provided with that of the query in question 1.3 we drop the 'TransactionID' and 'CustomerID' columns.
data2 = data.drop(['TransactionID','CustomerID'],axis=1)

#We change the name of the column just for simplicity.
data2['TransactionAmount']=data2['TransactionAmount (INR)']
data2 = data2.drop(['TransactionAmount (INR)'],axis=1)

#We change the format of clients' date of birth so that they can be more processable.
#In addition, we create a new column with the customers' dates of birth.
data2.CustomerDOB = pd.to_datetime(data2.CustomerDOB)
data2.loc[data2.CustomerDOB.dt.year > 2000, "CustomerDOB"] = data2.loc[data2.CustomerDOB.dt.year > 2000, 'CustomerDOB'] - pd.DateOffset(years = 100)
data2.drop(data[data2.CustomerDOB.dt.year == 1800].index, axis = 0, inplace = True)
np.timedelta64(1, "Y")
data2['CustomerAge'] = (( pd.to_datetime('today') - data2.CustomerDOB ) / np.timedelta64(1, 'Y')).round(0)

#We round up the remaining customer transaction information so that it can be meaningfully compared with each other.
data2.TransactionAmount=data2.TransactionAmount.apply(lambda x: x//500*500)
data2.CustAccountBalance=data2.CustAccountBalance.apply(lambda x: x//10000*10000)
data2.TransactionTime=data2.TransactionTime.apply(lambda x: round(x/10000,0))
data2.CustomerAge=data2.CustomerAge.apply(lambda x: x//5*5)

In [16]:
data2 #to visualize the dataset ready to be processed.

Unnamed: 0,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount,CustomerAge
0,1994-10-01,F,JAMSHEDPUR,10000.0,2/8/16,14.0,0.0,20.0
1,1957-04-04,M,JHAJJAR,0.0,2/8/16,14.0,27500.0,60.0
2,1996-11-26,F,MUMBAI,10000.0,2/8/16,14.0,0.0,20.0
3,1973-09-14,F,MUMBAI,860000.0,2/8/16,14.0,2000.0,40.0
4,1988-03-24,F,NAVI MUMBAI,0.0,2/8/16,18.0,1500.0,30.0
...,...,...,...,...,...,...,...,...
1048562,1990-08-04,M,NEW DELHI,0.0,18/9/16,18.0,500.0,30.0
1048563,1992-02-20,M,NASHIK,20000.0,18/9/16,18.0,0.0,30.0
1048564,1989-05-18,M,HYDERABAD,220000.0,18/9/16,18.0,500.0,30.0
1048565,1978-08-30,M,VISAKHAPATNAM,10000.0,18/9/16,18.0,1000.0,40.0


# 1.2 Fingerprint hashing

Implement your minhash function from scratch. No ready-made hash functions are allowed. Read the class material and search the internet if you need to. For reference, it may be practical to look at the description of hash functions in the book.

In [17]:
data2.dropna(inplace=True) #we remove all transactions with missing information.

In [18]:
#We define a function that generates an arbitrary number of permutations of an array provided as input.
#This function will be used to implement the minhash function.
def permutations(arr,n_perm):
    index=np.array([i for i in range(len(arr))])
    perms=[]
    for i in range(n_perm):
        perms.append(np.random.permutation(index))
    return perms

In [19]:
#This function is used to apply the minhash procedure to a single column
#and then iterate it to all the columns in the matrix with the same permutations.
def minhash(arr,perms):
    n_perm=len(perms)
    sign = np.zeros(n_perm+1)  #column initialized with all zeros
    i = 0
    #I see what position the first non-zero element is in
    while arr[i] == 0:
        i += 1
    sign[0] = i+1
    for i in range(n_perm):
        #Make the permutation
        for j in range(len(arr)):
            temp=arr[j]
            arr[j] = arr[perms[i][j]]
            arr[perms[i][j]]=temp
        k = 0
        #I see what position the first non-zero element is in
        while k!=len(arr) and arr[k] == 0:
            k += 1
        sign[i+1] = k+1
    return sign

Process the dataset and add each record to the MinHash. The subtask's goal is to try and map each consumer to its bin; to ensure this works well, be sure you understand how MinHash works and choose a matching threshold to use. Before moving on, experiment with different thresholds, explaining your choice.

In [20]:
#I delete duplicates from each column to get all the different options once
CustomerAge_norep=list(data2.CustomerAge.drop_duplicates())
CustGender_norep=['M','F']
CustAccountBalance_norep=list(data2.CustAccountBalance.drop_duplicates())
TransactionDate_norep=list(data2.TransactionDate.drop_duplicates())
TransactionTime_norep=list(data2.TransactionTime.drop_duplicates())
TransactionAmount_norep=list(data2.TransactionAmount.drop_duplicates())

In [27]:
#I create the shingles column
shi=[]
for i in  range(len(data2)):
    if i<len(CustomerAge_norep):
        shi.append(CustomerAge_norep[i])
    if i<len(CustGender_norep):
        shi.append(CustGender_norep[i]) 
    if i<len(CustAccountBalance_norep):
        shi.append(CustAccountBalance_norep[i]) 
    if i<len(TransactionDate_norep):
        shi.append(TransactionDate_norep[i]) 
    if i<len(TransactionTime_norep):
        shi.append(TransactionTime_norep[i]) 
    if i<len(TransactionAmount_norep):
        shi.append(TransactionAmount_norep[i])   

In [28]:
perms=permutations(shi,11)
sign_matrix=np.zeros([12,1048567])
for i in data2.index: #I repeat for all transactions
    #I create the column to apply the minhash function to using the shingles column
    a=[data2.CustomerAge[i], data2.CustGender[i], data2.CustAccountBalance[i], data2.TransactionDate[i], data2.TransactionTime[i], data2.TransactionAmount[i]]
    b=np.zeros(len(shi))
    for j in range(len(shi)):
        if shi[j] in a:
            b[j]=1
    c=minhash(b,perms) #I apply the minhash function
    #I insert the column in the signature matrix
    for j in range(12):
        sign_matrix[j][i]=c[j]

In [29]:
#Columns in which there was missing information are deleted
lista=[]
for i in range(1048567):
    if i not in data2.index:
        lista.append(i)
count=0
for i in lista:
    sign_matrix=np.delete(sign_matrix,i-count,1)
    count+=1

In [30]:
#I create an index for each column
n_tr=len(sign_matrix[1])
ind_tr=np.arange(n_tr)
sign_matrix=np.r_[[ind_tr],sign_matrix]
sign_matrix= sign_matrix.astype(int)

In [None]:
bands=[]
buckets=[[]] #list of lists representing the different buckets
for i in [0,3,6,9]: #band size = 3
    for j in range(n_tr):
        q=0
        for k in range(len(bands)):
            #Check if there is a band equal to the current one
            if [sign_matrix[1+i][j],sign_matrix[2+i][j],sign_matrix[3+i][j]]==bands[k]:
                buckets[k].append(sign_matrix[0][j]) #to put the transaction of the corresponding bucket
                q+=1 #indicates that an equal band was found
            if q==0: #if q==0 no band like the current one was found
                bands.append([sign_matrix[1+i][j],sign_matrix[2+i][j],sign_matrix[3+i][j]])

# 1.3 Locality Sensitive Hashing

In [74]:
query = pd.read_csv("query_users.csv")
query.dropna(inplace=True)

In [75]:
query

Unnamed: 0,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,27/7/78,M,DELHI,94695.61,2/9/16,140310,65.0
1,6/11/92,M,PANCHKULA,7584.09,2/9/16,120214,6025.0
2,14/8/91,M,PATNA,7180.6,10/8/16,221732,541.5
3,3/1/87,M,CHENNAI,56847.75,29/8/16,144138,1000.0
4,4/1/95,M,GURGAON,84950.13,25/9/16,233309,80.0
5,10/1/81,M,WORLD TRADE CENTRE BANGALORE,23143.95,11/9/16,192906,303.0
6,20/9/76,F,CHITTOOR,15397.8,28/8/16,92633,20.0
7,10/4/91,M,MOHALI,426.3,2/8/16,203754,50.0
8,19/3/90,M,MOHALI,4609.34,26/8/16,184015,300.0
9,19/12/70,M,SERAMPORE,6695988.46,27/8/16,144030,299.0


In [76]:
#The query data is processed in the same way as the data in the Kaggle dataset
query['TransactionAmount']=query['TransactionAmount (INR)']
query = query.drop(['TransactionAmount (INR)'],axis=1)
query.CustomerDOB = pd.to_datetime(query.CustomerDOB)
query.loc[query.CustomerDOB.dt.year > 2000, "CustomerDOB"] = query.loc[query.CustomerDOB.dt.year > 2000, 'CustomerDOB'] - pd.DateOffset(years = 100)
query.drop(query[query.CustomerDOB.dt.year == 1800].index, axis = 0, inplace = True)
np.timedelta64(1, "Y")
query['CustomerAge'] = (( pd.to_datetime('today') - query.CustomerDOB ) / np.timedelta64(1, 'Y')).round(0)
query.TransactionAmount=query.TransactionAmount.apply(lambda x: x//500*500)
query.CustAccountBalance=query.CustAccountBalance.apply(lambda x: x//10000*10000)
query.TransactionTime=query.TransactionTime.apply(lambda x: round(x/10000,0))
query.CustomerAge=query.CustomerAge.apply(lambda x: x//5*5)

In [77]:
CustomerAge_norep_query=list(query.CustomerAge.drop_duplicates())
CustGender_norep_query=['M','F']
CustAccountBalance_norep_query=list(query.CustAccountBalance.drop_duplicates())
TransactionDate_norep_query=list(query.TransactionDate.drop_duplicates())
TransactionTime_norep_query=list(query.TransactionTime.drop_duplicates())
TransactionAmount_norep_query=list(query.TransactionAmount.drop_duplicates())

In [78]:
query #to visualize the dataset ready to be processed.

Unnamed: 0,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount,CustomerAge
0,1978-07-27,M,DELHI,90000.0,2/9/16,14.0,0.0,40.0
1,1992-06-11,M,PANCHKULA,0.0,2/9/16,12.0,6000.0,30.0
2,1991-08-14,M,PATNA,0.0,10/8/16,22.0,500.0,30.0
3,1987-03-01,M,CHENNAI,50000.0,29/8/16,14.0,1000.0,35.0
4,1995-04-01,M,GURGAON,80000.0,25/9/16,23.0,0.0,25.0
5,1981-10-01,M,WORLD TRADE CENTRE BANGALORE,20000.0,11/9/16,19.0,0.0,40.0
6,1976-09-20,F,CHITTOOR,10000.0,28/8/16,9.0,0.0,45.0
7,1991-10-04,M,MOHALI,0.0,2/8/16,20.0,0.0,30.0
8,1990-03-19,M,MOHALI,0.0,26/8/16,18.0,0.0,30.0
9,1970-12-19,M,SERAMPORE,6690000.0,27/8/16,14.0,0.0,50.0


In [35]:
shi_query=[]
for i in  range(len(query)):
    if i<len(CustomerAge_norep_query):
        shi_query.append(CustomerAge_norep_query[i])
    if i<len(CustGender_norep_query):
        shi_query.append(CustGender_norep_query[i]) 
    if i<len(CustAccountBalance_norep_query):
        shi_query.append(CustAccountBalance_norep_query[i]) 
    if i<len(TransactionDate_norep_query):
        shi_query.append(TransactionDate_norep_query[i]) 
    if i<len(TransactionTime_norep_query):
        shi_query.append(TransactionTime_norep_query[i]) 
    if i<len(TransactionAmount_norep_query):
        shi_query.append(TransactionAmount_norep_query[i])   

In [65]:
#I proceed in the same way as in Kaggle's dataset.
perms=permutations(shi_query,5)
sign_matrix_query=np.zeros([6,50])
for i in query.index:
    a=[query.CustomerAge[i], query.CustGender[i], query.CustAccountBalance[i], query.TransactionDate[i], query.TransactionTime[i], query.TransactionAmount[i]]
    b=np.zeros(len(shi_query))
    for j in range(len(shi_query)):
        if shi_query[j] in a:
            b[j]=1
    c=minhash(b,perms)
    for j in range(6):
        sign_matrix_query[j][i]=c[j]

In [61]:
sign_matrix_query

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45],
       [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45],
       [ 1,  2,  2,  2,  2,  1,  6,  2,  2,  2,  2,  2,  1,  7,  2,  6,
         2,  2,  1,  8,  2,  2,  2,  2,  2,  2,  2,  2,  2,  5,  2,  6,
         2,  2,  2,  2,  2,  2,  1,  6,  2,  6,  2,  2,  6,  2],
       [ 3,  3,  3,  6,  3,  3,  3,  3,  3,  3,  3,  3,  5, 10,  1,  3,
         3,  3,  7,  1,  3, 12,  3, 10, 11,  3,  3, 25, 12, 25, 11,  3,
         8,  3, 10,  1,  3,  1, 23,  3,  3,  2,  3,  3,  3,  3],
       [ 3,  3,  3,  9,  3,  3,  3,  3,  3,  3,  1,  3, 18, 10,  3,  3,
         3,  3,  6,  9,  3,  2,  3, 10, 19,  3,  3,  9,  2,  9, 19,  3,
         2,  3, 10, 

In [66]:
lista=[]
for i in range(50):
    if i not in query.index:
        lista.append(i)
count=0
for i in lista:
    sign_matrix_query=np.delete(sign_matrix_query,i-count,1)
    count+=1

In [71]:
n_tr=46
ind_tr=np.arange(n_tr)
sign_matrix_query=np.r_[[ind_tr],sign_matrix_query]
sign_matrix_query= sign_matrix_query.astype(int)

In [72]:
bands_query=[[sign_matrix_query[1][0],sign_matrix_query[2][0]]]
buckets_query=[[]]
for i in [0,2,4]:
    for j in range(n_tr):
        q=0
        for k in range(len(bands_query)):
            if [sign_matrix_query[1+i][j],sign_matrix_query[2+i][j]]==bands_query[k]:
                buckets_query[k].append(sign_matrix_query[0][j])
                q+=1
            if q==0:
                bands_query.append([sign_matrix_query[1+i][j],sign_matrix_query[2+i][j]])

MemoryError: 