# cs4400 final project
This solution uses Bernoulli Naive Bayes to classify each pair of items as 0 (not a match) or 1 (match).

In [11]:
#%pip install pandas
import numpy as np
import pandas as pd
from os.path import join
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

In [12]:
#read data 
ltable = pd.read_csv(join('data', "ltable.csv"))
rtable = pd.read_csv(join('data', "rtable.csv"))
train = pd.read_csv(join('data', "train.csv"))

In [13]:
def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``seq1`` into ``seq2``.

    Args:
        seq1: First sequence (e.g. a string).
        seq2: Second sequence (e.g. a string).

    Returns:
        The edit distance as a numpy float (``0.0`` means the sequences
        are identical).
    """
    rows = len(seq1) + 1
    cols = len(seq2) + 1

    # dist[i, j] is the edit distance between seq1[:i] and seq2[:j].
    dist = np.zeros((rows, cols))
    # Turning a prefix into the empty sequence costs one edit per element.
    dist[:, 0] = np.arange(rows)
    dist[0, :] = np.arange(cols)

    for i, left in enumerate(seq1, start=1):
        for j, right in enumerate(seq2, start=1):
            substitution = 0 if left == right else 1
            dist[i, j] = min(
                dist[i - 1, j] + 1,                 # deletion
                dist[i - 1, j - 1] + substitution,  # match / substitution
                dist[i, j - 1] + 1,                 # insertion
            )
    return dist[-1, -1]

In [None]:
# training
# Build one feature vector per labelled pair in `train`:
#   [title edit distance, category edit distance, relative price difference]
# Note that despite the `sim_` names these are distances: 0 means identical.
x_train = []
y_train = []
for row_idx in range(len(train)):
    pair = train.loc[row_idx]
    label = int(pair["label"])
    left_item = ltable.loc[pair["ltable_id"]]
    right_item = rtable.loc[pair["rtable_id"]]

    sim_title = levenshtein(str(left_item["title"]), str(right_item["title"]))
    sim_category = levenshtein(
        str(left_item["category"]), str(right_item["category"])
    )

    # A missing price is treated as 0.
    if np.isnan(left_item["price"]):
        left_price = 0
    else:
        left_price = float(left_item["price"])
    if np.isnan(right_item["price"]):
        right_price = 0
    else:
        right_price = float(right_item["price"])

    # Price gap relative to the left price; the 0.001 avoids division by zero.
    sim_price = np.abs(left_price - right_price) / (left_price + 0.001)

    x_train.append([sim_title, sim_category, sim_price])
    y_train.append(label)

In [None]:
# Fit a Bernoulli Naive Bayes classifier on the pair features.
# NOTE: with BernoulliNB's default `binarize=0.0`, every feature is thresholded
# to "value > 0", i.e. the model only sees whether each distance is non-zero.
gnb = BernoulliNB()
gnb.fit(X=x_train, y=y_train)

In [None]:
#block by brand and model no
# Only pairs whose (case-insensitive) brand strings agree are considered.
# Within a brand, pairs with the same non-missing model number are accepted as
# matches directly; every other pair becomes a candidate for the classifier.
ltable['brand'] = ltable['brand'].astype(str)
rtable['brand'] = rtable['brand'].astype(str)

brands_l = set(ltable["brand"].values)
brands_r = set(rtable["brand"].values)
brands = brands_l.union(brands_r)

# Both maps share the same lower-cased brand keys.
brand2ids_l = {b.lower(): [] for b in brands}
brand2ids_r = {b.lower(): [] for b in brands}

for i, x in ltable.iterrows():
    brand2ids_l[x["brand"].lower()].append(x["id"])
for i, x in rtable.iterrows():
    brand2ids_r[x["brand"].lower()].append(x["id"])

# Normalised model number per row label, computed once instead of once per
# candidate pair. A missing model number becomes the string 'nan'.
modelno_by_label_l = {
    label: str(modelno).lower() for label, modelno in ltable["modelno"].items()
}
modelno_by_label_r = {
    label: str(modelno).lower() for label, modelno in rtable["modelno"].items()
}

candset = []
output_pair = []

# Iterate over the lower-cased keys rather than the raw `brands` set: looking
# up a raw brand such as "Sony" would raise KeyError because the maps are keyed
# by "sony", and two spellings of one brand must only be processed once.
for brd in brand2ids_l:
    l_ids = brand2ids_l[brd]
    r_ids = brand2ids_r[brd]
    for l_id in l_ids:
        modelno_l = modelno_by_label_l[l_id]
        for r_id in r_ids:
            modelno_r = modelno_by_label_r[r_id]

            # Equality with a non-'nan' left value implies the right value is
            # not 'nan' either.
            if modelno_l != 'nan' and modelno_l == modelno_r:
                # if brand and model no match, add it to the output set
                output_pair.append([l_id, r_id])
            else:
                candset.append([l_id, r_id])

In [None]:
#testing
# Compute, for every candidate pair, the same three features used in training:
#   [title edit distance, category edit distance, relative price difference]
x_test = []
for ltable_id, rtable_id in candset:
    left_item = ltable.loc[ltable_id]
    right_item = rtable.loc[rtable_id]

    sim_title = levenshtein(str(left_item["title"]), str(right_item["title"]))
    sim_category = levenshtein(
        str(left_item["category"]), str(right_item["category"])
    )

    # A missing price is treated as 0.
    if np.isnan(left_item["price"]):
        left_price = 0
    else:
        left_price = float(left_item["price"])
    if np.isnan(right_item["price"]):
        right_price = 0
    else:
        right_price = float(right_item["price"])

    # Price gap relative to the left price; the 0.001 avoids division by zero.
    sim_price = np.abs(left_price - right_price) / (left_price + 0.001)

    x_test.append([sim_title, sim_category, sim_price])

In [None]:
# Classify every candidate pair and keep the ones predicted as a match (1).
y = gnb.predict(x_test)
for pair, predicted in zip(candset, y):
    if predicted == 1:
        print(pair)
        output_pair.append(pair)

In [None]:
# Remove duplicate pairs by passing them through a set of tuples, then write
# the remaining (ltable_id, rtable_id) pairs to output.csv.
pair_rows = pd.DataFrame(output_pair).values
output_pair = {tuple(pair) for pair in pair_rows}
output = pd.DataFrame(output_pair, columns=["ltable_id", "rtable_id"])
output.to_csv("output.csv", index=False)