In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the matched-sample workbook and preview its first rows.
matched_sample = pd.read_excel('./dataset/MATCHED SAMPLE.xlsx')
matched_sample.head()

In [None]:
matched_sample.columns

In [None]:
# Seller master is a semicolon-delimited CSV. on_bad_lines='skip' drops rows that
# fail to parse without raising, so the loaded row count can be lower than the
# number of lines in the file.
seller_master = pd.read_csv('./dataset/Seller Master.csv', on_bad_lines='skip', sep=';')

In [None]:
seller_master.head()

In [None]:
seller_master.columns

In [None]:
# Load the marketplace listings workbook.
marketplace_listings = pd.read_excel('./dataset/Marketplace Listings G2.xlsx')

In [None]:
marketplace_listings.columns

In [None]:
marketplace_listings.head()

In [None]:
# Concatenate ListingDescription1, ListingDescription2 and ListingDescription3
# into a single Description column.
# Missing parts are treated as empty strings: with a plain `+`, a NaN in any one
# of the three columns would turn the whole concatenated value into NaN.
description_columns = ['ListingDescription1', 'ListingDescription2', 'ListingDescription3']
description_parts = marketplace_listings[description_columns].fillna('').astype(str)
marketplace_listings['Description'] = (
    description_parts['ListingDescription1']
    + description_parts['ListingDescription2']
    + description_parts['ListingDescription3']
)

marketplace_listings.head()

In [None]:
# Keep only the seller columns used for matching. .copy() makes this an
# independent frame, so the cleaning steps that follow do not operate on a
# slice of seller_master (which triggers pandas' SettingWithCopyWarning).
seller_master_df = seller_master[
    ['SellerId', 'SellerName', 'SellerAlias', 'SellerUrl', 'SellerDescription']
].copy()

In [None]:
seller_master_df.head(10000)

In [None]:
seller_master_df.size

In [None]:
# Drop rows without a SellerId, then cast every text column to str.
# Each column is cast from its own values: SellerAlias was previously
# overwritten with SellerName, which discarded every real alias.
# Reassignment (rather than inplace=True) keeps the cell free of
# SettingWithCopyWarning.
# NOTE(review): depending on the pandas version, astype('str') may turn missing
# values into the literal string 'nan' - confirm whether that is wanted downstream.
text_columns = ['SellerName', 'SellerAlias', 'SellerUrl', 'SellerDescription']
seller_master_df = (
    seller_master_df
    .dropna(subset=['SellerId'])
    .astype({column: 'str' for column in text_columns})
)

In [None]:
seller_master_df.size

In [None]:
# Keep one row per SellerId (the first occurrence is retained). The result is
# reassigned instead of using inplace=True, so the frame is never mutated in
# place and pandas does not emit SettingWithCopyWarning.
seller_master_df = seller_master_df.drop_duplicates(subset=['SellerId'])
# .size is rows x columns (number of cells), not the number of rows.
seller_master_df.size

In [None]:
seller_master_df.head(651)

In [None]:
# Number of leading seller rows taken (via .head) when building the training sentences.
train_records = 10000

In [None]:
import re

def preprocess_url(url):
    """Return the host part of ``url`` (the text between ``//`` and the next ``/``).

    Args:
        url: The URL to reduce. Anything that is not a ``str`` (e.g. ``None``
            or a float NaN coming out of pandas) is treated as "no URL".

    Returns:
        The host portion, or ``''`` when ``url`` is not a string or contains
        no ``//`` separator (for example a scheme-less ``www.example.com``).
    """
    # Explicit checks replace the former bare ``except:``, which also swallowed
    # KeyboardInterrupt/SystemExit and hid unrelated errors.
    if not isinstance(url, str):
        return ''

    _, separator, remainder = url.partition('//')
    if not separator:
        return ''

    # Everything up to the first '/' after the scheme separator is the host.
    return remainder.split('/', 1)[0]

def preprocess_name(name):
    """Rewrite a seller name so that it contains no spaces, dots, brackets or '%'.

    The substitutions are applied one after another, in the order listed below.

    Args:
        name: The seller name (a ``str``).

    Returns:
        The name with every listed character replaced by its textual stand-in.
    """
    # NOTE(review): '%' is mapped to "amp", which reads like a stand-in for '&' -
    # confirm which character was intended before changing it.
    substitutions = (
        (' ', '-'),
        ("%", "amp"),
        ('.', 'dot'),
        ('(', 'open-bracket-'),
        (')', '-close-bracket'),
    )

    result = name
    for target, stand_in in substitutions:
        result = result.replace(target, stand_in)
    return result

In [None]:
# Build the sentence list from the first `train_records` seller rows.
training_rows = seller_master_df.head(train_records)
seller_name_list = training_rows['SellerName'].tolist()
seller_alias_list = training_rows['SellerAlias'].tolist()
seller_url_list = training_rows['SellerUrl'].tolist()
seller_description_list = training_rows['SellerDescription'].tolist()

# Names and aliases go through preprocess_name, URLs through preprocess_url.
# Descriptions are collected above but are not added to the sentence list.
candidate_sentences = (
    [preprocess_name(name) for name in seller_name_list]
    + [preprocess_name(alias) for alias in seller_alias_list]
    + [preprocess_url(url) for url in seller_url_list]
)

# Remove duplicates while keeping the first occurrence of each sentence, in order.
seller_sentence_list = list(dict.fromkeys(candidate_sentences))

In [None]:
seller_sentence_list

In [None]:
from gensim.models import Word2Vec

from nltk.tokenize import word_tokenize

# Tokenise every sentence with NLTK; each sentence becomes one list of tokens.
data = [list(word_tokenize(sentence)) for sentence in seller_sentence_list]

# Create CBOW model (gensim's default training algorithm). min_count=1 keeps
# every token in the vocabulary; vector_size=300 is the embedding width.
model1 = Word2Vec(data, min_count=1, vector_size=300, window=5)

# Export the learned word vectors in word2vec text format.
model1.wv.save_word2vec_format("./model/word2vec.txt")


In [None]:
!gzip ./model/word2vec.txt -f

In [None]:
!python -m spacy init vectors en ./model/word2vec.txt.gz output/

In [None]:
import spacy

# Load spaCy model
nlp = spacy.load("./output")

# Preprocess textual features and build index
index_to_id_map = {}  # Annoy item id -> SellerId of the text that produced the vector
index_count = 0

# Split data into batches
batch_size = 1000
# Index the same leading `train_records` sellers that the sentence list was built
# from. The batch count is derived from that slice instead of a hard-coded
# range(10), which only covered the intended rows while batch_size * 10 happened
# to equal train_records.
# To index every seller instead, replace this with `sellers_to_index = seller_master_df`.
sellers_to_index = seller_master_df.head(train_records)
num_batches = -(-len(sellers_to_index) // batch_size)  # Calculate number of batches rounding up
batches = [
    sellers_to_index.iloc[i * batch_size:(i + 1) * batch_size]
    for i in range(num_batches)
]

ann_idx = 0  # next free Annoy item id; advanced by process_batch

# Batch processing function to generate embeddings
def process_batch(batch):
    """Embed the name, alias and URL host of every seller row in ``batch``.

    Args:
        batch: DataFrame slice with the columns SellerId, SellerName,
            SellerAlias and SellerUrl.

    Returns:
        A list of ``(item_id, vector)`` tuples, one per text that produced a
        non-zero vector.

    Side effects:
        Advances the module-level ``ann_idx`` counter and records
        ``item_id -> SellerId`` in ``index_to_id_map``. Texts whose vector is
        all zeros are reported with ``print`` and skipped.
    """
    global ann_idx

    embeddings = []

    for _, row in batch.iterrows():
        seller_id = row['SellerId']

        name = preprocess_name(row['SellerName'])
        alias = preprocess_name(row['SellerAlias'])
        url = preprocess_url(row['SellerUrl'])

        # Avoid indexing the same text twice when the alias equals the name.
        texts = [name, alias] if name != alias else [name]
        if url != "":
            texts.append(url)

        for text in texts:
            vector = nlp(text).vector
            # An all-zero vector cannot be compared by angle, so it is not indexed.
            if not np.any(vector):
                print(f"Zero vector found for text: {text}")
                continue

            embeddings.append((ann_idx, vector))
            index_to_id_map[ann_idx] = seller_id
            ann_idx += 1

    return embeddings

emb_batch = [process_batch(batch) for batch in batches]

In [None]:
from annoy import AnnoyIndex

# The index is created for 300-dimensional vectors (assumed to be the width of
# the spaCy embeddings) and uses the angular metric.
ann_index = AnnoyIndex(300, 'angular')
ann_index.verbose(True)

# emb_batch is a list of batches; every batch is a list of (item id, vector) pairs.
all_pairs = (pair for batch_embeddings in emb_batch for pair in batch_embeddings)
for item_id, vector in all_pairs:
    ann_index.add_item(item_id, vector)

# Build the index with 10 trees and persist it.
ann_index.build(10)
ann_index.save('./model/seller_master.ann')


In [None]:

# Persist the Annoy item id -> SellerId mapping as a two-column CSV.
index_map_frame = pd.DataFrame(
    list(index_to_id_map.items()),
    columns=['index', 'SellerId'],
)
index_map_frame.to_csv('./model/seller_id_index_map.csv', index=False)

In [None]:
# Reload the persisted index and the item id -> SellerId mapping from disk.
ann_index = AnnoyIndex(300, 'angular')
ann_index.load('./model/seller_master.ann')

index_to_id_map = pd.read_csv('./model/seller_id_index_map.csv').set_index('index').to_dict()['SellerId']

# Function to find the most similar sellerId for a given query
def find_similar_seller(query_name, query_url, query_desc):
    """Return the SellerId whose indexed text is closest to the query name or URL.

    The name and the URL host are embedded separately, the nearest indexed item
    is looked up for each, and the candidate with the smaller angular distance
    wins (the name wins ties).

    Args:
        query_name: Seller name to look up.
        query_url: Seller URL; only its host part is used.
        query_desc: Currently unused; kept so existing callers keep working.

    Returns:
        The matching SellerId, or ``None`` when neither text yields a non-zero
        embedding or the index returns no neighbour.
    """
    query_texts = [preprocess_name(query_name), preprocess_url(query_url)]

    best_distance = None
    best_index = None
    for text in query_texts:
        emb = nlp(text).vector
        # A zero vector (e.g. empty URL or unknown tokens) has no direction; the
        # neighbour returned for it would be arbitrary, so skip it.
        if not np.any(emb):
            continue

        indices, distances = ann_index.get_nns_by_vector(emb, 1, include_distances=True)
        if not indices:
            continue

        # Candidates are compared by the index's own angular distance. A raw dot
        # product is not normalised, so it would favour long vectors instead of
        # the closest direction.
        if best_distance is None or distances[0] < best_distance:
            best_distance = distances[0]
            best_index = indices[0]

    if best_index is None:
        return None
    return index_to_id_map[best_index]
print(find_similar_seller('0X0CAT', 'https://www.0x0.cat', ''))

In [None]:
seller_master_df.head(651)