In [1]:
# !pip install faiss-cpu
# !pip install sentence-transformers

In [2]:
import json
import os

import faiss
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from nervaluate import Evaluator
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

2023-12-11 21:34:13 faiss.loader INFO: Loading faiss.
2023-12-11 21:34:13 faiss.loader INFO: Successfully loaded faiss.


In [3]:
# Category labels for the item classifier. get_training_data() derives each
# category's file name from the lower-cased first word of its label.
categories = [
    "Grocery and Supermarkets",
    "Restaurants and Food Services",
    "Clothing and Apparel",
    "Health and Beauty",
    "Electronics and Appliances",
    "Home and Garden",
    "Entertainment and Leisure",
]

def get_training_data(folder_path='training_data/', category_names=None):
    """Load the per-category item lists into column-oriented training data.

    For every category the items are read from
    ``<folder_path>/<first word of the category, lower-cased>_items.txt``,
    one item per line. Blank lines (for example the empty string produced by a
    trailing newline) are skipped so that no empty item ends up labelled.

    Args:
        folder_path: Directory containing the ``*_items.txt`` files.
        category_names: Iterable of category labels to load. When ``None``,
            the module-level ``categories`` list is used.

    Returns:
        dict with two parallel lists: ``'items'`` (item text) and
        ``'category'`` (the label of the file each item came from). The dict
        can be passed straight to ``pd.DataFrame``.

    Raises:
        FileNotFoundError: If the file for one of the categories is missing.
    """
    if category_names is None:
        category_names = categories

    # Both columns are created up front so the result always has the same keys,
    # even when no items are found.
    category_train_test_data = {'items': [], 'category': []}

    # Gets the items for each category.
    for category in category_names:
        # Gets the file identifier for the category.
        category_id = category.split(" ")[0].lower()
        file_path = os.path.join(folder_path, f'{category_id}_items.txt')

        # Explicit encoding keeps the result independent of the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            for item in file.read().split('\n'):
                # Skip empty / whitespace-only lines.
                if not item.strip():
                    continue
                category_train_test_data['items'].append(item)
                category_train_test_data['category'].append(category)

    return category_train_test_data

# Turn the loaded item/category columns into a DataFrame and preview it.
category_data = pd.DataFrame(data=get_training_data())
category_data

Unnamed: 0,items,category
0,Milk,Grocery and Supermarkets
1,Bread,Grocery and Supermarkets
2,Eggs,Grocery and Supermarkets
3,Bananas,Grocery and Supermarkets
4,Chicken breasts,Grocery and Supermarkets
...,...,...
2622,Cinema Membership Exclusive Event Invitation R...,Entertainment and Leisure
2623,Paint and Sip Canvas Easel Setup Instructions ...,Entertainment and Leisure
2624,Musical Instrument Tuning Fork Calibration Gui...,Entertainment and Leisure
2625,Popcorn Kernel Seasoning Recipe Book Ingredien...,Entertainment and Leisure


In [4]:
# Fixed seed so the shuffle, and therefore the train/test split, is identical
# on every Restart & Run All.
RANDOM_SEED = 42
# Share of the rows that goes into the training set; the rest is the test set.
TRAIN_FRACTION = 0.8

# Randomly shuffles the rows (reproducibly).
shuffled_data = category_data.sample(frac=1, random_state=RANDOM_SEED)

total_rows = shuffled_data.shape[0]
train_size = int(total_rows * TRAIN_FRACTION)

# Split data into test and train by row position.
train = shuffled_data.iloc[:train_size]
test = shuffled_data.iloc[train_size:]

# Every row must land in exactly one of the two sets.
assert len(train) + len(test) == total_rows

train

Unnamed: 0,items,category
1481,Aloe vera plant,Health and Beauty
1215,Crochet shorts,Clothing and Apparel
1468,Hair mask,Health and Beauty
1865,Lawn Dethatcher,Home and Garden
959,Knee-high socks,Clothing and Apparel
...,...,...
1896,Garden Hose Extension,Home and Garden
2430,Streaming Service Merchandise,Entertainment and Leisure
342,Stop & Shop,Grocery and Supermarkets
851,Sweatshirt,Clothing and Apparel


In [9]:
# Embed the training item names with a pretrained sentence-transformer model.
# `model` is reused later to embed the test items.
train_items = train['items'].tolist()
model = SentenceTransformer('bert-base-nli-mean-tokens')
train_vectors = model.encode(train_items)
train_vectors

2023-12-11 21:34:49 sentence_transformers.SentenceTransformer INFO: Load pretrained SentenceTransformer: bert-base-nli-mean-tokens
2023-12-11 21:34:50 sentence_transformers.SentenceTransformer INFO: Use pytorch device: cpu


Batches:   0%|          | 0/66 [00:00<?, ?it/s]

array([[-0.02505693, -0.47903273,  1.1605906 , ...,  0.4763379 ,
        -0.43956566, -0.19242871],
       [ 0.47897112, -0.51159513,  0.554633  , ...,  0.89066434,
         0.45035455, -0.25843236],
       [ 0.8232161 , -0.43555373,  1.1320732 , ..., -0.8087599 ,
        -0.55057395,  0.0295518 ],
       ...,
       [ 0.40857196,  0.4659235 ,  1.9242964 , ...,  0.4456256 ,
        -0.09863766, -0.5388247 ],
       [ 0.568075  , -0.3537699 ,  1.28238   , ...,  0.7724934 ,
         0.09810871,  0.20657595],
       [-0.09512617, -0.3767157 ,  1.3157302 , ...,  0.25072765,
        -0.18039457,  0.12894577]], dtype=float32)

In [None]:
# Build a flat L2 index over the training embeddings.
# (faiss is already imported in the imports cell at the top of the notebook.)
vector_shape = train_vectors.shape[1]  # number of columns = embedding dimension
index = faiss.IndexFlatL2(vector_shape)

# NOTE: normalize_L2 rescales each row to unit length IN PLACE, so from here on
# `train_vectors` itself holds the normalised embeddings. Query vectors must be
# normalised the same way before they are searched against this index.
faiss.normalize_L2(train_vectors)
index.add(train_vectors)

In [None]:
# Embed the held-out item names with the same model used for the training items.
test_items = test['items'].tolist()
test_vectors = model.encode(test_items)
test_vectors

In [None]:
# Ask for every indexed vector back, so each query gets a full ranking of the
# training items.
k = index.ntotal

# The index was filled with L2-normalised training vectors, so the queries are
# normalised as well. normalize_L2 works in place, so it is applied to a
# float32 copy and `test_vectors` is left untouched.
query_vectors = np.array(test_vectors, dtype=np.float32, order='C')
faiss.normalize_L2(query_vectors)

# index.search expects a 2-D (n_queries, dim) array; passing a single 1-D
# vector fails. Searching the whole batch at once keeps the result for every
# query: row i of `distances` / `ann` belongs to test item i.
distances, ann = index.search(query_vectors, k)