# Storing Data pada MongoDB Atlas

## Load Dataset

In [1]:
import pandas as pd
df = pd.read_pickle('../data/datasets.pkl')

In [2]:
df.head()

Unnamed: 0,title,brand,feature,rank,date,asin,imageURL,imageURLHighRes,description,price,also_view,also_buy,fit,details,similar_item,tech1,gender,material,category
69,Buxton Heiress Pik-Me-Up Framed Case,Buxton,"['Leather', 'Imported', 'synthetic lining', 'F...","43,930inClothing,Shoesamp;Jewelry(",5 star,B00007GDFV,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,['Authentic crunch leather with rich floral em...,16.95,"[B07C9V84JD, B01J6JE05G, B07J11WZ5Y, B07JJQFHS...","[B07C9V84JD, B01J6JE05G, B07JJQFHS5, B003EGITU...","class=""a-normal a-align-center a-spacing-smal...",,,,female,Leather,Accessories
352,Disguise Tiny Treats Pink Leopard,,"['polyester', 'You can return this item for an...","5,843,505inClothing,ShoesJewelry(",5 star,B0002C6NL6,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,['A grrreat pink jumpsuit with attached tail a...,25.99,,,,,,,female,Polyester,Dresses/Jumpsuits
410,Dream PJ's Blue - Large - Part #: 25BLG,Ethical/Spot,['Product Dimensions:\n \n8...,"17,183,425inClothing,Shoesamp;Jewelry(",5 star,B0002TKBSU,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,"['SOFT AND CUDDLY, SWEET DREAM PAJAMAS IN SOFT...",15.99,,,,,,,unisex,Unknown,Undergarments/Sleepwear
512,Buxton Heiress Pik-Me-Up Framed Case,Buxton,"['Leather', 'Imported', 'synthetic lining', 'F...","43,930inClothing,Shoesamp;Jewelry(",5 star,B00007GDFV,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,['Authentic crunch leather with rich floral em...,16.95,"[B07C9V84JD, B01J6JE05G, B07J11WZ5Y, B07JJQFHS...","[B07C9V84JD, B01J6JE05G, B07JJQFHS5, B003EGITU...","class=""a-normal a-align-center a-spacing-smal...",,,,female,Leather,Accessories
795,Disguise Tiny Treats Pink Leopard,,"['polyester', 'You can return this item for an...","5,843,505inClothing,ShoesJewelry(",5 star,B0002C6NL6,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,['A grrreat pink jumpsuit with attached tail a...,25.99,,,,,,,female,Polyester,Dresses/Jumpsuits


## Building Dataset

In [3]:
from haystack import Document
documents = []
for index, row in df.iterrows():
    descriptions = row["description"].strip("[]").strip("''")

    doc = Document(
        content = f"{row['title']}\n {descriptions}",
        meta = {
            'asin': row['asin'],
            'title': row['title'],
            'brand': row['brand'],
            'price': row['price'],
            'gender': row['gender'],
            'material': row['material'],
            'category': row['category'],
        }
    )
    documents.append(doc)

In [5]:
documents[0]

Document(id=bd174fb86972d98caf70c4784fb24ef902812b76b05fde64b1b9c3874702f92c, content: 'Buxton Heiress Pik-Me-Up Framed Case
 Authentic crunch leather with rich floral embossed logo heires...', meta: {'asin': 'B00007GDFV', 'title': 'Buxton Heiress Pik-Me-Up Framed Case', 'brand': 'Buxton', 'price': 16.95, 'gender': 'female', 'material': 'Leather', 'category': 'Accessories'})

## Membuat Storing Pipeline

In [None]:
import os
from getpass import getpass
os.environ["MONGO_CONNECTION_STRING"] = getpass("Masukkan MongoDB Connection String Anda: ")

In [18]:
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
pipeline_storing = Pipeline()

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore
document_store = MongoDBAtlasDocumentStore(
    database_name="depato_store",
    collection_name="products",
    vector_search_index="vector_index",
    full_text_search_index="search_index",
)

In [25]:
pipeline = Pipeline()
pipeline.add_component("embedder",SentenceTransformersDocumentEmbedder())
pipeline.add_component("writer",DocumentWriter(document_store=document_store,policy=DuplicatePolicy.OVERWRITE))

pipeline.connect("embedder","writer")

<haystack.core.pipeline.pipeline.Pipeline object at 0x000001DADB6179B0>
🚅 Components
  - embedder: SentenceTransformersDocumentEmbedder
  - writer: DocumentWriter
🛤️ Connections
  - embedder.documents -> writer.documents (List[Document])

In [26]:
pipeline.run({
    "embedder":{
        "documents":documents
    }
})

Batches: 100%|██████████| 40/40 [01:16<00:00,  1.92s/it]


{'writer': {'documents_written': 1262}}

## Menyimpan Category dan Material di koleksi 

In [5]:
from pymongo import MongoClient
import os
client = MongoClient(os.environ['MONGO_CONNECTION_STRING'])
db = client.depato_store
material_collection = db.materials
category_collection = db.categories

In [6]:
materials = df['material'].unique().tolist()
categories = df['category'].unique().tolist()

In [8]:
documents_material= [ {"name":m} for m in materials]
documents_category = [ {"name":c} for c in categories]

In [10]:
material_collection.insert_many(documents_material)
category_collection.insert_many(documents_category)

InsertManyResult([ObjectId('688957153021d83d1fb416dd'), ObjectId('688957153021d83d1fb416de'), ObjectId('688957153021d83d1fb416df'), ObjectId('688957153021d83d1fb416e0'), ObjectId('688957153021d83d1fb416e1'), ObjectId('688957153021d83d1fb416e2'), ObjectId('688957153021d83d1fb416e3'), ObjectId('688957153021d83d1fb416e4'), ObjectId('688957153021d83d1fb416e5'), ObjectId('688957153021d83d1fb416e6')], acknowledged=True)