In [1]:
!pip install qdrant_client
!pip install embetter

Collecting qdrant_client
  Downloading qdrant_client-1.12.1-py3-none-any.whl.metadata (10 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant_client)
  Downloading grpcio_tools-1.68.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant_client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-tools>=1.41.0->qdrant_client)
  Downloading protobuf-5.29.1-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting h2<5,>=3 (from httpx[http2]>=0.20.0->qdrant_client)
  Downloading h2-4.1.0-py3-none-any.whl.metadata (3.6 kB)
Collecting hyperframe<7,>=6.0 (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant_client)
  Downloading hyperframe-6.0.1-py3-none-any.whl.metadata (2.7 kB)
Collecting hpack<5,>=4.0 (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant_client)
  Downloading hpack-4.0.0-py3-none-any.whl.metadata (2.5 kB)
Downloading qdrant_client-1.12.1-py3-none-

In [44]:
import pandas as pd

dataset_df = pd.read_csv('/content/Updated_Hotel_Search_Dataset.csv')

In [45]:
dataset_df.head()

Unnamed: 0,Hotel Name,Description,Amenities,Location,Reviews
0,Hotel 1,Ideal for romantic getaways or family vacations.,Room Service,Manhattan,Amazing views from the balcony.
1,Hotel 2,Ideal for romantic getaways or family vacations.,Swimming Pool,Chi-Town,Excellent service and friendly staff.
2,Hotel 3,A peaceful retreat in the bustling city.,Beach Access,Seattle,Rooms are spacious and well-maintained.
3,Hotel 4,"Located in the heart of the city, close to maj...",Beach Access,San Diego,"The hotel is pet-friendly, which was a bonus."
4,Hotel 5,A boutique hotel with a personal touch.,"Spa, Free Wi-Fi, Beach Access, Business Center...",Portland,"The hotel is pet-friendly, which was a bonus."


In [46]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance

qdrant_client = QdrantClient(
    url="<YOUR QDRANT URI>",
    api_key="<YOUR QDRANT API KEY>",
)

In [47]:
qdrant_client.create_collection(
    collection_name="hotel_search_multivector",
    vectors_config={
        "description": VectorParams(
            size=384,
            distance=Distance.COSINE,
        ),
        "amenities": VectorParams(
            size=384,
            distance=Distance.COSINE,
        ),
        "location": VectorParams(
            size=384,
            distance=Distance.COSINE,
        ),
        "review": VectorParams(
            size=384,
            distance=Distance.COSINE,
        )
    },
)

True

In [48]:
from sklearn.pipeline import make_pipeline
from embetter.grab import ColumnGrabber
from embetter.text import SentenceEncoder


desc_pipeline = make_pipeline(
    ColumnGrabber("Description"),
    SentenceEncoder("all-MiniLM-L6-v2"),
)

amn_pipeline = make_pipeline(
    ColumnGrabber("Amenities"),
    SentenceEncoder("all-MiniLM-L6-v2"),
)

loc_pipeline = make_pipeline(
    ColumnGrabber("Location"),
    SentenceEncoder("all-MiniLM-L6-v2"),
)

rev_pipeline = make_pipeline(
    ColumnGrabber("Reviews"),
    SentenceEncoder("all-MiniLM-L6-v2"),
)

In [49]:
desc_vectors = desc_pipeline.transform(dataset_df)
amn_vectors = amn_pipeline.transform(dataset_df)
loc_vectors = loc_pipeline.transform(dataset_df)
rev_vectors = rev_pipeline.transform(dataset_df)

dataset_df["desc_vectors"] = desc_vectors.tolist()
dataset_df["amn_vectors"] = amn_vectors.tolist()
dataset_df["loc_vectors"] = loc_vectors.tolist()
dataset_df["rev_vectors"] = rev_vectors.tolist()

In [50]:
dataset_df.head(1)

Unnamed: 0,Hotel Name,Description,Amenities,Location,Reviews,desc_vectors,amn_vectors,loc_vectors,rev_vectors
0,Hotel 1,Ideal for romantic getaways or family vacations.,Room Service,Manhattan,Amazing views from the balcony.,"[0.005621132906526327, 0.0218807402998209, 0.0...","[-0.06589069962501526, -0.026376377791166306, ...","[0.10056295990943909, 0.001548216911032796, 0....","[-0.02395600639283657, 0.0337023101747036, -0...."


In [51]:
from qdrant_client.http.models import PointStruct

for index, row in dataset_df.iterrows():
    print(row[r"rev_vectors"])
    qdrant_client.upsert(
        collection_name="hotel_search_multivector",
        points=[
            PointStruct(
                id=index,
                vector={
                    "description": row["desc_vectors"],
                    "amenities": row["amn_vectors"],
                    "location": row["loc_vectors"],
                },
                payload={
                    "Name": row["Hotel Name"],
                    "Description": row["Description"],
                    "Amenities": row["Amenities"],
                    "Location": row["Location"],
                }
            )
        ]
    )

[-0.02395600639283657, 0.0337023101747036, -0.0207272507250309, -0.026062622666358948, 0.01218036375939846, 0.028512906283140182, 0.041669733822345734, -0.027777232229709625, 0.03709757328033447, -0.01607411727309227, 0.02261510118842125, 0.04414726421236992, 8.920468098949641e-05, 0.01583913154900074, -0.0036964609753340483, 0.03956787660717964, 0.09285400062799454, 0.008306358009576797, -0.01912713050842285, 0.0342588871717453, -0.08007564395666122, -0.009309183806180954, 0.015191002748906612, 0.015069429762661457, 0.004536316264420748, 0.06392951309680939, 0.03740768879652023, 0.05818554759025574, 0.0686747282743454, -0.027120225131511688, 0.03951377421617508, 0.030835328623652458, -0.004197307396680117, 0.04727711156010628, -0.01630573906004429, -0.0036266851238906384, 0.0020842005033046007, -0.08413588255643845, 0.03241756558418274, 0.018096640706062317, -0.04635900259017944, 0.04002290219068527, -0.008075490593910217, 0.02807041071355343, -0.0092786755412817, -0.02328133024275303

In [74]:
## Somehow breakdown user query into three fields - Description, Amenities, Location

desc_query = "clean rooms with a good scenic view"
amn_query = "allows pet and good internet"
loc_query = "San Francisco"

In [75]:
from qdrant_client.http.models import NamedVector
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-MiniLM-L6-v2")

loc_query = "San Francisco"

hits = qdrant_client.search(
    collection_name="hotel_search_multivector",
    query_vector=NamedVector(name="location",
                             vector=encoder.encode(str(loc_query)).tolist(),),
    limit=5,
)

loc_filters = []
for hit in hits:
    print(f"Chunk id:{hit.id}",hit.payload['Location'], "score:", hit.score)
    loc_filters.append(hit.payload['Location'])

Chunk id:23 SF Bay score: 0.83436835
Chunk id:34 SF score: 0.81051254
Chunk id:28 San Jose score: 0.7518743
Chunk id:26 San Jose score: 0.7518743
Chunk id:37 San Jose score: 0.7518743


In [76]:
loc_filters

['SF Bay', 'SF', 'San Jose', 'San Jose', 'San Jose']

In [77]:
from qdrant_client import models

amn_query = "allows pet and good internet"

scroll_filter=models.Filter(
    must=[
        models.FieldCondition(
        key="Location",
        match=models.MatchAny(any=['SF Bay', 'SF', 'San Jose', 'San Jose', 'San Jose']),
    ),
    ],
)

hits = qdrant_client.search(
    collection_name="hotel_search_multivector",
    query_vector=NamedVector(name="amenities",
                             vector=encoder.encode(str(amn_query)).tolist(),),
    limit=3,
    query_filter = scroll_filter
)

amn_filters = []
for hit in hits:
    print(f"Chunk id:{hit.id}",hit.payload["Amenities"], "score:", hit.score)
    amn_filters.append(hit.payload["Amenities"])

Chunk id:26 Pet Friendly score: 0.55064887
Chunk id:34 Pet Friendly, Beach Access, Laundry Service score: 0.4203567
Chunk id:28 Free Parking, Laundry Service, Free Wi-Fi, Beach Access score: 0.34341657


In [79]:
from qdrant_client import models

desc_query = "clean rooms with a good scenic view"

scroll_filter=models.Filter(
    must=[
        models.FieldCondition(
        key="Amenities",
        match=models.MatchAny(any=amn_filters),
    ),
    ],
)

hits = qdrant_client.search(
    collection_name="hotel_search_multivector",
    query_vector=NamedVector(name="description",
                             vector=encoder.encode(str(desc_query)).tolist(),),
    limit=3,
    query_filter = scroll_filter
)

for hit in hits:
    print(f"Chunk id:{hit.id}",hit.payload, "score:", hit.score)

Chunk id:34 {'Name': 'Hotel 35', 'Description': 'A luxurious stay with stunning city views.', 'Amenities': 'Pet Friendly, Beach Access, Laundry Service', 'Location': 'SF'} score: 0.49673396
Chunk id:26 {'Name': 'Hotel 27', 'Description': 'A boutique hotel with a personal touch.', 'Amenities': 'Pet Friendly', 'Location': 'San Jose'} score: 0.435123
Chunk id:28 {'Name': 'Hotel 29', 'Description': 'A boutique hotel with a personal touch.', 'Amenities': 'Free Parking, Laundry Service, Free Wi-Fi, Beach Access', 'Location': 'San Jose'} score: 0.435123
