In [1]:
from langchain_together import TogetherEmbeddings
import pandas as pd
import numpy as np

In [2]:
df = pd.read_excel('reviews_data.xlsx')

In [3]:
df

Unnamed: 0,review_id,customer_id,review_date,Review,Rating,review_date_numeric
0,11722,5380,2024-11-04,The room provided at first was a great disapp...,6.7,20241104
1,8604,8832,2024-03-01,The room wasn t so large for a Deluxe King bu...,9.6,20240301
2,3923,7301,2023-09-04,on arrival we had to wait over an hour after ...,7.1,20230904
3,10050,3968,2023-09-02,The room reminded me of a dorm room furnished...,7.9,20230902
4,2077,6085,2024-05-17,The room was very small the AC didn t work at...,8.8,20240517
...,...,...,...,...,...,...
8377,5928,1546,2023-04-30,Bathroom and toilet are small but spotlessly ...,10.0,20230430
8378,7439,4787,2024-01-30,Heating system in the room instructions on ho...,9.2,20240130
8379,339,4936,2024-10-11,Entering the hotel involved crossing an open ...,9.2,20241011
8380,6816,8542,2024-09-01,The hotel is very old Room are out of date an...,4.2,20240901


In [None]:
import os
import getpass
if not os.getenv("TOGETHER_API_KEY"):
    os.environ["TOGETHER_API_KEY"] = ""

# Initialize the TogetherEmbeddings model
embeddings = TogetherEmbeddings(
    model="togethercomputer/m2-bert-80M-8k-retrieval"  # Example model; check docs for Llama-based alternatives
)

In [5]:
## Create Embeddings 
reviews = df["Review"].tolist()

# Process embeddings in batches
embedding_list = []
for i in range(0, len(reviews), 128):
    batch = reviews[i : i + 128]  # Get batch
    batch_embeddings = embeddings.embed_documents(batch)  # Generate embeddings for batch
    embedding_list.extend(batch_embeddings)  # Store results
    print(f"Processed {i + len(batch)} / {len(reviews)} reviews")

Processed 128 / 8382 reviews
Processed 256 / 8382 reviews
Processed 384 / 8382 reviews
Processed 512 / 8382 reviews
Processed 640 / 8382 reviews
Processed 768 / 8382 reviews
Processed 896 / 8382 reviews
Processed 1024 / 8382 reviews
Processed 1152 / 8382 reviews
Processed 1280 / 8382 reviews
Processed 1408 / 8382 reviews
Processed 1536 / 8382 reviews
Processed 1664 / 8382 reviews
Processed 1792 / 8382 reviews
Processed 1920 / 8382 reviews
Processed 2048 / 8382 reviews
Processed 2176 / 8382 reviews
Processed 2304 / 8382 reviews
Processed 2432 / 8382 reviews
Processed 2560 / 8382 reviews
Processed 2688 / 8382 reviews
Processed 2816 / 8382 reviews
Processed 2944 / 8382 reviews
Processed 3072 / 8382 reviews
Processed 3200 / 8382 reviews
Processed 3328 / 8382 reviews
Processed 3456 / 8382 reviews
Processed 3584 / 8382 reviews
Processed 3712 / 8382 reviews
Processed 3840 / 8382 reviews
Processed 3968 / 8382 reviews
Processed 4096 / 8382 reviews
Processed 4224 / 8382 reviews
Processed 4352 / 

In [6]:
metadata_list = df.apply(lambda row: {
    "customer_id": int(row["customer_id"]),
    "review_date": row["review_date_numeric"],
    "Rating": int(row["Rating"]),
    "review_id":row['review_id']
}, axis=1).tolist()

In [7]:
metadata_list[0]

{'customer_id': 5380, 'review_date': 20241104, 'Rating': 6, 'review_id': 11722}

In [None]:
import os
os.environ["PINECONE_API_KEY"] = ""

In [29]:
import os
print(os.getenv("PINECONE_API_KEY"))  # Ensure it prints a valid key

pcsk_2VGEWR_GToh8JpFKqZipqezYuym65c5Y1okY3BneJetVqVHE3b1JDvCzQBg5bmmbyHSkJX


# Check whether any indexes already exist

In [30]:
import os
from pinecone import Pinecone

# Load API key from environment
api_key = os.getenv("PINECONE_API_KEY")

if not api_key:
    raise ValueError("PINECONE_API_KEY is not set. Please set it before running.")

# Create Pinecone client
pc = Pinecone(api_key=api_key)

# List all indexes
print(pc.list_indexes())

[{
    "name": "hotel-reviews",
    "metric": "cosine",
    "host": "hotel-reviews-778bu11.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "enabled",
    "tags": null
}, {
    "name": "hotelreviews",
    "metric": "cosine",
    "host": "hotelreviews-778bu11.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "enabled",
    "tags": null
}]


# create the index

In [31]:
from pinecone import Pinecone, ServerlessSpec
# Create an index
pc.create_index(
    name='hotelreview',
    dimension=768,  # Ensure this matches your embedding model
    metric='cosine',
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

{
    "name": "hotelreview",
    "metric": "cosine",
    "host": "hotelreview-778bu11.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}

In [None]:
index = pc.Index(host="")

In [33]:
vectors = [
    (str(i), embedding, metadata_list[i])
    for i, embedding in enumerate(embedding_list)
]

In [34]:
print(pc.list_indexes())  # Check if the index exists

[{
    "name": "hotelreview",
    "metric": "cosine",
    "host": "hotelreview-778bu11.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}, {
    "name": "hotel-reviews",
    "metric": "cosine",
    "host": "hotel-reviews-778bu11.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "enabled",
    "tags": null
}, {
    "name": "hotelreviews",
    "metric": "cosine",
    "host": "hotelreviews-778bu11.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
     

# Insert embeddings + metadata into the index

In [35]:
batch_size = 100  # Adjust this number as needed.

for i in range(0, len(embedding_list), batch_size):
    batch_vectors = [
        (str(i + j), embedding_list[i + j], metadata_list[i + j])
        for j in range(min(batch_size, len(embedding_list) - i))
    ]
    index.upsert(vectors=batch_vectors)
    print(f"Upserted batch from {i} to {i + len(batch_vectors)}")


Upserted batch from 0 to 100
Upserted batch from 100 to 200
Upserted batch from 200 to 300
Upserted batch from 300 to 400
Upserted batch from 400 to 500
Upserted batch from 500 to 600
Upserted batch from 600 to 700
Upserted batch from 700 to 800
Upserted batch from 800 to 900
Upserted batch from 900 to 1000
Upserted batch from 1000 to 1100
Upserted batch from 1100 to 1200
Upserted batch from 1200 to 1300
Upserted batch from 1300 to 1400
Upserted batch from 1400 to 1500
Upserted batch from 1500 to 1600
Upserted batch from 1600 to 1700
Upserted batch from 1700 to 1800
Upserted batch from 1800 to 1900
Upserted batch from 1900 to 2000
Upserted batch from 2000 to 2100
Upserted batch from 2100 to 2200
Upserted batch from 2200 to 2300
Upserted batch from 2300 to 2400
Upserted batch from 2400 to 2500
Upserted batch from 2500 to 2600
Upserted batch from 2600 to 2700
Upserted batch from 2700 to 2800
Upserted batch from 2800 to 2900
Upserted batch from 2900 to 3000
Upserted batch from 3000 to 310

# experiment with below parts

In [36]:
query_embedding = embeddings.embed_query("What are some of the reviews that mention restaurant, food, lunch, breakfast, dinner")

In [37]:
results = index.query(
    vector=query_embedding,
    top_k=5,
    namespace="",
    include_metadata=True,
    filter={
        "Rating": {"$lte": 9},
        "review_date": {"$gte": 20240101, "$lte": 20240108}
    }
)

In [38]:
matches = results["matches"]

# Extract review_ids from the matches (convert them to int if necessary)
matched_ids = [int(match["metadata"]["review_id"]) for match in matches]

In [39]:
matched_ids

[5410, 8647, 11531, 2910, 2593]

In [40]:
req_df = df[df["review_id"].isin(matched_ids)]

In [41]:
req_df['Review']

486      Shower taps need descaling horrid smell from ...
1622     The people working on the terrace bar are pre...
2018     When we arrived to check in there was a misun...
5794     Having to pay for one item I used in the mini...
8249     Loved everything only negative was STK We wer...
Name: Review, dtype: object

In [42]:
concatenated_reviews = " ".join(req_df["Review"].tolist())

In [43]:
concatenated_reviews

' Shower taps need descaling horrid smell from drains the new booth seating is so unfriendly the older decor made more sense New menu card room dining not advisable limited choice of average food Tea cakes were cold from fridge so cream in the cakes was There were no more small tasty affordable snacks like the crab cakes of tandoori salmon or tatty etc Pity You lost 15 customers who went to eat elsewhere daily for 3 days   location  The people working on the terrace bar are pretty slow preparing beverages so you need to be patient awaiting your order especially when you can see quite a lot of people up there We were forced to cancel our order after 40 minutes waiting as we had another appointment planned after   The location of the hotel is just perfect 150m to Diagonal Metro station Rambla is just 300 m away So for our family tour with some sightseeing and enjoying good restaurants this was a perfect choice The hotel is very comfortable and quiet You can have rest on the terrace bar w

In [44]:
from together import Together
import os
client = Together()
response = client.chat.completions.create(
    model="meta-llama/Llama-Vision-Free",
    messages = [{"role": "user", "content": f"""Briefly Summarize the overall sentiment of customers about food and restaurant based on these reviews - {
    concatenated_reviews}. Dont mention the name of the hotel"""}]
)
print(response.choices[0].message.content)


Here's a brief summary of the overall sentiment of customers about food and restaurants based on the reviews:

**Negative comments:**

* Most restaurants had limited choices of average food.
* Food was not freshly prepared, e.g. tea cakes were cold and crab cakes were not available.
* Prices were considered extortionate.
* Staff were slow in preparing beverages at the bar.
* One restaurant (STK) was too loud and chaotic in the evening.

**Positive comments:**

* Some restaurants had great locations and views, e.g. the hotel's terrace bar had a breathtaking view of Barcelona.
* Staff at some restaurants were charming and helpful.
* Some restaurants had a good selection of snacks and drinks.
* One restaurant (Chico Latino bar) had a great atmosphere and a good singer on Saturday night.
* The breakfast at one hotel was excellent.

**Overall sentiment:**

The overall sentiment of customers about food and restaurants is mostly negative, with many expressing disappointment and frustration wi