In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Run this cell first
!pip install langchain langchain_community langgraph scikit-learn surprise



In [5]:
# Reinstall compatible versions together
!pip install google-generativeai google-ai-generativelanguage langchain-google-genai



In [6]:
from kaggle_secrets import UserSecretsClient
import os

user_secrets = UserSecretsClient()
gemini_key = user_secrets.get_secret("GEMINI_API_KEY")  # name must match exactly in Kaggle Secrets
os.environ["GEMINI_API_KEY"] = gemini_key

In [7]:
print("Gemini key loaded?", "GEMINI_API_KEY" in os.environ)

Gemini key loaded? True


In [29]:
import os, requests, json
key = os.environ.get("GEMINI_API_KEY")
if not key:
    raise RuntimeError("GEMINI_API_KEY not in os.environ — load from kaggle_secrets first.")

r = requests.get(
    "https://generativelanguage.googleapis.com/v1/models",
    params={"key": key}
)
print("HTTP", r.status_code)
print(json.dumps(r.json(), indent=2))


HTTP 200
{
  "models": [
    {
      "name": "models/gemini-1.5-pro-002",
      "version": "002",
      "displayName": "Gemini 1.5 Pro 002",
      "description": "Stable version of Gemini 1.5 Pro, our mid-size multimodal model that supports up to 2 million tokens, released in September of 2024.",
      "inputTokenLimit": 2000000,
      "outputTokenLimit": 8192,
      "supportedGenerationMethods": [
        "generateContent",
        "countTokens",
        "createCachedContent"
      ],
      "temperature": 1,
      "topP": 0.95,
      "topK": 40,
      "maxTemperature": 2
    },
    {
      "name": "models/gemini-1.5-pro",
      "version": "001",
      "displayName": "Gemini 1.5 Pro",
      "description": "Stable version of Gemini 1.5 Pro, our mid-size multimodal model that supports up to 2 million tokens, released in May of 2024.",
      "inputTokenLimit": 2000000,
      "outputTokenLimit": 8192,
      "supportedGenerationMethods": [
        "generateContent",
        "countTokens"
  

In [14]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from langchain.tools import Tool
from langchain.agents import initialize_agent, AgentType
from langchain.llms import OpenAI
from langchain import LLMChain, PromptTemplate
from typing import List, Dict, Tuple

# your ContentBasedRecommender class (adapted to match column names of preprocessed file)
class ContentBasedRecommender:
    def __init__(self):
        self.hotels_df = None
        # keep weights if you want to later weight components (not currently used in vector building)
        self.feature_weights = {
            'amenities': 0.35,
            'location': 0.25,
            'price': 0.20,
            'star_rating': 0.15,
            'property_type': 0.05
        }
        # Amenity column names — adapt to the preprocessed CSV names (prefix "amenity_")
        self.amenity_features = [
            'amenity_free_wifi', 'amenity_air_conditioning', 'amenity_parking', 'amenity_room_service',
            'amenity_24_7_front_desk', 'amenity_restaurant', 'amenity_gym', 'amenity_pool', 'amenity_spa',
            'amenity_business_center', 'amenity_conference_hall', 'amenity_airport_shuttle',
            'amenity_meeting_rooms', 'amenity_vegetarian_restaurant'  # if exists
        ]
        self.scaler = MinMaxScaler()

    def load_data(self, hotels_df: pd.DataFrame):
        self.hotels_df = hotels_df.copy()
        # cast amenity booleans to integers (handle missing amenities)
        for amenity in self.amenity_features:
            if amenity in self.hotels_df.columns:
                # some CSVs use 0/1 or True/False strings — coerce
                self.hotels_df[amenity] = self.hotels_df[amenity].fillna(0).astype(int)
            else:
                # add missing amenity column as zeros to keep vector lengths consistent
                self.hotels_df[amenity] = 0

        # create location dummies using 'city' or 'location' column if available
        city_col = 'city' if 'city' in self.hotels_df.columns else ('location' if 'location' in self.hotels_df.columns else None)
        if city_col is not None:
            location_dummies = pd.get_dummies(self.hotels_df[city_col].fillna('Unknown'), prefix='city')
            self.hotels_df = pd.concat([self.hotels_df, location_dummies], axis=1)
        else:
            location_dummies = pd.DataFrame(index=self.hotels_df.index)

        # Normalize numeric columns for price and rating (watch column names)
        price_col = 'price_per_night' if 'price_per_night' in self.hotels_df.columns else ('price_per_night_inr' if 'price_per_night_inr' in self.hotels_df.columns else 'price_per_night_inr')
        rating_col = 'star_rating' if 'star_rating' in self.hotels_df.columns else 'star_rating_clean' if 'star_rating_clean' in self.hotels_df.columns else None

        # Ensure numeric columns exist and fillna
        if price_col in self.hotels_df.columns:
            self.hotels_df[price_col] = pd.to_numeric(self.hotels_df[price_col], errors='coerce').fillna(self.hotels_df[price_col].median())
            self.hotels_df['price_normalized'] = self.scaler.fit_transform(self.hotels_df[[price_col]])
        else:
            self.hotels_df['price_normalized'] = 0.0

        if rating_col:
            self.hotels_df[rating_col] = pd.to_numeric(self.hotels_df[rating_col], errors='coerce').fillna(self.hotels_df[rating_col].median())
            self.hotels_df['rating_normalized'] = self.scaler.fit_transform(self.hotels_df[[rating_col]])
        else:
            self.hotels_df['rating_normalized'] = 0.0

        # property type dummies if present
        property_col = 'hotel_type' if 'hotel_type' in self.hotels_df.columns else ('property_type' if 'property_type' in self.hotels_df.columns else None)
        if property_col:
            property_dummies = pd.get_dummies(self.hotels_df[property_col].fillna('Unknown'), prefix='property')
            self.hotels_df = pd.concat([self.hotels_df, property_dummies], axis=1)
        else:
            property_dummies = pd.DataFrame(index=self.hotels_df.index)

        # assemble feature columns list
        self.feature_columns = (
            self.amenity_features +
            list(location_dummies.columns) +
            ['price_normalized', 'rating_normalized'] +
            list(property_dummies.columns)
        )

        # Build combined feature vector (as numpy array)
        # ensure all feature columns present (if new dummies missing, add zeros)
        for col in self.feature_columns:
            if col not in self.hotels_df.columns:
                self.hotels_df[col] = 0

        self.hotels_df['feature_vector'] = self.hotels_df[self.feature_columns].values.tolist()
        return self

    def _create_user_profile(self, user_preferences: Dict):
        user_vector = np.zeros(len(self.feature_columns))
        feature_idx_map = {feature: idx for idx, feature in enumerate(self.feature_columns)}

        # city
        if 'city' in user_preferences:
            city_feature = f"city_{user_preferences['city']}"
            if city_feature in feature_idx_map:
                user_vector[feature_idx_map[city_feature]] = 1.0

        # preferred_amenities may be JSON string or list
        if 'preferred_amenities' in user_preferences and user_preferences['preferred_amenities']:
            pref = user_preferences['preferred_amenities']
            if isinstance(pref, str):
                try:
                    preferred_amenities = json.loads(pref)
                except Exception:
                    preferred_amenities = [pref]
            else:
                preferred_amenities = pref
            for amenity in preferred_amenities:
                # amenity names in user_profiles may be like "pool" whereas our features are "amenity_pool"
                col_name = amenity if amenity.startswith('amenity_') else f'amenity_{amenity}'
                if col_name in feature_idx_map:
                    user_vector[feature_idx_map[col_name]] = 1.0

        # budget influence (approx)
        if 'budget_min_inr' in user_preferences and 'budget_max_inr' in user_preferences:
            # put 1 on price_normalized if within range
            price_norm_idx = feature_idx_map.get('price_normalized')
            if price_norm_idx is not None:
                user_vector[price_norm_idx] = 1.0

        return user_vector

    def calculate_similarity(self, user_preferences: Dict, top_n: int = 10) -> Tuple[np.ndarray, np.ndarray]:
        if self.hotels_df is None:
            raise ValueError("Load data first")
        user_vector = self._create_user_profile(user_preferences)
        hotel_vectors = np.array(self.hotels_df['feature_vector'].tolist())
        sims = cosine_similarity([user_vector], hotel_vectors)[0]
        filtered_indices = self._apply_filters(user_preferences)
        filtered_sims = sims[filtered_indices]
        top_indices = filtered_indices[np.argsort(filtered_sims)[-top_n:][::-1]]
        return top_indices, sims[top_indices]

    def _apply_filters(self, user_preferences: Dict):
        mask = pd.Series(True, index=self.hotels_df.index)
        if 'city' in user_preferences:
            city_col = 'city' if 'city' in self.hotels_df.columns else 'location'
            if city_col:
                mask &= (self.hotels_df[city_col] == user_preferences['city'])
        if 'budget_min_inr' in user_preferences and 'budget_max_inr' in user_preferences:
            price_col = 'price_per_night' if 'price_per_night' in self.hotels_df.columns else ('price_per_night_inr' if 'price_per_night_inr' in self.hotels_df.columns else None)
            if price_col:
                mask &= (pd.to_numeric(self.hotels_df[price_col], errors='coerce').fillna(0) >= user_preferences['budget_min_inr']) & \
                        (pd.to_numeric(self.hotels_df[price_col], errors='coerce').fillna(0) <= user_preferences['budget_max_inr'])
        if 'min_star_rating' in user_preferences and 'star_rating' in self.hotels_df.columns:
            mask &= (pd.to_numeric(self.hotels_df['star_rating'], errors='coerce').fillna(0) >= user_preferences['min_star_rating'])
        # return indices that survive
        return mask[mask].index

    def recommend(self, user_preferences: Dict, top_n: int = 5):
        top_indices, similarities = self.calculate_similarity(user_preferences, top_n * 2)
        recs = []
        for idx_num, hotel_idx in enumerate(top_indices[:top_n]):
            hotel = self.hotels_df.iloc[hotel_idx]
            recs.append({
                'hotel_id': int(hotel.get('hotel_id', hotel.get('hotel_id'))),
                'hotel_name': hotel.get('hotel_name', hotel.get('hotel_name', hotel.get('name', 'Unknown'))),
                'city': hotel.get('city', hotel.get('location', 'Unknown')),
                'price_per_night': float(hotel.get('price_per_night', hotel.get('price_per_night_inr', 0))),
                'star_rating': hotel.get('star_rating', None),
                'similarity_score': float(similarities[idx_num])
            })
        return recs

# Cell 3 — Load real Kaggle input files (paths from your message)
hotels_df = pd.read_csv('/kaggle/input/hotel-master/hotels_master.csv')
processed_data = pd.read_csv('/kaggle/input/preprocessed-data/preprocessed_hotel_data_final_new.csv')
user_hotel_interactions = pd.read_csv('/kaggle/input/user-hotel-interactions/user_hotel_interactions.csv')
user_profiles = pd.read_csv('/kaggle/input/user-profiles/user_profiles.csv')

# Inspect briefly
print("hotels_df:", hotels_df.shape)
print("processed_data:", processed_data.shape)
print("user_hotel_interactions:", user_hotel_interactions.shape)
print("user_profiles:", user_profiles.shape)

# Cell 4 — Initialize content-based recommender with the PREPROCESSED file (better for amenities)
cb_recommender = ContentBasedRecommender()
# Use 'processed_data' (it has amenity_ prefixed columns)
cb_recommender.load_data(processed_data)

# Cell 5 — Train collaborative model (Surprise SVD)
reader = Reader(rating_scale=(0, 5))
# ensure rating column exists and is numeric
user_hotel_interactions['rating'] = pd.to_numeric(user_hotel_interactions['rating'], errors='coerce').fillna(0)
data = Dataset.load_from_df(user_hotel_interactions[['user_id', 'hotel_id', 'rating']], reader)
algo = SVD(n_factors=50, random_state=42)
# You can cross-validate (optional, somewhat slow)
# cross_validate(algo, data, measures=['RMSE','MAE'], cv=3, verbose=True)
trainset = data.build_full_trainset()
algo.fit(trainset)

# helper function for collaborative recommendations
hotel_mapping = processed_data[['hotel_id', 'hotel_name']].drop_duplicates()
all_hotels = processed_data['hotel_id'].unique().tolist()

def get_top_n_collab(user_id: int, n: int = 5):
    # candidate hotels: all hotels not rated by user
    rated_hotels = user_hotel_interactions[user_hotel_interactions['user_id'] == user_id]['hotel_id'].unique().tolist()
    hotels_to_predict = [h for h in all_hotels if h not in rated_hotels]
    # make predictions
    preds = [algo.predict(user_id, h) for h in hotels_to_predict]
    preds.sort(key=lambda x: x.est, reverse=True)
    top_preds = preds[:n]
    df = pd.DataFrame([(int(p.iid), float(p.est)) for p in top_preds], columns=['hotel_id', 'predicted_rating'])
    df = df.merge(hotel_mapping, on='hotel_id', how='left')
    return df

# Cell 6 — Ensemble / merging function
def ensemble_recommend(user_preferences: Dict, user_id: int = None, top_n: int = 5, weight_cf: float = 0.6, weight_cb: float = 0.4):
    """
    City is top priority:
      - If a city is provided and there are no hotels in that city, return a short message telling user to change city.
      - If city is provided and hotels exist, restrict BOTH content-based and collaborative candidates to that city.
      - If no city provided, fall back to previous behavior.
    """
    # Check for city preference and available hotels
    city_pref = user_preferences.get('city') if isinstance(user_preferences, dict) else None

    if city_pref:
        # ensure city column exists
        city_col = 'city' if 'city' in processed_data.columns else ('location' if 'location' in processed_data.columns else None)
        if city_col is None:
            # no city info available in processed_data -> fallback behavior
            allowed_hotel_ids = processed_data['hotel_id'].unique().tolist()
        else:
            allowed_hotel_ids = processed_data[processed_data[city_col] == city_pref]['hotel_id'].unique().tolist()

        if len(allowed_hotel_ids) == 0:
            # No hotels in requested city -> instruct user to change city preference
            return {"status": "no_city_matches", "message": "change your preference city"}
    else:
        # no city preference -> all hotels allowed
        allowed_hotel_ids = processed_data['hotel_id'].unique().tolist()

    # --- Content-based results (filtered to allowed_hotel_ids) ---
    cb_recs = cb_recommender.recommend(user_preferences, top_n=top_n*2)
    cb_df = pd.DataFrame(cb_recs)
    if cb_df.empty:
        cb_df = pd.DataFrame(columns=['hotel_id', 'similarity_score'])
    else:
        # filter to allowed hotels only
        cb_df = cb_df[cb_df['hotel_id'].isin(allowed_hotel_ids)].copy()
    cb_df['hotel_id'] = cb_df['hotel_id'].astype(int)

    # Normalize CB similarity
    if not cb_df.empty and cb_df['similarity_score'].max() > cb_df['similarity_score'].min():
        cb_df['cb_norm'] = (cb_df['similarity_score'] - cb_df['similarity_score'].min()) / (cb_df['similarity_score'].max() - cb_df['similarity_score'].min() + 1e-9)
    elif not cb_df.empty:
        cb_df['cb_norm'] = 1.0  # all equal similarity -> set 1
    else:
        cb_df['cb_norm'] = 0.0

    # --- Collaborative results (filtered to allowed_hotel_ids) ---
    if user_id is not None:
        cf_df = get_top_n_collab(user_id, n=top_n*5)
        if not cf_df.empty:
            cf_df = cf_df[cf_df['hotel_id'].isin(allowed_hotel_ids)].copy()
            if not cf_df.empty and cf_df['predicted_rating'].max() > cf_df['predicted_rating'].min():
                cf_df['cf_norm'] = (cf_df['predicted_rating'] - cf_df['predicted_rating'].min()) / (cf_df['predicted_rating'].max() - cf_df['predicted_rating'].min() + 1e-9)
            elif not cf_df.empty:
                cf_df['cf_norm'] = 1.0
            else:
                cf_df['cf_norm'] = 0.0
        else:
            cf_df = pd.DataFrame(columns=['hotel_id', 'predicted_rating', 'cf_norm'])
    else:
        cf_df = pd.DataFrame(columns=['hotel_id', 'predicted_rating', 'cf_norm'])

    # If both are empty after filtering, return message
    if cb_df.empty and cf_df.empty:
        return {"status": "no_matches_after_filter", "message": "change your preference city or relax other filters"}

    # Merge and compute combined score (only on hotel_id)
    merged = pd.merge(cb_df[['hotel_id','cb_norm']], cf_df[['hotel_id','cf_norm']], on='hotel_id', how='outer').fillna(0)
    merged['combined_score'] = weight_cf * merged['cf_norm'] + weight_cb * merged['cb_norm']
    merged = merged.sort_values('combined_score', ascending=False).head(top_n)

    # Enrich with hotel metadata
    merged = merged.merge(processed_data[['hotel_id','hotel_name','city','price_per_night']], on='hotel_id', how='left').drop_duplicates('hotel_id')

    # Build final response
    results = []
    for _, row in merged.iterrows():
        explanation = []
        if row.get('cf_norm', 0) > 0:
            explanation.append(f"Collaborative score contribution: {row['cf_norm']:.3f}")
        if row.get('cb_norm', 0) > 0:
            explanation.append(f"Content similarity contribution: {row['cb_norm']:.3f}")
        results.append({
            'hotel_id': int(row['hotel_id']),
            'hotel_name': row.get('hotel_name', 'Unknown'),
            'city': row.get('city', ''),
            'price_per_night': float(row.get('price_per_night')) if not pd.isna(row.get('price_per_night')) else None,
            'combined_score': float(row['combined_score']),
            'explanation': " | ".join(explanation)
        })

    return results

# Cell 7 — Wrap as LangChain Tools
def cb_tool_func(query_json: str) -> str:
    """
    Input: JSON string with user preferences (same structure as user_profiles row).
    Output: JSON string with top-5 content-based recommendations.
    """
    prefs = json.loads(query_json)
    recs = cb_recommender.recommend(prefs, top_n=5)
    return json.dumps(recs, default=str)

def cf_tool_func(query_json: str) -> str:
    """
    Input: JSON string with {'user_id': <int>, 'n': 5}
    Output: JSON string with top-n collaborative recommendations
    """
    payload = json.loads(query_json)
    uid = int(payload.get('user_id'))
    n = int(payload.get('n', 5))
    df = get_top_n_collab(uid, n=n)
    return df.to_json(orient='records')

def ensemble_tool_func(query_json: str) -> str:
    """
    Input JSON: {'user_id': <int or null>, 'user_preferences': {...}, 'top_n': 5}
    Output: JSON array with final recommendations
    """
    payload = json.loads(query_json)
    uid = payload.get('user_id')  # can be null
    user_prefs = payload.get('user_preferences', {})
    top_n = int(payload.get('top_n', 5))
    results = ensemble_recommend(user_prefs, user_id=uid, top_n=top_n)
    return json.dumps(results, default=str)

# Create LangChain Tool objects
cb_tool = Tool(
    name="content_based_recommender",
    func=cb_tool_func,
    description="Given a JSON-encoded user preferences object, returns top content-based hotel recommendations."
)

cf_tool = Tool(
    name="collaborative_recommender",
    func=cf_tool_func,
    description="Given a JSON string with user_id and n, returns top-n collaborative recommendations."
)

ensemble_tool = Tool(
    name="ensemble_recommender",
    func=ensemble_tool_func,
    description="Given a JSON payload containing user_id (optional) and user_preferences, returns merged recommendations."
)

# Cell 8 — Build LangChain Agent (requires GEMINI_API_KEY)
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', None)
if GEMINI_API_KEY is None:
    print("GEMINI_API_KEY not set in environment. To use the LangChain agent, set GEMINI_API_KEY in Kaggle secrets.")
else:
    from langchain_google_genai import ChatGoogleGenerativeAI
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0, google_api_key=GEMINI_API_KEY)

    tools = [cb_tool, cf_tool, ensemble_tool]
    # initialize a zero-shot agent (tool-using)
    agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)

    # Example prompt you can give to the agent (string). The agent will choose tools as needed.
    example_query = (
        "Recommend 5 hotels for user with user_id=1. "
        "The user preferences: city='Bangalore', preferred_amenities=['pool','gym'], budget_min_inr=1000, budget_max_inr=10000, min_star_rating=4. "
        "Please use collaborative info where possible and combine with content-based, provide short explanations."
    )
    # The agent expects natural language; we embed a JSON payload where needed. Example:
    agent_input = (
        "Use ensemble_recommender tool with this JSON payload: "
        + json.dumps({
            "user_id": 1,
            "user_preferences": {
                "city": "Bangalore",
                "preferred_amenities": ["pool", "gym"],
                "budget_min_inr": 1000,
                "budget_max_inr": 10000,
                "min_star_rating": 4
            },
            "top_n": 5
        })
    )
    # Run agent (if GEMINI_API_KEY present)
    resp = agent.run(agent_input)
    print(resp)

# Paste this entire block to replace your previous generate_llm_explanations + deterministic_explanation
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import HumanMessage
from google.api_core.exceptions import ResourceExhausted, NotFound
import traceback

def deterministic_explanation(r, user_prefs):
    """Simple rule-based explanation when LLM is unavailable."""
    parts = []
    # city match
    if user_prefs.get('city') and r.get('city') == user_prefs.get('city'):
        parts.append(f"Located in your chosen city {r.get('city')}.")
    # price
    price = r.get('price_per_night')
    if price is not None and 'budget_min_inr' in user_prefs and 'budget_max_inr' in user_prefs:
        try:
            if user_prefs['budget_min_inr'] <= price <= user_prefs['budget_max_inr']:
                parts.append(f"Priced within your budget at ₹{int(price):,} per night.")
            elif price < user_prefs['budget_min_inr']:
                parts.append(f"Below your budget at ₹{int(price):,} per night.")
            else:
                parts.append(f"Higher than your stated max budget at ₹{int(price):,}.")
        except Exception:
            pass
    # star rating
    if user_prefs.get('min_star_rating') and r.get('star_rating') is not None:
        try:
            if float(r.get('star_rating')) >= float(user_prefs['min_star_rating']):
                parts.append(f"Meets your star rating requirement ({r.get('star_rating')}★).")
        except Exception:
            pass
    # scores
    if r.get('combined_score') is not None:
        parts.append(f"Combined score {r.get('combined_score'):.2f}.")
    # fallback
    if not parts:
        parts = ["Recommended based on collaborative signals and content similarity."]
    explanation = " ".join(parts)
    # Keep it brief (one or two sentences)
    if len(explanation) > 240:
        explanation = explanation[:237] + "..."
    return explanation

def _extract_text_from_llm_response(resp):
    """
    Normalize a variety of possible LangChain/LLM response shapes to extract text:
    - If resp is a string -> return it
    - If resp has attribute 'content' -> return resp.content
    - If resp has attribute 'generations' -> grab first generation text
    - If resp is a dict/list -> try to search common fields
    """
    try:
        if resp is None:
            return ""
        # plain string
        if isinstance(resp, str):
            return resp
        # object with content attribute (ChatMessage-like)
        if hasattr(resp, "content"):
            return str(resp.content)
        # LLMResult in some LangChain versions: resp.generations -> list[list[Generation]]
        if hasattr(resp, "generations"):
            gens = resp.generations
            # gens might be list of lists
            if isinstance(gens, list) and len(gens) > 0:
                first = gens[0]
                # first can be list or Generation
                if isinstance(first, list) and len(first) > 0:
                    g0 = first[0]
                    # Generation has .text or .generation
                    if hasattr(g0, "text"):
                        return str(g0.text)
                    elif hasattr(g0, "generation"):
                        return str(g0.generation)
                else:
                    g0 = first
                    if hasattr(g0, "text"):
                        return str(g0.text)
                    elif hasattr(g0, "generation"):
                        return str(g0.generation)
        # dict-like
        if isinstance(resp, dict):
            # try common keys
            for k in ("text", "output", "content", "response"):
                if k in resp and resp[k]:
                    return str(resp[k])
            # nested possibilities
            if "choices" in resp and isinstance(resp["choices"], list) and len(resp["choices"]) > 0:
                ch = resp["choices"][0]
                if isinstance(ch, dict):
                    for k in ("text", "message", "content"):
                        if k in ch:
                            return str(ch[k])
        # list-like: join texts
        if isinstance(resp, list):
            texts = []
            for item in resp:
                try:
                    texts.append(_extract_text_from_llm_response(item))
                except Exception:
                    continue
            return " ".join([t for t in texts if t])
        # lastly, fallback to string conversion
        return str(resp)
    except Exception:
        return str(resp)

def generate_llm_explanations(recs, user_prefs, model_name="gemini-2.0-flash", max_tokens=150):
    """
    Generate LLM explanations for each recommendation in `recs`.
    - recs: list of dicts from ensemble_recommend (each must contain 'hotel_id', 'hotel_name', etc.)
    - user_prefs: dict used to compute recommendations
    - model_name: exact Gemini model name (from your model listing). Default uses gemini-2.0-flash.
    Returns: new list with 'llm_explanation' field added for each rec (falls back deterministically on errors).
    """
    gemini_key = os.environ.get("GEMINI_API_KEY")
    if not gemini_key:
        # No key: attach deterministic explanations and return
        for r in recs:
            r['llm_explanation'] = deterministic_explanation(r, user_prefs)
        return recs

    # instantiate LLM
    try:
        llm = ChatGoogleGenerativeAI(model=model_name, temperature=0.0, max_output_tokens=max_tokens, google_api_key=gemini_key)
    except Exception as e:
        # if initialization fails, fallback deterministically
        print("LLM init error - falling back to deterministic explanations:", e)
        for r in recs:
            r['llm_explanation'] = deterministic_explanation(r, user_prefs)
        return recs

    enriched = []
    for r in recs:
        try:
            # Locate hotel metadata row if available
            hid = None
            try:
                hid = int(r.get('hotel_id'))
            except Exception:
                hid = None

            hotel_row = processed_data[processed_data['hotel_id'] == hid] if hid is not None else pd.DataFrame()

            # Build facts (compact)
            facts = {}
            if not hotel_row.empty:
                hr = hotel_row.iloc[0]
                facts['hotel_name'] = hr.get('hotel_name') if 'hotel_name' in hr.index else hr.get('name', r.get('hotel_name'))
                facts['city'] = hr.get('city', hr.get('location', r.get('city')))
                # attempt to get reasonable price
                for pc in ['price_per_night', 'price_per_night_inr', 'price_per_night_inr']:
                    if pc in hr.index and not pd.isna(hr.get(pc)):
                        try:
                            facts['price'] = int(hr.get(pc))
                            break
                        except Exception:
                            pass
                facts['star_rating'] = hr.get('star_rating') if 'star_rating' in hr.index else hr.get('star_rating_clean', r.get('star_rating'))
                # amenities
                amenity_cols = [c for c in processed_data.columns if str(c).startswith('amenity_')]
                present_amenities = []
                for ac in amenity_cols:
                    try:
                        if int(hr.get(ac, 0)) == 1:
                            present_amenities.append(ac.replace('amenity_','').replace('_',' '))
                    except Exception:
                        pass
                facts['amenities'] = present_amenities[:6]
            else:
                facts['hotel_name'] = r.get('hotel_name')
                facts['city'] = r.get('city')
                facts['price'] = r.get('price_per_night')
                facts['star_rating'] = r.get('star_rating')
                facts['amenities'] = []

            # ensemble scores
            cb_score = r.get('similarity_score') or r.get('cb_norm') or None
            cf_score = r.get('predicted_rating') or r.get('cf_norm') or None
            combined = r.get('combined_score')

            # Build prompt
            prompt_lines = [
                "You are a helpful recommender-system assistant. Produce a concise (2-3 sentences) explanation — no lists — explaining WHY this hotel is recommended to the user. Mention the user's key preferences and any tradeoffs (price, rating, amenities).",
                "",
                f"User preferences: {json.dumps(user_prefs)}",
                "",
                "Hotel facts:",
                f"  name: {facts.get('hotel_name')}",
                f"  city: {facts.get('city')}",
            ]
            if facts.get('price') is not None:
                prompt_lines.append(f"  price_per_night: ₹{facts.get('price'):,}")
            if facts.get('star_rating') is not None:
                prompt_lines.append(f"  star_rating: {facts.get('star_rating')}")
            if facts.get('amenities'):
                prompt_lines.append(f"  amenities (sample): {', '.join(facts.get('amenities'))}")
            if cb_score is not None:
                prompt_lines.append(f"  content_similarity_score: {cb_score}")
            if cf_score is not None:
                prompt_lines.append(f"  collaborative_score: {cf_score}")
            if combined is not None:
                try:
                    prompt_lines.append(f"  combined_score: {combined:.3f}")
                except Exception:
                    pass

            prompt_lines.append("")
            prompt_lines.append("Write the explanation now:")

            prompt = "\n".join(prompt_lines)

            # call Gemini correctly using a HumanMessage and invoke
            human_msg = HumanMessage(content=prompt)
            resp = llm.invoke([human_msg])

            text = _extract_text_from_llm_response(resp)
            text = text.strip().replace("\n", " ")
            # safety: if LLM outputs empty, fallback to deterministic
            if not text:
                text = deterministic_explanation(r, user_prefs)

            r['llm_explanation'] = text
            enriched.append(r)

        except ResourceExhausted as e:
            # quota problem -> fallback deterministic
            r['llm_explanation'] = deterministic_explanation(r, user_prefs)
            enriched.append(r)
        except NotFound as e:
            # model not found or method not supported -> fallback deterministic
            r['llm_explanation'] = deterministic_explanation(r, user_prefs)
            enriched.append(r)
        except Exception as e:
            # Any other error: deterministic fallback + debug print
            r['llm_explanation'] = deterministic_explanation(r, user_prefs)
            enriched.append(r)
            print(f"LLM explain error for hotel_id {r.get('hotel_id')}: {e}")
            traceback.print_exc()

    return enriched

# Example usage (uncomment to run):
example_user_prefs = {
     "city": "Bangalore",
     "preferred_amenities": ["pool", "gym"],
     "budget_min_inr": 1000,
     "budget_max_inr": 10000,
     "min_star_rating": 4
 }
ensemble_results = ensemble_recommend(example_user_prefs, user_id=1, top_n=5)
enriched_results = generate_llm_explanations(ensemble_results, example_user_prefs, model_name="gemini-1.5-pro-002", max_tokens=120)
print(json.dumps(enriched_results, indent=2, ensure_ascii=False))



# Cell 9 — Example direct calls without LLM (recommended for testing quickly)
# Get content-based recommendations for a sample user profile (from user_profiles)
sample_user = user_profiles.iloc[0].to_dict()
# user_profiles has preferred_amenities as string e.g. '["pool", "family_rooms", "parking"]'
print("\nContent-based recs (direct):")
print(cb_tool_func(json.dumps({
    "city": sample_user.get('home_city', sample_user.get('city', None)),
    "preferred_amenities": sample_user.get('preferred_amenities'),
    "budget_min_inr": int(sample_user.get('budget_min_inr', 0)),
    "budget_max_inr": int(sample_user.get('budget_max_inr', 999999)),
    "min_star_rating": int(sample_user.get('min_star_rating', 0)) if sample_user.get('min_star_rating') else None
})))

print("\nCollaborative recs (direct) for user_id=1:")
print(cf_tool_func(json.dumps({"user_id": 1, "n": 5})))

print("\nEnsemble recs (direct):")
print(ensemble_tool_func(json.dumps({
    "user_id": 1,
    "user_preferences": {
        "city": "Bangalore",
        "preferred_amenities": ["pool", "gym"],
        "budget_min_inr": 1000,
        "budget_max_inr": 10000,
        "min_star_rating": 4
    },
    "top_n": 5
})))

hotels_df: (400, 27)
processed_data: (4632, 39)
user_hotel_interactions: (1288, 11)
user_profiles: (300, 16)


[1m> Entering new AgentExecutor chain...[0m


  agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
  resp = agent.run(agent_input)


[32;1m[1;3mI should use the ensemble_recommender tool to get the merged hotel recommendations based on the provided user ID and preferences.
Action: ensemble_recommender
Action Input: {"user_id": 1, "user_preferences": {"city": "Bangalore", "preferred_amenities": ["pool", "gym"], "budget_min_inr": 1000, "budget_max_inr": 10000, "min_star_rating": 4}, "top_n": 5}[0m
Observation: [38;5;200m[1;3m[{"hotel_id": 298, "hotel_name": "Fairfield By Marriott Bengaluru Rajajinagar", "city": "Bangalore", "price_per_night": 5400.0, "combined_score": 0.6027283963837166, "explanation": "Collaborative score contribution: 1.000 | Content similarity contribution: 0.007"}, {"hotel_id": 1338, "hotel_name": "The Leela Bhartiya City", "city": "Bangalore", "price_per_night": 4300.0, "combined_score": 0.3999999971671504, "explanation": "Content similarity contribution: 1.000"}, {"hotel_id": 1607, "hotel_name": "Radisson Bengaluru City Center", "city": "Bangalore", "price_per_night": 7500.0, "combined_scor

In [8]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from langchain.tools import Tool
from langchain.agents import initialize_agent, AgentType
# Note: we don't import OpenAI here when using Gemini; keep for compatibility if needed
from langchain.llms import OpenAI
from langchain import LLMChain, PromptTemplate
from typing import List, Dict, Tuple

# -------------------------
# Content-based recommender
# -------------------------
class ContentBasedRecommender:
    def __init__(self):
        self.hotels_df = None
        self.feature_weights = {
            'amenities': 0.35,
            'location': 0.25,
            'price': 0.20,
            'star_rating': 0.15,
            'property_type': 0.05
        }
        self.amenity_features = [
            'amenity_free_wifi', 'amenity_air_conditioning', 'amenity_parking', 'amenity_room_service',
            'amenity_24_7_front_desk', 'amenity_restaurant', 'amenity_gym', 'amenity_pool', 'amenity_spa',
            'amenity_business_center', 'amenity_conference_hall', 'amenity_airport_shuttle',
            'amenity_meeting_rooms', 'amenity_vegetarian_restaurant'
        ]
        self.scaler = MinMaxScaler()

    def load_data(self, hotels_df: pd.DataFrame):
        self.hotels_df = hotels_df.copy()
        # ensure amenity columns present and integer
        for amenity in self.amenity_features:
            if amenity in self.hotels_df.columns:
                self.hotels_df[amenity] = self.hotels_df[amenity].fillna(0).astype(int)
            else:
                self.hotels_df[amenity] = 0

        # city/location dummies
        city_col = 'city' if 'city' in self.hotels_df.columns else ('location' if 'location' in self.hotels_df.columns else None)
        if city_col is not None:
            location_dummies = pd.get_dummies(self.hotels_df[city_col].fillna('Unknown'), prefix='city')
            self.hotels_df = pd.concat([self.hotels_df, location_dummies], axis=1)
        else:
            location_dummies = pd.DataFrame(index=self.hotels_df.index)

        # price normalization
        price_col = 'price_per_night' if 'price_per_night' in self.hotels_df.columns else ('price_per_night_inr' if 'price_per_night_inr' in self.hotels_df.columns else 'price_per_night_inr')
        rating_col = 'star_rating' if 'star_rating' in self.hotels_df.columns else ('star_rating_clean' if 'star_rating_clean' in self.hotels_df.columns else None)

        if price_col in self.hotels_df.columns:
            self.hotels_df[price_col] = pd.to_numeric(self.hotels_df[price_col], errors='coerce').fillna(self.hotels_df[price_col].median())
            self.hotels_df['price_normalized'] = self.scaler.fit_transform(self.hotels_df[[price_col]])
        else:
            self.hotels_df['price_normalized'] = 0.0

        if rating_col:
            self.hotels_df[rating_col] = pd.to_numeric(self.hotels_df[rating_col], errors='coerce').fillna(self.hotels_df[rating_col].median())
            self.hotels_df['rating_normalized'] = self.scaler.fit_transform(self.hotels_df[[rating_col]])
        else:
            self.hotels_df['rating_normalized'] = 0.0

        # property type dummies
        property_col = 'hotel_type' if 'hotel_type' in self.hotels_df.columns else ('property_type' if 'property_type' in self.hotels_df.columns else None)
        if property_col:
            property_dummies = pd.get_dummies(self.hotels_df[property_col].fillna('Unknown'), prefix='property')
            self.hotels_df = pd.concat([self.hotels_df, property_dummies], axis=1)
        else:
            property_dummies = pd.DataFrame(index=self.hotels_df.index)

        self.feature_columns = (
            self.amenity_features +
            list(location_dummies.columns) +
            ['price_normalized', 'rating_normalized'] +
            list(property_dummies.columns)
        )

        for col in self.feature_columns:
            if col not in self.hotels_df.columns:
                self.hotels_df[col] = 0

        self.hotels_df['feature_vector'] = self.hotels_df[self.feature_columns].values.tolist()
        return self

    def _create_user_profile(self, user_preferences: Dict):
        user_vector = np.zeros(len(self.feature_columns))
        feature_idx_map = {feature: idx for idx, feature in enumerate(self.feature_columns)}

        # city
        if 'city' in user_preferences:
            city_feature = f"city_{user_preferences['city']}"
            if city_feature in feature_idx_map:
                user_vector[feature_idx_map[city_feature]] = 1.0

        # preferred amenities
        if 'preferred_amenities' in user_preferences and user_preferences['preferred_amenities']:
            pref = user_preferences['preferred_amenities']
            if isinstance(pref, str):
                try:
                    preferred_amenities = json.loads(pref)
                except Exception:
                    preferred_amenities = [pref]
            else:
                preferred_amenities = pref
            for amenity in preferred_amenities:
                col_name = amenity if amenity.startswith('amenity_') else f'amenity_{amenity}'
                if col_name in feature_idx_map:
                    user_vector[feature_idx_map[col_name]] = 1.0

        # budget influence
        if 'budget_min_inr' in user_preferences and 'budget_max_inr' in user_preferences:
            price_norm_idx = feature_idx_map.get('price_normalized')
            if price_norm_idx is not None:
                user_vector[price_norm_idx] = 1.0

        return user_vector

    def calculate_similarity(self, user_preferences: Dict, top_n: int = 10) -> Tuple[np.ndarray, np.ndarray]:
        if self.hotels_df is None:
            raise ValueError("Load data first")
        user_vector = self._create_user_profile(user_preferences)
        hotel_vectors = np.array(self.hotels_df['feature_vector'].tolist())
        sims = cosine_similarity([user_vector], hotel_vectors)[0]
        filtered_indices = self._apply_filters(user_preferences)
        filtered_sims = sims[filtered_indices]
        top_indices = filtered_indices[np.argsort(filtered_sims)[-top_n:][::-1]]
        return top_indices, sims[top_indices]

    def _apply_filters(self, user_preferences: Dict):
        mask = pd.Series(True, index=self.hotels_df.index)
        if 'city' in user_preferences:
            city_col = 'city' if 'city' in self.hotels_df.columns else 'location'
            if city_col:
                mask &= (self.hotels_df[city_col] == user_preferences['city'])
        if 'budget_min_inr' in user_preferences and 'budget_max_inr' in user_preferences:
            price_col = 'price_per_night' if 'price_per_night' in self.hotels_df.columns else ('price_per_night_inr' if 'price_per_night_inr' in self.hotels_df.columns else None)
            if price_col:
                mask &= (pd.to_numeric(self.hotels_df[price_col], errors='coerce').fillna(0) >= user_preferences['budget_min_inr']) & \
                        (pd.to_numeric(self.hotels_df[price_col], errors='coerce').fillna(0) <= user_preferences['budget_max_inr'])
        if 'min_star_rating' in user_preferences and 'star_rating' in self.hotels_df.columns:
            mask &= (pd.to_numeric(self.hotels_df['star_rating'], errors='coerce').fillna(0) >= user_preferences['min_star_rating'])
        return mask[mask].index

    def recommend(self, user_preferences: Dict, top_n: int = 5):
        top_indices, similarities = self.calculate_similarity(user_preferences, top_n * 2)
        recs = []
        for idx_num, hotel_idx in enumerate(top_indices[:top_n]):
            hotel = self.hotels_df.iloc[hotel_idx]
            recs.append({
                'hotel_id': int(hotel.get('hotel_id', hotel.get('hotel_id'))),
                'hotel_name': hotel.get('hotel_name', hotel.get('hotel_name', hotel.get('name', 'Unknown'))),
                'city': hotel.get('city', hotel.get('location', 'Unknown')),
                'price_per_night': float(hotel.get('price_per_night', hotel.get('price_per_night_inr', 0))),
                'star_rating': hotel.get('star_rating', None),
                'similarity_score': float(similarities[idx_num])
            })
        return recs

# -------------------------
# Load data
# -------------------------
hotels_df = pd.read_csv('/kaggle/input/hotel-master/hotels_master.csv')
processed_data = pd.read_csv('/kaggle/input/preprocessed-data/preprocessed_hotel_data_final_new.csv')
user_hotel_interactions = pd.read_csv('/kaggle/input/user-hotel-interactions/user_hotel_interactions.csv')
user_profiles = pd.read_csv('/kaggle/input/user-profiles/user_profiles.csv')

print("hotels_df:", hotels_df.shape)
print("processed_data:", processed_data.shape)
print("user_hotel_interactions:", user_hotel_interactions.shape)
print("user_profiles:", user_profiles.shape)

# initialize CB
cb_recommender = ContentBasedRecommender()
cb_recommender.load_data(processed_data)

# -------------------------
# Collaborative (Surprise SVD)
# -------------------------
reader = Reader(rating_scale=(0, 5))
user_hotel_interactions['rating'] = pd.to_numeric(user_hotel_interactions['rating'], errors='coerce').fillna(0)
data = Dataset.load_from_df(user_hotel_interactions[['user_id', 'hotel_id', 'rating']], reader)
algo = SVD(n_factors=50, random_state=42)
trainset = data.build_full_trainset()
algo.fit(trainset)

hotel_mapping = processed_data[['hotel_id', 'hotel_name']].drop_duplicates()
all_hotels = processed_data['hotel_id'].unique().tolist()

def get_top_n_collab(user_id: int, n: int = 5):
    rated_hotels = user_hotel_interactions[user_hotel_interactions['user_id'] == user_id]['hotel_id'].unique().tolist()
    hotels_to_predict = [h for h in all_hotels if h not in rated_hotels]
    preds = [algo.predict(user_id, h) for h in hotels_to_predict]
    preds.sort(key=lambda x: x.est, reverse=True)
    top_preds = preds[:n]
    df = pd.DataFrame([(int(p.iid), float(p.est)) for p in top_preds], columns=['hotel_id', 'predicted_rating'])
    df = df.merge(hotel_mapping, on='hotel_id', how='left')
    return df

# -------------------------
# Ensemble recommender (city-first)
# -------------------------
def ensemble_recommend(user_preferences: Dict, user_id: int = None, top_n: int = 5, weight_cf: float = 0.6, weight_cb: float = 0.4):
    # ensure user_preferences is dict
    user_preferences = user_preferences or {}
    city_pref = user_preferences.get('city')

    # determine city column
    city_col = 'city' if 'city' in processed_data.columns else ('location' if 'location' in processed_data.columns else None)
    if city_pref:
        if city_col is None:
            allowed_hotel_ids = processed_data['hotel_id'].unique().tolist()
        else:
            allowed_hotel_ids = processed_data[processed_data[city_col] == city_pref]['hotel_id'].unique().tolist()
        if len(allowed_hotel_ids) == 0:
            return {"status": "no_city_matches", "message": "change your preference city"}
    else:
        allowed_hotel_ids = processed_data['hotel_id'].unique().tolist()

    # content-based
    cb_recs = cb_recommender.recommend(user_preferences, top_n=top_n*2)
    cb_df = pd.DataFrame(cb_recs)
    if cb_df.empty:
        cb_df = pd.DataFrame(columns=['hotel_id', 'similarity_score'])
    else:
        cb_df = cb_df[cb_df['hotel_id'].isin(allowed_hotel_ids)].copy()
    if 'hotel_id' in cb_df.columns:
        cb_df['hotel_id'] = cb_df['hotel_id'].astype(int)

    if not cb_df.empty and cb_df['similarity_score'].max() > cb_df['similarity_score'].min():
        cb_df['cb_norm'] = (cb_df['similarity_score'] - cb_df['similarity_score'].min()) / (cb_df['similarity_score'].max() - cb_df['similarity_score'].min() + 1e-9)
    elif not cb_df.empty:
        cb_df['cb_norm'] = 1.0
    else:
        cb_df['cb_norm'] = 0.0

    # collaborative
    if user_id is not None:
        cf_df = get_top_n_collab(user_id, n=top_n*5)
        if not cf_df.empty:
            cf_df = cf_df[cf_df['hotel_id'].isin(allowed_hotel_ids)].copy()
            if not cf_df.empty and cf_df['predicted_rating'].max() > cf_df['predicted_rating'].min():
                cf_df['cf_norm'] = (cf_df['predicted_rating'] - cf_df['predicted_rating'].min()) / (cf_df['predicted_rating'].max() - cf_df['predicted_rating'].min() + 1e-9)
            elif not cf_df.empty:
                cf_df['cf_norm'] = 1.0
            else:
                cf_df['cf_norm'] = 0.0
        else:
            cf_df = pd.DataFrame(columns=['hotel_id', 'predicted_rating', 'cf_norm'])
    else:
        cf_df = pd.DataFrame(columns=['hotel_id', 'predicted_rating', 'cf_norm'])

    if cb_df.empty and cf_df.empty:
        return {"status": "no_matches_after_filter", "message": "change your preference city or relax other filters"}

    merged = pd.merge(cb_df[['hotel_id','cb_norm']], cf_df[['hotel_id','cf_norm']], on='hotel_id', how='outer').fillna(0)
    merged['combined_score'] = weight_cf * merged['cf_norm'] + weight_cb * merged['cb_norm']
    merged = merged.sort_values('combined_score', ascending=False).head(top_n)
    merged = merged.merge(processed_data[['hotel_id','hotel_name','city','price_per_night','star_rating']], on='hotel_id', how='left').drop_duplicates('hotel_id')

    results = []
    for _, row in merged.iterrows():
        explanation = []
        if row.get('cf_norm', 0) > 0:
            explanation.append(f"Collaborative score contribution: {row['cf_norm']:.3f}")
        if row.get('cb_norm', 0) > 0:
            explanation.append(f"Content similarity contribution: {row['cb_norm']:.3f}")
        results.append({
            'hotel_id': int(row['hotel_id']),
            'hotel_name': row.get('hotel_name', 'Unknown'),
            'city': row.get('city', ''),
            'price_per_night': float(row.get('price_per_night')) if not pd.isna(row.get('price_per_night')) else None,
            'star_rating': row.get('star_rating', None),
            'combined_score': float(row['combined_score']),
            'explanation': " | ".join(explanation)
        })
    return results

# -------------------------
# LangChain Tool wrappers
# -------------------------
def cb_tool_func(query_json: str) -> str:
    prefs = json.loads(query_json)
    recs = cb_recommender.recommend(prefs, top_n=5)
    return json.dumps(recs, default=str)

def cf_tool_func(query_json: str) -> str:
    payload = json.loads(query_json)
    uid = int(payload.get('user_id'))
    n = int(payload.get('n', 5))
    df = get_top_n_collab(uid, n=n)
    return df.to_json(orient='records')

def ensemble_tool_func(query_json: str) -> str:
    payload = json.loads(query_json)
    uid = payload.get('user_id')
    user_prefs = payload.get('user_preferences', {})
    top_n = int(payload.get('top_n', 5))
    results = ensemble_recommend(user_prefs, user_id=uid, top_n=top_n)
    return json.dumps(results, default=str)

cb_tool = Tool(
    name="content_based_recommender",
    func=cb_tool_func,
    description="Given a JSON-encoded user preferences object, returns top content-based hotel recommendations."
)
cf_tool = Tool(
    name="collaborative_recommender",
    func=cf_tool_func,
    description="Given a JSON string with user_id and n, returns top-n collaborative recommendations."
)
ensemble_tool = Tool(
    name="ensemble_recommender",
    func=ensemble_tool_func,
    description="Given a JSON payload containing user_id (optional) and user_preferences, returns merged recommendations."
)

# -------------------------
# LLM explanation utilities (no deterministic fallback)
# -------------------------
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import HumanMessage
from google.api_core.exceptions import ResourceExhausted, NotFound
import traceback

def _extract_text_from_llm_response(resp):
    """
    Normalize a variety of possible LangChain/LLM response shapes to extract text:
    - If resp is a string -> return it
    - If resp has attribute 'content' -> return resp.content
    - If resp has attribute 'generations' -> grab first generation text
    - If resp is a dict/list -> try to search common fields
    """
    try:
        if resp is None:
            return ""
        # plain string
        if isinstance(resp, str):
            return resp
        # object with content attribute (ChatMessage-like)
        if hasattr(resp, "content"):
            return str(resp.content)
        # LLMResult in some LangChain versions: resp.generations -> list[list[Generation]]
        if hasattr(resp, "generations"):
            gens = resp.generations
            if isinstance(gens, list) and len(gens) > 0:
                first = gens[0]
                # first can be list or Generation
                if isinstance(first, list) and len(first) > 0:
                    g0 = first[0]
                    if hasattr(g0, "text"):
                        return str(g0.text)
                    elif hasattr(g0, "generation"):
                        return str(g0.generation)
                else:
                    g0 = first
                    if hasattr(g0, "text"):
                        return str(g0.text)
                    elif hasattr(g0, "generation"):
                        return str(g0.generation)
        # dict-like
        if isinstance(resp, dict):
            # try common keys
            for k in ("text", "output", "content", "response"):
                if k in resp and resp[k]:
                    return str(resp[k])
            # nested possibilities
            if "choices" in resp and isinstance(resp["choices"], list) and len(resp["choices"]) > 0:
                ch = resp["choices"][0]
                if isinstance(ch, dict):
                    for k in ("text", "message", "content"):
                        if k in ch and ch[k]:
                            return str(ch[k])
        # list-like: join texts
        if isinstance(resp, list):
            texts = []
            for item in resp:
                try:
                    texts.append(_extract_text_from_llm_response(item))
                except Exception:
                    continue
            return " ".join([t for t in texts if t])
        # lastly, fallback to string conversion
        return str(resp)
    except Exception:
        return str(resp)


import time
import random
from google.api_core.exceptions import ResourceExhausted, NotFound

def generate_llm_explanations_with_retry(recs, user_prefs, model_name="gemini-1.5-flash", max_tokens=120, max_retries=3):
    """
    Generate LLM explanations with retry logic for rate limiting.
    """
    gemini_key = os.environ.get("GEMINI_API_KEY")
    if not gemini_key:
        for r in recs:
            r['llm_explanation'] = ""
            r['llm_error'] = "GEMINI_API_KEY not set"
        return recs

    try:
        llm = ChatGoogleGenerativeAI(
            model=model_name, 
            temperature=0.0, 
            max_output_tokens=max_tokens, 
            google_api_key=gemini_key
        )
    except Exception as e:
        for r in recs:
            r['llm_explanation'] = ""
            r['llm_error'] = f"LLM init error: {str(e)}"
        return recs

    enriched = []
    
    for i, r in enumerate(recs):
        r['llm_explanation'] = ""
        r.pop('llm_error', None)
        
        # Add delay between requests to avoid rate limiting
        if i > 0:
            time.sleep(12)  # 12 second delay between requests
        
        retry_count = 0
        while retry_count < max_retries:
            try:
                # Prepare hotel facts (same logic as before)
                hid = None
                try:
                    hid = int(r.get('hotel_id'))
                except Exception:
                    hid = None

                hotel_row = processed_data[processed_data['hotel_id'] == hid] if hid is not None else pd.DataFrame()
                facts = {}
                
                if not hotel_row.empty:
                    hr = hotel_row.iloc[0]
                    facts['hotel_name'] = hr.get('hotel_name', r.get('hotel_name'))
                    facts['city'] = hr.get('city', hr.get('location', r.get('city')))
                    
                    # Price handling
                    facts['price'] = None
                    for pc in ['price_per_night', 'price_per_night_inr']:
                        if pc in hr.index and not pd.isna(hr.get(pc)):
                            try:
                                facts['price'] = int(hr.get(pc))
                                break
                            except Exception:
                                pass
                    
                    facts['star_rating'] = hr.get('star_rating', hr.get('star_rating_clean', r.get('star_rating')))
                    
                    # Amenities
                    amenity_cols = [c for c in processed_data.columns if str(c).startswith('amenity_')]
                    present_amenities = []
                    for ac in amenity_cols:
                        try:
                            if int(hr.get(ac, 0)) == 1:
                                present_amenities.append(ac.replace('amenity_','').replace('_',' '))
                        except Exception:
                            pass
                    facts['amenities'] = present_amenities[:4]  # Limit to 4 amenities to reduce token usage
                else:
                    facts['hotel_name'] = r.get('hotel_name')
                    facts['city'] = r.get('city')
                    facts['price'] = r.get('price_per_night')
                    facts['star_rating'] = r.get('star_rating')
                    facts['amenities'] = []

                # Build shorter prompt to reduce token usage
                prompt_lines = [
                    "Explain in 2 sentences why this hotel matches the user's preferences:",
                    f"User wants: {user_prefs.get('city', 'Any city')}, budget ₹{user_prefs.get('budget_min_inr', 0)}-₹{user_prefs.get('budget_max_inr', 'unlimited')}, {user_prefs.get('min_star_rating', 'any')} stars",
                    f"Hotel: {facts.get('hotel_name')} in {facts.get('city')}, ₹{facts.get('price')}/night, {facts.get('star_rating')} stars"
                ]
                
                if facts.get('amenities'):
                    prompt_lines.append(f"Amenities: {', '.join(facts.get('amenities')[:3])}")
                
                prompt = "\n".join(prompt_lines)

                # Make the API call
                human_msg = HumanMessage(content=prompt)
                resp = llm.invoke([human_msg])
                
                text = _extract_text_from_llm_response(resp)
                text = (text or "").strip().replace("\n", " ")
                
                if text:
                    r['llm_explanation'] = text
                    break  # Success, exit retry loop
                else:
                    r['llm_explanation'] = ""
                    r['llm_error'] = "LLM returned empty response"
                    break
                    
            except ResourceExhausted as e:
                retry_count += 1
                if retry_count < max_retries:
                    # Extract retry delay from error message or use exponential backoff
                    wait_time = min(60, 5 * (2 ** retry_count) + random.uniform(0, 1))
                    print(f"Rate limited for hotel {r.get('hotel_id')}, waiting {wait_time:.1f}s (attempt {retry_count}/{max_retries})")
                    time.sleep(wait_time)
                else:
                    r['llm_explanation'] = ""
                    r['llm_error'] = "Rate limit exceeded after retries"
                    
            except Exception as e:
                r['llm_explanation'] = ""
                r['llm_error'] = f"LLM error: {str(e)[:100]}"  # Truncate long error messages
                break  # Don't retry for other errors
        
        enriched.append(r)
    
    return enriched

# -------------------------
# Example run (end-to-end)
# -------------------------
if __name__ == "__main__":
    example_user_prefs = {
        "city": "Kolkata",
        "preferred_amenities": ["pool", "gym"],
        "budget_min_inr": 1000,
        "budget_max_inr": 10000,
        "min_star_rating": 4
    }

    # direct ensemble call
    ensemble_results = ensemble_recommend(example_user_prefs, user_id=1, top_n=5)
    print("Ensemble results (raw):")
    print(json.dumps(ensemble_results, indent=2, ensure_ascii=False))

    # generate explanations (LLM if GEMINI_API_KEY present, otherwise deterministic)
    # Instead of generate_llm_explanations, use:
    enriched_results = generate_llm_explanations_with_retry(
                    ensemble_results, 
                    example_user_prefs, 
                    model_name="gemini-1.5-flash",  # Lighter model
                    max_tokens=80  # Reduced token limit
    )
    print("\nEnriched results with explanations:")
    print(json.dumps(enriched_results, indent=2, ensure_ascii=False))


hotels_df: (400, 27)
processed_data: (4632, 39)
user_hotel_interactions: (1288, 11)
user_profiles: (300, 16)
Ensemble results (raw):
[
  {
    "hotel_id": 195,
    "hotel_name": "Dover Inn By BookMeriHotel",
    "city": "Kolkata",
    "price_per_night": 4300.0,
    "star_rating": -1.0,
    "combined_score": 0.5999999957348829,
    "explanation": "Collaborative score contribution: 1.000"
  },
  {
    "hotel_id": 3651,
    "hotel_name": "Kenilworth Hotel",
    "city": "Kolkata",
    "price_per_night": 6054.0,
    "star_rating": 4.0,
    "combined_score": 0.39999993296365943,
    "explanation": "Content similarity contribution: 1.000"
  },
  {
    "hotel_id": 416,
    "hotel_name": "The Sonnet",
    "city": "Kolkata",
    "price_per_night": 4937.0,
    "star_rating": 4.0,
    "combined_score": 0.2198838038566475,
    "explanation": "Content similarity contribution: 0.550"
  },
  {
    "hotel_id": 2111,
    "hotel_name": "Hotel Aauris",
    "city": "Kolkata",
    "price_per_night": 4650.0,

In [13]:
# Save artifacts cell
import os
import json
import tarfile
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from surprise import Dataset, Reader, SVD

# -----------------------------
# (Re)define ContentBasedRecommender (copy from your code)
# -----------------------------
class ContentBasedRecommender:
    def __init__(self):
        self.hotels_df = None
        self.feature_weights = {
            'amenities': 0.35,
            'location': 0.25,
            'price': 0.20,
            'star_rating': 0.15,
            'property_type': 0.05
        }
        self.amenity_features = [
            'amenity_free_wifi', 'amenity_air_conditioning', 'amenity_parking', 'amenity_room_service',
            'amenity_24_7_front_desk', 'amenity_restaurant', 'amenity_gym', 'amenity_pool', 'amenity_spa',
            'amenity_business_center', 'amenity_conference_hall', 'amenity_airport_shuttle',
            'amenity_meeting_rooms', 'amenity_vegetarian_restaurant'
        ]
        self.scaler = MinMaxScaler()

    def load_data(self, hotels_df: pd.DataFrame):
        self.hotels_df = hotels_df.copy()
        for amenity in self.amenity_features:
            if amenity in self.hotels_df.columns:
                self.hotels_df[amenity] = self.hotels_df[amenity].fillna(0).astype(int)
            else:
                self.hotels_df[amenity] = 0

        city_col = 'city' if 'city' in self.hotels_df.columns else ('location' if 'location' in self.hotels_df.columns else None)
        if city_col is not None:
            location_dummies = pd.get_dummies(self.hotels_df[city_col].fillna('Unknown'), prefix='city')
            self.hotels_df = pd.concat([self.hotels_df, location_dummies], axis=1)
        else:
            location_dummies = pd.DataFrame(index=self.hotels_df.index)

        price_col = 'price_per_night' if 'price_per_night' in self.hotels_df.columns else ('price_per_night_inr' if 'price_per_night_inr' in self.hotels_df.columns else 'price_per_night_inr')
        rating_col = 'star_rating' if 'star_rating' in self.hotels_df.columns else ('star_rating_clean' if 'star_rating_clean' in self.hotels_df.columns else None)

        if price_col in self.hotels_df.columns:
            self.hotels_df[price_col] = pd.to_numeric(self.hotels_df[price_col], errors='coerce').fillna(self.hotels_df[price_col].median())
            self.hotels_df['price_normalized'] = self.scaler.fit_transform(self.hotels_df[[price_col]])
        else:
            self.hotels_df['price_normalized'] = 0.0

        if rating_col:
            self.hotels_df[rating_col] = pd.to_numeric(self.hotels_df[rating_col], errors='coerce').fillna(self.hotels_df[rating_col].median())
            self.hotels_df['rating_normalized'] = self.scaler.fit_transform(self.hotels_df[[rating_col]])
        else:
            self.hotels_df['rating_normalized'] = 0.0

        property_col = 'hotel_type' if 'hotel_type' in self.hotels_df.columns else ('property_type' if 'property_type' in self.hotels_df.columns else None)
        if property_col:
            property_dummies = pd.get_dummies(self.hotels_df[property_col].fillna('Unknown'), prefix='property')
            self.hotels_df = pd.concat([self.hotels_df, property_dummies], axis=1)
        else:
            property_dummies = pd.DataFrame(index=self.hotels_df.index)

        self.feature_columns = (
            self.amenity_features +
            list(location_dummies.columns) +
            ['price_normalized', 'rating_normalized'] +
            list(property_dummies.columns)
        )

        for col in self.feature_columns:
            if col not in self.hotels_df.columns:
                self.hotels_df[col] = 0

        self.hotels_df['feature_vector'] = self.hotels_df[self.feature_columns].values.tolist()
        return self

# -----------------------------
# Paths (use exactly your provided Kaggle input paths)
# -----------------------------
INPUT_HOTELS = "/kaggle/input/hotel-master/hotels_master.csv"
INPUT_PROCESSED = "/kaggle/input/preprocessed-data/preprocessed_hotel_data_final_new.csv"
INPUT_INTERACTIONS = "/kaggle/input/user-hotel-interactions/user_hotel_interactions.csv"
INPUT_PROFILES = "/kaggle/input/user-profiles/user_profiles.csv"

OUT_DIR = "/kaggle/working/recommender_artifacts"
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------------
# 1) Load all CSVs (exact files you specified)
# -----------------------------
print("Reading input CSVs...")
hotels_df = pd.read_csv(INPUT_HOTELS)
processed_data = pd.read_csv(INPUT_PROCESSED)
user_hotel_interactions = pd.read_csv(INPUT_INTERACTIONS)
user_profiles = pd.read_csv(INPUT_PROFILES)

print("hotels_df:", hotels_df.shape)
print("processed_data:", processed_data.shape)
print("user_hotel_interactions:", user_hotel_interactions.shape)
print("user_profiles:", user_profiles.shape)

# -----------------------------
# 2) Initialize / fit ContentBasedRecommender on processed_data
# -----------------------------
print("Building ContentBasedRecommender and feature vectors...")
cb_recommender = ContentBasedRecommender()
cb_recommender.load_data(processed_data)

# -----------------------------
# 3) Train Surprise SVD collaborative model (using user_hotel_interactions)
# -----------------------------
print("Preparing collaborative dataset and training SVD (this may take a moment)...")
# Ensure rating numeric
user_hotel_interactions['rating'] = pd.to_numeric(user_hotel_interactions.get('rating', 0), errors='coerce').fillna(0)

# Surprise dataset expects user, item, rating columns
from surprise import Dataset, Reader, SVD
reader = Reader(rating_scale=(0, 5))
svd_data = Dataset.load_from_df(user_hotel_interactions[['user_id', 'hotel_id', 'rating']], reader)
svd_algo = SVD(n_factors=50, random_state=42)
trainset = svd_data.build_full_trainset()
svd_algo.fit(trainset)
print("SVD training finished.")

# -----------------------------
# 4) Create hotel mapping and lists
# -----------------------------
hotel_mapping = processed_data[['hotel_id', 'hotel_name']].drop_duplicates()
all_hotels = processed_data['hotel_id'].unique().tolist()

# -----------------------------
# 5) Save artifacts
# -----------------------------
print("Saving artifacts to:", OUT_DIR)

# Save copies of CSVs
processed_csv_out = os.path.join(OUT_DIR, "processed_data.csv")
hotels_csv_out = os.path.join(OUT_DIR, "hotels_master.csv")
interactions_csv_out = os.path.join(OUT_DIR, "user_hotel_interactions.csv")
profiles_csv_out = os.path.join(OUT_DIR, "user_profiles.csv")

processed_data.to_csv(processed_csv_out, index=False)
hotels_df.to_csv(hotels_csv_out, index=False)
user_hotel_interactions.to_csv(interactions_csv_out, index=False)
user_profiles.to_csv(profiles_csv_out, index=False)
hotel_mapping.to_csv(os.path.join(OUT_DIR, "hotel_mapping.csv"), index=False)

print("Saved CSV copies.")

# Save SVD model (joblib)
svd_path = os.path.join(OUT_DIR, "svd_model.joblib")
joblib.dump(svd_algo, svd_path)
print("Saved SVD model ->", svd_path)

# Save content-based recommender (joblib)
cb_path = os.path.join(OUT_DIR, "cb_recommender.joblib")
try:
    joblib.dump(cb_recommender, cb_path)
    print("Saved ContentBasedRecommender ->", cb_path)
except Exception as e:
    # If pickling the entire instance fails, fallback: save the feature_columns and rely on CSV for rebuilding
    print("Warning: failed to joblib.dump(cb_recommender):", e)
    cb_path = None

# Save metadata (feature columns, amenity list, all_hotels)
meta = {
    "feature_columns": getattr(cb_recommender, "feature_columns", None),
    "amenity_features": getattr(cb_recommender, "amenity_features", None),
    "all_hotels": all_hotels
}
meta_path = os.path.join(OUT_DIR, "metadata.json")
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)
print("Saved metadata ->", meta_path)

# Create tarball for easy download
tar_path = os.path.join("/kaggle/working", "recommender_artifacts.tar.gz")
with tarfile.open(tar_path, "w:gz") as tar:
    tar.add(OUT_DIR, arcname=os.path.basename(OUT_DIR))
print("Created tarball ->", tar_path)

print("Done. You can download these files from the Kaggle 'Files' panel:")
for p in [processed_csv_out, hotels_csv_out, interactions_csv_out, profiles_csv_out, svd_path, cb_path, meta_path, tar_path]:
    if p:
        print(" -", p)


Reading input CSVs...
hotels_df: (400, 27)
processed_data: (4632, 39)
user_hotel_interactions: (1288, 11)
user_profiles: (300, 16)
Building ContentBasedRecommender and feature vectors...
Preparing collaborative dataset and training SVD (this may take a moment)...
SVD training finished.
Saving artifacts to: /kaggle/working/recommender_artifacts
Saved CSV copies.
Saved SVD model -> /kaggle/working/recommender_artifacts/svd_model.joblib
Saved ContentBasedRecommender -> /kaggle/working/recommender_artifacts/cb_recommender.joblib
Saved metadata -> /kaggle/working/recommender_artifacts/metadata.json
Created tarball -> /kaggle/working/recommender_artifacts.tar.gz
Done. You can download these files from the Kaggle 'Files' panel:
 - /kaggle/working/recommender_artifacts/processed_data.csv
 - /kaggle/working/recommender_artifacts/hotels_master.csv
 - /kaggle/working/recommender_artifacts/user_hotel_interactions.csv
 - /kaggle/working/recommender_artifacts/user_profiles.csv
 - /kaggle/working/rec

In [9]:
import sys
import platform
import importlib

# List of packages you want to check versions for
packages = [
    "pandas",
    "numpy",
    "scikit-learn",
    "scipy",
    "surprise",
    "joblib",
    "fastapi",
    "uvicorn",
    "langchain",
    "langchain_google_genai",
    "google_api_core"
]

print("="*40)
print("Python:", sys.version)
print("Platform:", platform.platform())
print("="*40)

for pkg in packages:
    try:
        module = importlib.import_module(pkg.replace("-", "_"))
        version = getattr(module, "__version__", "unknown")
        print(f"{pkg:25s} {version}")
    except ImportError:
        print(f"{pkg:25s} NOT INSTALLED")


Python: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
Platform: Linux-6.6.56+-x86_64-with-glibc2.35
pandas                    2.2.3
numpy                     1.26.4
scikit-learn              NOT INSTALLED
scipy                     1.15.3
surprise                  1.1.4
joblib                    1.5.1
fastapi                   0.115.13
uvicorn                   0.34.3
langchain                 0.3.27
langchain_google_genai    unknown
google_api_core           NOT INSTALLED
