In [28]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score
import json
import warnings
warnings.filterwarnings('ignore')

class ContentBasedRecommender:
    """Content-based filtering for hotel recommendations.

    Hotels are encoded as numeric feature vectors (amenity flags, one-hot
    city, min-max scaled price and star rating, one-hot property type).  A
    user's preferences are encoded in the same feature space and hotels are
    ranked by cosine similarity after hard filters (city, budget, star
    rating, required amenities) have been applied.

    Row references exchanged between methods are *positional* (suitable for
    ``DataFrame.iloc`` and for indexing the similarity array), so the hotel
    frame may carry any index, not only a default ``RangeIndex``.
    """

    def __init__(self):
        # Both are populated by load_data() / _create_feature_vectors().
        self.hotels_df = None
        self.feature_columns = []
        # NOTE: these weights are not read by any method of this class;
        # similarity is computed on the unweighted feature vectors.
        self.feature_weights = {
            'amenities': 0.35,
            'location': 0.25,
            'price': 0.20,
            'star_rating': 0.15,
            'property_type': 0.05
        }
        self.amenity_features = [
            'free_wifi', 'air_conditioning', 'parking', 'room_service',
            '24_7_front_desk', 'restaurant', 'gym', 'pool', 'spa',
            'business_center', 'conference_hall', 'airport_shuttle',
            'meeting_rooms', 'vegetarian_restaurant', 'multilingual_staff',
            'local_cuisine'
        ]
        self.scaler = MinMaxScaler()
        # NOTE: not used by any method of this class.
        self.vectorizer = TfidfVectorizer(max_features=100, stop_words='english')

    @staticmethod
    def _parse_amenities(user_preferences, key):
        """Return the amenity list stored under ``key`` in the preferences.

        The value may be a list-like of amenity names or a JSON-encoded
        string of such a list.  A missing key, ``None`` or a float NaN (as
        produced by ``Series.to_dict()`` for empty cells) yields ``[]``.
        """
        value = user_preferences.get(key)
        if value is None:
            return []
        if isinstance(value, float) and np.isnan(value):
            return []
        if isinstance(value, str):
            value = json.loads(value)
        return list(value)

    def load_data(self, hotels_df):
        """Load and preprocess hotel data.

        Args:
            hotels_df: DataFrame with at least the columns ``city``,
                ``price_per_night_inr``, ``star_rating`` and
                ``property_type``; amenity flag columns are optional.
                ``recommend()`` additionally reads ``hotel_id`` and ``name``.

        Returns:
            self, so calls can be chained.
        """
        self.hotels_df = hotels_df.copy()

        # Amenity flags become 0/1 integers; a missing value counts as
        # "amenity not available" instead of failing the integer cast.
        for amenity in self.amenity_features:
            if amenity in self.hotels_df.columns:
                self.hotels_df[amenity] = self.hotels_df[amenity].fillna(0).astype(int)

        self._create_feature_vectors()

        return self

    def _create_feature_vectors(self):
        """Build the per-hotel feature columns and the combined feature vector.

        Adds one-hot city/property columns, ``price_normalized``,
        ``rating_normalized``, ``amenities_vector`` and ``feature_vector`` to
        ``self.hotels_df`` and records the vector layout in
        ``self.feature_columns``.

        Returns:
            self
        """
        # Only amenity columns that actually exist take part, mirroring the
        # optional handling in load_data().
        amenity_columns = [
            amenity for amenity in self.amenity_features
            if amenity in self.hotels_df.columns
        ]
        self.hotels_df['amenities_vector'] = self.hotels_df[amenity_columns].values.tolist()

        # One-hot encode the city.  dtype=float keeps the combined feature
        # matrix purely numeric (bool dummies would turn it into objects).
        location_dummies = pd.get_dummies(self.hotels_df['city'], prefix='city', dtype=float)
        self.hotels_df = pd.concat([self.hotels_df, location_dummies], axis=1)

        # Scale price and star rating to [0, 1].  The scaler is re-fitted for
        # each column, so afterwards it holds the star_rating fit only.
        self.hotels_df['price_normalized'] = self.scaler.fit_transform(
            self.hotels_df[['price_per_night_inr']]).ravel()
        self.hotels_df['rating_normalized'] = self.scaler.fit_transform(
            self.hotels_df[['star_rating']]).ravel()

        # One-hot encode the property type.
        property_dummies = pd.get_dummies(
            self.hotels_df['property_type'], prefix='property', dtype=float)
        self.hotels_df = pd.concat([self.hotels_df, property_dummies], axis=1)

        # Layout of the combined vector; user profiles use the same order.
        self.feature_columns = (
            amenity_columns +
            list(location_dummies.columns) +
            ['price_normalized', 'rating_normalized'] +
            list(property_dummies.columns)
        )

        self.hotels_df['feature_vector'] = (
            self.hotels_df[self.feature_columns].to_numpy(dtype=float).tolist()
        )

        return self

    def _create_user_profile(self, user_preferences):
        """Encode user preferences as a vector aligned with ``feature_columns``.

        Only the preferred city and the preferred amenities set entries (to
        1.0).  Budget and star-rating preferences are not encoded here; they
        are enforced as hard filters in ``_apply_filters``.

        Returns:
            1-D float ndarray of length ``len(self.feature_columns)``.
        """
        user_vector = np.zeros(len(self.feature_columns))
        feature_idx_map = {feature: idx for idx, feature in enumerate(self.feature_columns)}

        if 'city' in user_preferences:
            city_feature = f"city_{user_preferences['city']}"
            if city_feature in feature_idx_map:
                user_vector[feature_idx_map[city_feature]] = 1.0

        # Amenities that are not part of the feature space are ignored.
        for amenity in self._parse_amenities(user_preferences, 'preferred_amenities'):
            if amenity in feature_idx_map:
                user_vector[feature_idx_map[amenity]] = 1.0

        return user_vector

    def calculate_similarity(self, user_preferences, top_n=10):
        """Rank the hotels that pass the filters by cosine similarity.

        Args:
            user_preferences: mapping of preference name to value.
            top_n: maximum number of hotels to return; values <= 0 yield an
                empty result.

        Returns:
            Tuple ``(positions, scores)``: row positions into
            ``self.hotels_df`` (use with ``.iloc``), best match first, and
            the matching similarity scores.

        Raises:
            ValueError: if no hotel data has been loaded.
        """
        if self.hotels_df is None:
            raise ValueError("Please load hotel data first using load_data()")

        candidate_positions = self._apply_filters(user_preferences)
        limit = max(int(top_n), 0)
        if limit == 0 or candidate_positions.size == 0:
            return np.array([], dtype=int), np.array([], dtype=float)

        user_vector = self._create_user_profile(user_preferences)
        hotel_vectors = np.array(self.hotels_df['feature_vector'].tolist(), dtype=float)
        similarities = cosine_similarity([user_vector], hotel_vectors)[0]

        # Sort candidates by descending score, then truncate.  Slicing after
        # the reversal (rather than ``[-top_n:]``) is what makes top_n == 0
        # return nothing instead of everything.
        candidate_scores = similarities[candidate_positions]
        order = np.argsort(candidate_scores)[::-1][:limit]
        top_positions = candidate_positions[order]

        return top_positions, similarities[top_positions]

    def _apply_filters(self, user_preferences):
        """Apply the hard filters from the user preferences.

        Filters: exact ``city`` match, price within
        ``[budget_min_inr, budget_max_inr]`` (only when both bounds are
        given), ``star_rating >= min_star_rating`` and every
        ``required_amenities`` flag equal to 1 (amenities without a matching
        column are ignored).

        Returns:
            1-D integer ndarray with the positions of the rows that pass.
        """
        mask = pd.Series(True, index=self.hotels_df.index)

        if 'city' in user_preferences:
            mask &= (self.hotels_df['city'] == user_preferences['city'])

        if 'budget_min_inr' in user_preferences and 'budget_max_inr' in user_preferences:
            price = self.hotels_df['price_per_night_inr']
            mask &= (price >= user_preferences['budget_min_inr']) & \
                    (price <= user_preferences['budget_max_inr'])

        if 'min_star_rating' in user_preferences:
            mask &= (self.hotels_df['star_rating'] >= user_preferences['min_star_rating'])

        for amenity in self._parse_amenities(user_preferences, 'required_amenities'):
            if amenity in self.hotels_df.columns:
                mask &= (self.hotels_df[amenity] == 1)

        # Positions, not labels: callers index a NumPy array and use .iloc.
        return np.flatnonzero(mask.to_numpy(dtype=bool))

    def recommend(self, user_preferences, top_n=5):
        """Get top hotel recommendations with explanations.

        Returns:
            List of at most ``top_n`` dicts with the keys ``hotel_id``,
            ``name``, ``city``, ``price_per_night_inr``, ``star_rating``,
            ``similarity_score``, ``explanation`` and ``features``.
        """
        top_indices, similarities = self.calculate_similarity(user_preferences, top_n * 2)

        recommendations = []
        for hotel_pos, score in zip(top_indices[:top_n], similarities[:top_n]):
            hotel = self.hotels_df.iloc[hotel_pos]

            recommendations.append({
                'hotel_id': hotel['hotel_id'],
                'name': hotel['name'],
                'city': hotel['city'],
                'price_per_night_inr': hotel['price_per_night_inr'],
                'star_rating': hotel['star_rating'],
                'similarity_score': float(score),
                'explanation': self._generate_explanation(user_preferences, hotel, score),
                'features': self._get_top_features(user_preferences, hotel)
            })

        return recommendations

    def _generate_explanation(self, user_preferences, hotel, similarity_score):
        """Generate a natural language explanation for a recommendation.

        Args:
            user_preferences: mapping of preference name to value.
            hotel: row of ``self.hotels_df`` (any mapping keyed by column
                name works).
            similarity_score: used only when no specific reason applies.

        Returns:
            Sentences joined by ". " and terminated with ".".
        """
        explanations = []

        # Location match
        if 'city' in user_preferences and user_preferences['city'] == hotel['city']:
            explanations.append(f"Located in your preferred city {hotel['city']}")

        # Amenity matches (at most three are listed to keep the text short)
        matched_amenities = [
            amenity.replace('_', ' ')
            for amenity in self._parse_amenities(user_preferences, 'preferred_amenities')
            if amenity in hotel and hotel[amenity] == 1
        ]
        if matched_amenities:
            explanations.append(f"Has your preferred amenities: {', '.join(matched_amenities[:3])}")

        # Budget alignment (nothing is said about hotels above the budget)
        if 'budget_min_inr' in user_preferences and 'budget_max_inr' in user_preferences:
            budget_min = user_preferences['budget_min_inr']
            budget_max = user_preferences['budget_max_inr']
            price = hotel['price_per_night_inr']

            if budget_min <= price <= budget_max:
                explanations.append(f"Fits your budget (₹{price:,} per night)")
            elif price < budget_min:
                explanations.append(f"Below your budget (₹{price:,} per night)")

        # Star rating
        if 'min_star_rating' in user_preferences:
            min_rating = user_preferences['min_star_rating']
            if hotel['star_rating'] >= min_rating:
                explanations.append(f"Meets your {min_rating}+ star requirement")

        # Fall back to the raw score when nothing specific matched
        if not explanations:
            explanations.append(f"High similarity ({similarity_score:.2f}) to your preferences")

        return ". ".join(explanations) + "."

    def _get_top_features(self, user_preferences, hotel):
        """Get the features that contributed most to the recommendation.

        The contribution of a feature is the product of the user-profile
        entry and the hotel's feature value.

        Returns:
            Up to five ``(feature_name, contribution)`` tuples with a
            positive contribution, largest first; ties keep feature order.
        """
        user_vector = self._create_user_profile(user_preferences)
        hotel_vector = np.array(hotel['feature_vector'], dtype=float)

        contributions = user_vector * hotel_vector

        # sorted() is stable, so equal contributions stay in feature order.
        ranked = sorted(
            np.flatnonzero(contributions > 0),
            key=lambda idx: contributions[idx],
            reverse=True,
        )

        return [(self.feature_columns[idx], contributions[idx]) for idx in ranked[:5]]

    def evaluate(self, test_data, true_interactions):
        """Evaluate the model using precision, recall, and F1-score.

        For every test user the top-5 recommendations are compared against
        the hotels the user interacted with, as binary labels over the whole
        hotel catalogue.

        Args:
            test_data: DataFrame with one row of preferences per user.  The
                user id is taken from a ``user_id`` column when present,
                otherwise from the row's index label.
            true_interactions: DataFrame with ``user_id`` and ``hotel_id``
                columns.

        Returns:
            Dict with ``precision``, ``recall``, ``f1_score`` and
            ``num_test_users``.
        """
        hotel_ids = self.hotels_df['hotel_id']
        has_user_id_column = 'user_id' in test_data.columns

        predictions = []
        true_labels = []

        for row_label, user_data in test_data.iterrows():
            user_id = user_data['user_id'] if has_user_id_column else row_label

            recommendations = self.recommend(user_data.to_dict(), top_n=5)
            recommended_ids = [rec['hotel_id'] for rec in recommendations]

            interacted_ids = true_interactions.loc[
                true_interactions['user_id'] == user_id, 'hotel_id'
            ].tolist()

            # One binary label per hotel in the catalogue.
            predictions.extend(hotel_ids.isin(recommended_ids).astype(int).tolist())
            true_labels.extend(hotel_ids.isin(interacted_ids).astype(int).tolist())

        if not true_labels:
            # No users (or no hotels): nothing to score.
            precision = recall = f1 = 0.0
        else:
            precision = precision_score(true_labels, predictions, zero_division=0)
            recall = recall_score(true_labels, predictions, zero_division=0)
            f1 = f1_score(true_labels, predictions, zero_division=0)

        return {
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'num_test_users': len(test_data)
        }
        

# Example usage and test
def test_content_based_recommender():
    """Run the recommender end to end on the processed hotel catalogue.

    Reads ``data/processed/hotels_master.csv``, requests five
    recommendations for a hard-coded sample user and prints each one with
    its similarity score, explanation and top features.

    Returns:
        The list of recommendation dicts produced by ``recommend()``.
    """
    catalogue = pd.read_csv('data/processed/hotels_master.csv')

    engine = ContentBasedRecommender()
    engine.load_data(catalogue)

    # Sample traveller: 4+ star hotel in Mumbai within a 3000-8000 INR budget
    sample_profile = dict(
        city='Mumbai',
        preferred_amenities=['free_wifi', 'air_conditioning', 'pool', 'business_center'],
        budget_min_inr=3000,
        budget_max_inr=8000,
        min_star_rating=4,
    )

    results = engine.recommend(sample_profile, top_n=5)

    print("Top 5 Recommendations:")
    rank = 0
    for rec in results:
        rank += 1
        print(f"{rank}. {rec['name']} - ₹{rec['price_per_night_inr']:,} - {rec['star_rating']} stars")
        print(f"   Similarity: {rec['similarity_score']:.3f}")
        print(f"   Explanation: {rec['explanation']}")
        print(f"   Top features: {rec['features']}")
        print()

    return results

if __name__ == "__main__":
    # Run the demo only when executed as a script (not on import); the result
    # is bound at module level so it can be inspected afterwards.
    recommendations = test_content_based_recommender()

Top 5 Recommendations:
1. ITC Minhas-Sami Mumbai - ₹7,189 - 4 stars
   Similarity: 0.607
   Explanation: Located in your preferred city Mumbai. Has your preferred amenities: free wifi, air conditioning, pool. Fits your budget (₹7,189 per night). Meets your 4+ star requirement.
   Top features: [('free_wifi', np.float64(1.0)), ('air_conditioning', np.float64(1.0)), ('pool', np.float64(1.0)), ('business_center', np.float64(1.0)), ('city_Mumbai', np.float64(1.0))]

2. ITC Gade, Mander and Singhal Mumbai - ₹6,965 - 4 stars
   Similarity: 0.579
   Explanation: Located in your preferred city Mumbai. Has your preferred amenities: free wifi, air conditioning, pool. Fits your budget (₹6,965 per night). Meets your 4+ star requirement.
   Top features: [('free_wifi', np.float64(1.0)), ('air_conditioning', np.float64(1.0)), ('pool', np.float64(1.0)), ('city_Mumbai', np.float64(1.0))]

3. Grand Upadhyay, Tata and Ranganathan Mumbai - ₹7,911 - 4 stars
   Similarity: 0.575
   Explanation: Located in 