In [5]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

## Data Loading

In [7]:
# Load the raw CSV inputs (paths are relative to the current working directory).
# `users` and `places` feed prepare_content_features(); `clicks` and `searches`
# feed prepare_collab_data().
users = pd.read_csv('users_synthesized.csv')
places = pd.read_csv('tourism_place_synthesized.csv')
clicks = pd.read_csv('tourism_click_history.csv')
searches = pd.read_csv('tourism_search_history.csv')
# NOTE(review): `ratings` is loaded but not referenced by any visible cell — confirm it is still needed.
ratings = pd.read_csv('tourism_rating.csv')

## Building the Hybrid Recommender

In [9]:
# 1. Content-Based Filtering Components
def prepare_content_features():
    """Build dense content-feature matrices for users and places.

    Reads the module-level ``users`` and ``places`` DataFrames.

    Each column is encoded with its *own* fitted transformer. Re-fitting one
    shared ``MultiLabelBinarizer`` (as an earlier version did) overwrites its
    ``classes_`` on every call, so only the last vocabulary survives and the
    user encoders cannot be reused at inference time.

    Returns
    -------
    dict
        ``user_features``
            Multi-hot ``Preferred_Categories`` columns followed by multi-hot
            ``Interest_Tags`` columns, one row per row of ``users``.
        ``place_features``
            One-hot ``Category`` columns followed by the dense TF-IDF matrix
            of ``Description``, one row per row of ``places``.
        ``transformers``
            The binarizer fitted on place ``Category`` (unchanged meaning,
            kept for backward compatibility).
        ``user_category_binarizer``, ``user_tag_binarizer``,
        ``place_category_binarizer``, ``tfidf_vectorizer``
            The individual fitted transformers.

    Raises
    ------
    ValueError, SyntaxError
        If a ``Preferred_Categories`` / ``Interest_Tags`` cell is not a valid
        Python literal.
    """
    # Local import keeps this cell self-contained.
    import ast

    # SECURITY: these cells are text read from a CSV file. They were previously
    # passed to eval(), which executes arbitrary code; literal_eval only accepts
    # Python literals (lists, strings, numbers, ...) and raises otherwise.
    parsed_categories = users['Preferred_Categories'].apply(ast.literal_eval)
    parsed_tags = users['Interest_Tags'].apply(ast.literal_eval)

    # User features: one binarizer per column so each keeps its own vocabulary.
    user_category_binarizer = MultiLabelBinarizer()
    user_cats = user_category_binarizer.fit_transform(parsed_categories)
    user_tag_binarizer = MultiLabelBinarizer()
    user_tags = user_tag_binarizer.fit_transform(parsed_tags)

    # Place features: Category is a single label, so wrap it in a one-element
    # list to satisfy the multi-label API.
    place_category_binarizer = MultiLabelBinarizer()
    place_cats = place_category_binarizer.fit_transform(
        places['Category'].apply(lambda x: [x]))
    tfidf_vectorizer = TfidfVectorizer()
    place_desc_tfidf = tfidf_vectorizer.fit_transform(places['Description'])

    return {
        'user_features': np.hstack([user_cats, user_tags]),
        # toarray() densifies the TF-IDF matrix so it can be stacked with the
        # dense one-hot block.
        'place_features': np.hstack([place_cats, place_desc_tfidf.toarray()]),
        'transformers': place_category_binarizer,
        'user_category_binarizer': user_category_binarizer,
        'user_tag_binarizer': user_tag_binarizer,
        'place_category_binarizer': place_category_binarizer,
        'tfidf_vectorizer': tfidf_vectorizer,
    }

In [10]:
# 2. Collaborative Filtering Components
def prepare_collab_data():
    """Build the user-by-place implicit-feedback matrix.

    Reads the module-level ``clicks`` and ``searches`` DataFrames, counts how
    often each (user, place) pair occurs in each of them, and sums the two
    count tables.

    Returns
    -------
    pandas.DataFrame
        Rows are ``User_Id`` values and columns are place ids seen in either
        source; each cell is clicks + searches for that pair, with 0 where the
        pair never occurred.
    """
    def _pair_counts(events, place_column):
        # Occurrences of every (user, place) pair, pivoted to user rows.
        per_pair = events.groupby(['User_Id', place_column]).size()
        return per_pair.unstack(fill_value=0)

    clicks_per_place = _pair_counts(clicks, 'Place_Id')
    searches_per_place = _pair_counts(searches, 'Implied_Place_Id')

    # fill_value=0 covers cells present in only one table; cells missing from
    # both tables come out as NaN and are zeroed by the final fillna.
    summed = clicks_per_place.add(searches_per_place, fill_value=0)
    return summed.fillna(0)

In [11]:
# 3. Hybrid Model
class HybridRecommender:
    """Blend a content-based and a collaborative nearest-neighbour model.

    Two cosine-distance ``NearestNeighbors`` indexes are kept, one per feature
    space. ``recommend`` queries both, converts distances to similarities,
    weights them, and sums the weighted similarities per returned row id.

    NOTE(review): the ids returned by ``recommend`` are row positions of the
    matrices passed to ``fit``. In this notebook both matrices have one row per
    user, so the result is a list of neighbouring *row* indices (the query row
    itself included), not place identifiers — confirm the intended semantics
    and map to places downstream if needed.
    """

    def __init__(self, content_weight=0.6, collab_weight=0.4):
        """Create unfitted models.

        Parameters
        ----------
        content_weight : float
            Multiplier applied to content-based similarities.
        collab_weight : float
            Multiplier applied to collaborative similarities.
        """
        self.content_model = NearestNeighbors(metric='cosine')
        self.collab_model = NearestNeighbors(metric='cosine')
        self.weights = [content_weight, collab_weight]
        # Filled in by fit(); recommend() needs the fitted rows to build queries.
        self._content_features = None
        self._collab_rows = None

    def fit(self, content_features, collab_matrix):
        """Fit both neighbour indexes and remember the fitted rows.

        Parameters
        ----------
        content_features : array-like, dense
            Feature matrix for the content model.
        collab_matrix : array-like or pandas.DataFrame, dense
            Interaction matrix for the collaborative model.
        """
        self.content_model.fit(content_features)
        self.collab_model.fit(collab_matrix)
        # Keep the data on the instance so recommend() does not depend on
        # notebook globals (and keeps working after the model is reloaded).
        self._content_features = np.asarray(content_features)
        self._collab_rows = np.asarray(collab_matrix)

    def recommend(self, user_idx, k=10):
        """Return up to ``k`` row ids ranked by combined weighted similarity.

        Parameters
        ----------
        user_idx : int
            Positional row index of the query in both fitted matrices.
        k : int
            Maximum number of ids to return. Each model is asked for ``2 * k``
            neighbours, capped at the number of rows it was fitted on.

        Returns
        -------
        list
            Row ids in descending score order, each id appearing once.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        IndexError
            If ``user_idx`` is out of range for the fitted matrices.
        """
        content_rows = getattr(self, '_content_features', None)
        collab_rows = getattr(self, '_collab_rows', None)
        if content_rows is None or collab_rows is None:
            raise RuntimeError(
                'HybridRecommender.fit() must be called before recommend()')
        if k <= 0:
            return []

        content_weight, collab_weight = self.weights

        # Content-based neighbours (never request more than were fitted).
        content_dist, content_idx = self.content_model.kneighbors(
            [content_rows[user_idx]],
            n_neighbors=min(k * 2, len(content_rows)))

        # Collaborative neighbours
        collab_dist, collab_idx = self.collab_model.kneighbors(
            [collab_rows[user_idx]],
            n_neighbors=min(k * 2, len(collab_rows)))

        # Hybrid scoring: similarity = 1 - cosine distance, then apply the
        # weight to the similarity. (Writing `1 - dist * weight` would scale
        # the distance instead, rewarding the lower-weighted model.)
        combined = pd.DataFrame({
            'place_id': np.concatenate([content_idx[0], collab_idx[0]]),
            'score': np.concatenate([
                content_weight * (1 - content_dist[0]),
                collab_weight * (1 - collab_dist[0]),
            ]),
        })

        # An id found by both models gets the sum of its two weighted scores
        # rather than appearing twice in the output.
        ranked = (
            combined.groupby('place_id')['score'].sum()
            .sort_values(ascending=False, kind='mergesort')
        )
        return ranked.head(k).index.tolist()


In [12]:
# Prepare data
# content_data: dict of content feature matrices and fitted transformer(s).
# collab_matrix: user-by-place interaction counts (clicks + searches).
content_data = prepare_content_features()
collab_matrix = prepare_collab_data()

In [13]:
# Train model
# Default weights: 0.6 content, 0.4 collaborative.
# The content index is fitted on the per-user feature matrix and the
# collaborative index on the interaction matrix.
model = HybridRecommender()
model.fit(content_data['user_features'], collab_matrix)

In [14]:
# Save model and transformers
# Everything needed at inference time is bundled into a single joblib file.
# NOTE: joblib pickles `model` by class reference, so the HybridRecommender
# class definition must be importable/defined wherever the file is loaded.
joblib.dump({
    'model': model,
    'content_transformers': content_data['transformers'],
    # A fresh TF-IDF vectorizer is fitted here on the same descriptions,
    # separately from the one used inside prepare_content_features().
    'tfidf_vectorizer': TfidfVectorizer().fit(places['Description']),
    # Positional row index -> id, in the row order of `users` / `places`.
    # NOTE(review): collab_matrix rows come from a groupby on User_Id, so their
    # positions may not match `users` row positions — verify before using
    # user_mapping with collaborative results.
    'user_mapping': dict(enumerate(users['User_Id'])),
    'place_mapping': dict(enumerate(places['Place_Id']))
}, 'hybrid_recommender.joblib')

['hybrid_recommender.joblib']