In [48]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
from sklearn.metrics import ndcg_score

## Data Loading

In [7]:
users = pd.read_csv('users_synthesized.csv')
places = pd.read_csv('tourism_place_synthesized.csv')
clicks = pd.read_csv('tourism_click_history.csv')
searches = pd.read_csv('tourism_search_history.csv')
ratings = pd.read_csv('tourism_rating.csv') 

## Building

In [9]:
# 1. Content-Based Filtering Components
def prepare_content_features():
    # User features
    mlb = MultiLabelBinarizer()
    user_cats = mlb.fit_transform(users['Preferred_Categories'].apply(eval))
    user_tags = mlb.fit_transform(users['Interest_Tags'].apply(eval))
    
    # Place features
    place_cats = mlb.fit_transform(places['Category'].apply(lambda x: [x]))
    place_desc_tfidf = TfidfVectorizer().fit_transform(places['Description'])
    
    return {
        'user_features': np.hstack([user_cats, user_tags]),
        'place_features': np.hstack([place_cats, place_desc_tfidf.toarray()]),
        'transformers': mlb
    }

In [10]:
# 2. Collaborative Filtering Components
def prepare_collab_data():
    # Create user-item matrix
    click_counts = clicks.groupby(['User_Id', 'Place_Id']).size().unstack(fill_value=0)
    search_counts = searches.groupby(['User_Id', 'Implied_Place_Id']).size().unstack(fill_value=0)
    
    # Combine interactions
    interaction_matrix = click_counts.add(search_counts, fill_value=0)
    return interaction_matrix.fillna(0)

In [29]:
# 3. Hybrid Model
class HybridRecommender:
    def __init__(self, content_weight=0.6, collab_weight=0.4):
        self.content_model = NearestNeighbors(metric='cosine')
        self.collab_model = NearestNeighbors(metric='cosine')
        self.weights = [content_weight, collab_weight]
        self.content_features = None
        self.collab_matrix = None
        
    def fit(self, content_features, collab_matrix):
        # Simpan data training
        self.content_features = content_features
        self.collab_matrix = collab_matrix
        
        # Train model
        self.content_model.fit(content_features)
        self.collab_model.fit(collab_matrix)
        
    def recommend(self, user_idx, k=10):
        # Content-based
        content_dist, content_idx = self.content_model.kneighbors(
            [self.content_features[user_idx]], 
            n_neighbors=k*2
        )
        
        # Collaborative
        collab_dist, collab_idx = self.collab_model.kneighbors(
            [self.collab_matrix.iloc[user_idx].values],
            n_neighbors=k*2
        )
        
        # Hybrid scoring
        combined = pd.DataFrame({
            'place_id': np.concatenate([content_idx[0], collab_idx[0]]),
            'score': np.concatenate([
                1 - content_dist[0] * self.weights[0],
                1 - collab_dist[0] * self.weights[1]
            ])
        }).sort_values('score', ascending=False).head(k)
        
        return combined['place_id'].tolist()

In [30]:
# Prepare data
content_data = prepare_content_features()
collab_matrix = prepare_collab_data()

In [31]:
# Train model
model = HybridRecommender()
model.fit(content_data['user_features'], collab_matrix)

In [32]:
# Save model and transformers
joblib.dump({
    'model': model,
    'content_transformers': content_data['transformers'],
    'tfidf_vectorizer': TfidfVectorizer().fit(places['Description']),
    'user_mapping': dict(enumerate(users['User_Id'])),
    'place_mapping': dict(enumerate(places['Place_Id']))
}, 'hybrid_recommender.joblib')

['hybrid_recommender.joblib']

## Contoh Inference

In [33]:
# Load model menggunakan joblib
rec_model = joblib.load('hybrid_recommender.joblib')
hybrid_model = rec_model['model']
mlb = rec_model['content_transformers']
tfidf = rec_model['tfidf_vectorizer']
user_mapping = rec_model['user_mapping']
place_mapping = rec_model['place_mapping']

In [34]:
# Reverse Mapping
inv_user_mapping = {v: k for k, v in user_mapping.items()}
inv_place_mapping = {v: k for k, v in place_mapping.items()}

In [35]:
# Load your data (Di server aslinya nanti akan ngambil data dari Database Wistara)
users = pd.read_csv('users_synthesized.csv')
places = pd.read_csv('tourism_place_synthesized.csv')

In [36]:
def prepare_user_features(user_row):
    """Mengconvert feature dari seorang user secara individu jadi input untuk ke model"""
    # Memproses Kategori
    user_cats = mlb.transform([ast.literal_eval(user_row['Preferred_Categories'])])
    
    # Proses Tags
    user_tags = mlb.transform([ast.literal_eval(user_row['Interest_Tags'])])
    
    # Satukan fitur
    return np.hstack([user_cats, ])

# Sebenarnya user feature nya ada Location Age Coordinates Visited_Places Price_Range Interest_Tags Preferred_Categories Min_Rating

In [37]:
def get_place_details(place_ids):
    """Mengambil seluruh data dari satu tempat yang akan direkomendasikan"""
    return places[places['Place_Id'].isin(place_ids)]

In [46]:
# TEST REKOMENDASI UNTUK SEORANG USER

# Pilih Seorang User (9 itu index user-nya)
test_user_id = users.iloc[211]['User_Id']  
print(f"Testing recommendations for user: {test_user_id}")

# Reverse mapping semua data si user-nya
user_idx = inv_user_mapping[test_user_id]

# Generate rekomendasi ()
recommended_place_indices = hybrid_model.recommend(user_idx, k=5)

# Convert balik id yang direkomendasikan ke id tempat tersebut
recommended_place_ids = [place_mapping[idx] for idx in recommended_place_indices]

# Get detail tempat tempat yang direkomendasikan
recommendations = get_place_details(recommended_place_ids)


Testing recommendations for user: 212


In [47]:
# Display results
print("\nTop Recommendations:")
display(recommendations[['Place_Id', 'Place_Name', 'Category', 'City', 'Rating', 'Price', 'Tags']])


Top Recommendations:


Unnamed: 0,Place_Id,Place_Name,Category,City,Rating,Price,Tags
33,34,Pasar Taman Puring,Pusat Perbelanjaan,Jakarta,12.2,0,"museum, local, art, island, free"
42,43,The Escape Hunt,Taman Hiburan,Jakarta,12.2,70000,"heritage, extreme, forest, adventure, family"
85,86,Keraton Yogyakarta,Budaya,Yogyakarta,14.2,15000,"cultural, extreme, forest, festival, religious"
211,212,Jalan Braga,Budaya,Bandung,15.2,0,"romantic, modern, solo, camping, free"


In [40]:
# Show user's preferences for comparison
print("\nUser Preferences:")
user_prefs = users[users['User_Id'] == test_user_id][[
    'Preferred_Categories', 'Interest_Tags', 'Price_Range', 'Min_Rating'
]]
display(user_prefs)


User Preferences:


Unnamed: 0,Preferred_Categories,Interest_Tags,Price_Range,Min_Rating
9,"['Budaya', 'Taman Hiburan']","['cultural', 'family', 'family']",10890-110890,4.5


Contoh Format Response saat server diimplementasikan :

```
{
  "user_id": 12345,
  "recommendations": [
    {
      "place_id": 678,
      "name": "Tanah Lot Temple",
      "category": "Cultural",
      "city": "Bali",
      "rating": 4.8,
      "price_range": "IDR 50,000-100,000",
      "match_score": 0.92
    }
  ],
  "context": {
    "model_version": "1.2",
    "generated_at": "2023-12-25T14:30:00Z"
  }
}
```

## Evaluation

In [51]:
def evaluate_recommendations(model, test_users, ground_truth, k=5):
    """
    Evaluasi model rekomendasi
    :param model: Model yang sudah di-trained
    :param test_users: Daftar user ID untuk testing
    :param ground_truth: Dict {user_id: list[place_id]} interaksi aktual
    :param k: Jumlah rekomendasi
    """
    precisions = []
    recalls = []
    ndcgs = []
    
    for user_id in test_users:
        # Dapatkan rekomendasi
        user_idx = inv_user_mapping[user_id]
        recommended = set(model.recommend(user_idx, k=k))
        
        # Dapatkan ground truth
        actual = set(ground_truth.get(user_id, []))
        
        # Hitung metrik
        relevant = recommended & actual
        precision = len(relevant) / k
        recall = len(relevant) / len(actual) if len(actual) > 0 else 0
        
        # Hitung NDCG
        ideal_ranking = sorted(actual, reverse=True)
        user_scores = [1 if p in actual else 0 for p in recommended]
        ndcg = ndcg_score([ideal_ranking], [user_scores])
        
        precisions.append(precision)
        recalls.append(recall)
        ndcgs.append(ndcg)
    
    return {
        'precision@k': np.mean(precisions),
        'recall@k': np.mean(recalls),
        'ndcg@k': np.mean(ndcgs)
    }

# Contoh penggunaan
test_user_ids = users.sample(10)['User_Id'].tolist()
ground_truth = {
    # Contoh data: {user_id: [place_id1, place_id2]}
    123: [45, 89, 120],
    456: [78, 92],
    # ...
}

metrics = evaluate_recommendations(hybrid_model, test_user_ids, ground_truth)
print(f"Precision@5: {metrics['precision@k']:.2f}")
print(f"Recall@5: {metrics['recall@k']:.2f}")
print(f"NDCG@5: {metrics['ndcg@k']:.2f}")

ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required.