# Indo EcoTourism — Content‑Based Filtering (CBF)

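Notebook ini membangun model rekomendasi berbasis konten dan menyimpan artefak berikut:

- `artifacts/vectorizer.joblib` — TF‑IDF vectorizer
- `artifacts/tfidf_matrix.npz` — matriks fitur item (sparse)
- `artifacts/nbrs_cosine.joblib` — (opsional) indeks NearestNeighbors cosine
- `artifacts/items.csv` — metadata item yang sudah dibersihkan
- `artifacts/metadata.json` — ringkasan artefak (waktu pembuatan, jumlah item, jumlah fitur)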

## 1) Setup & Install

In [1]:
# !pip install -q pandas numpy scikit-learn scipy joblib
import os, re, json, math, random, warnings, datetime as dt
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import save_npz, load_npz
import joblib

# Fixed seed shared by Python's and NumPy's global random generators.
SEED = 13
np.random.seed(SEED)
random.seed(SEED)

# Artifacts are written to an "artifacts" folder under the working directory.
BASE_DIR = Path(".")
ART_DIR = BASE_DIR.joinpath("artifacts")
ART_DIR.mkdir(parents=True, exist_ok=True)

print(f"Artifacts dir: {ART_DIR.resolve()}")

Artifacts dir: /content/artifacts


## 2) Siapkan Data
Pilih salah satu: Opsi A — letakkan `eco_place.csv` di path yang sudah diketahui; Opsi B — unggah file melalui Colab; atau Opsi C — gunakan sel Kaggle (opsional).

In [2]:
# Option A — explicit path (recommended when re-running outside Colab).
DATA_CSV_PATH = Path("./eco_place.csv")  # change if the file lives elsewhere

# Report whether the dataset is present; nothing is loaded at this point.
if DATA_CSV_PATH.exists():
    print("[OK] Ditemukan:", DATA_CSV_PATH.resolve())
else:
    print("[Info] eco_place.csv tidak ditemukan di:", DATA_CSV_PATH.resolve())
    print("       Unggah file atau jalankan Opsi B (Colab upload) atau Opsi C (Kaggle).")

[OK] Ditemukan: /content/eco_place.csv


In [5]:
# Read the raw dataset, failing with an explicit error when the CSV is absent.
if DATA_CSV_PATH.exists():
    df = pd.read_csv(DATA_CSV_PATH)
else:
    raise FileNotFoundError("eco_place.csv tidak ditemukan. Pastikan file tersedia melalui Opsi A/B/C.")
print("Shape:", df.shape)
df.head(3)

Shape: (182, 13)


Unnamed: 0,place_id,place_name,place_description,category,city,price,rating,description_location,place_img,gallery_photo_img1,gallery_photo_img2,gallery_photo_img3,place_map
0,1,Taman Nasional Gunung Leuser,Taman Nasional Gunung Leuser adalah salah satu...,"Budaya,Taman Nasional",Aceh,"Rp25,000",4.5,"Barisan mountain range, Aceh 24653",https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://www.google.com/maps/search/Taman+Nasio...
1,2,Desa Wisata Munduk,Desa Wisata Munduk adalah sebuah desa di pegun...,Desa Wisata,Bali,"Rp10,000",4.5,"Munduk, Banjar, Kabupaten Buleleng, Bali",https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://goo.gl/maps/LyeJ2mAeFGysTE9v9
2,3,Desa Wisata Penglipuran,Desa Wisata Penglipuran adalah sebuah desa wis...,"Budaya,Desa Wisata",Bali,"Rp25,000",4.8,"Jl. Penglipuran, Kubu, Kec. Bangli, Kabupaten ...",https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://www.google.com/maps/search/Desa+Wisata...


## 3) Cleaning & Normalization

In [6]:
# Detach from the loaded frame before mutating it.
df = df.copy()

# Make sure every column used by the later cells exists; absent ones are created as NaN.
for col in (
    "place_name", "place_description", "category", "city",
    "price", "rating", "place_img", "place_map",
):
    if col in df.columns:
        continue
    df[col] = np.nan

# Drop rows that are identical across all columns, then renumber the index from 0.
before = len(df)
df = df.drop_duplicates()
df = df.reset_index(drop=True)
print(f"Duplicates removed: {before - len(df)}")

# Clean price (IDR)
def parse_price_idr(x):
    """Convert a free-form Indonesian Rupiah price into a float amount.

    Values that are already numeric are returned unchanged (as float), so the
    function can safely be applied again to an already-parsed column. Going
    through the text path would strip the decimal point of "25000.0" and
    yield 250000.

    For text, the "Rp" prefix and every "." / "," are removed (both are
    treated as thousand separators), and the word "gratis" (free) is read as
    0, in any letter case. Each remaining run of digits counts as one amount;
    when several are present (e.g. a range such as "10000-20000") their
    average, truncated to whole rupiah, is returned.

    Args:
        x: Raw price; a number, anything `str()` can render, or a missing value.

    Returns:
        The amount as a float, or NaN when `x` is missing, not finite, or
        contains no digits.
    """
    if pd.isna(x):
        return np.nan
    try:
        # bool is a subclass of int but is not a price; let it fall through to the text path.
        if isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(x, bool):
            value = float(x)
            return value if np.isfinite(value) else np.nan

        text = re.sub(r"rp", "", str(x), flags=re.IGNORECASE)
        text = re.sub(r"[.,]", "", text)  # remove thousand separators
        text = re.sub(r"gratis", "0", text, flags=re.IGNORECASE)

        amounts = [int(digits) for digits in re.findall(r"\d+", text)]
        if not amounts:
            return np.nan
        return float(sum(amounts) // len(amounts))  # average if range
    except (ValueError, OverflowError):
        # Digit runs too long for int()/float() are treated as unparseable.
        return np.nan

# Turn the raw price values into numeric rupiah amounts.
df["price"] = df["price"].map(parse_price_idr)

# Ratings that cannot be read as numbers become NaN.
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

# Text columns: missing values become empty strings and every entry is cast to str.
for c in ("place_description", "category", "city", "place_name"):
    filled = df[c].fillna("")
    df[c] = filled.astype(str)

# Per-column count of remaining missing values, followed by a preview of the cleaned rows.
print("Nulls after cleaning:")
print(df.isna().sum())
df.head(3)

Duplicates removed: 0
Nulls after cleaning:
place_id                 0
place_name               0
place_description        0
category                 0
city                     0
price                    0
rating                   0
description_location     0
place_img                0
gallery_photo_img1       0
gallery_photo_img2       2
gallery_photo_img3      77
place_map                0
dtype: int64


Unnamed: 0,place_id,place_name,place_description,category,city,price,rating,description_location,place_img,gallery_photo_img1,gallery_photo_img2,gallery_photo_img3,place_map
0,1,Taman Nasional Gunung Leuser,Taman Nasional Gunung Leuser adalah salah satu...,"Budaya,Taman Nasional",Aceh,25000.0,4.5,"Barisan mountain range, Aceh 24653",https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://www.google.com/maps/search/Taman+Nasio...
1,2,Desa Wisata Munduk,Desa Wisata Munduk adalah sebuah desa di pegun...,Desa Wisata,Bali,10000.0,4.5,"Munduk, Banjar, Kabupaten Buleleng, Bali",https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://goo.gl/maps/LyeJ2mAeFGysTE9v9
2,3,Desa Wisata Penglipuran,Desa Wisata Penglipuran adalah sebuah desa wis...,"Budaya,Desa Wisata",Bali,25000.0,4.8,"Jl. Penglipuran, Kubu, Kec. Bangli, Kabupaten ...",https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://storage.googleapis.com/travelee-capsto...,https://www.google.com/maps/search/Desa+Wisata...


## 4) Text Preprocessing — ringan (tanpa stemming)

In [7]:
# Indonesian stopwords removed during preprocessing.
STOPWORDS_ID = {
    "ada", "adalah", "agar", "akan", "antara", "atau", "banyak", "beberapa",
    "belum", "berbagai", "bila", "bisa", "bukan", "dalam", "dan", "dapat",
    "dari", "dengan", "di", "hanya", "harus", "hingga", "ini", "itu", "jika",
    "juga", "kah", "kami", "kamu", "karena", "ke", "kemudian", "kepada",
    "lah", "lain", "lainnya", "lalu", "lebih", "masih", "mereka", "mungkin",
    "namun", "nya", "oleh", "pada", "para", "pernah", "pun", "saat", "saja",
    "sampai", "sangat", "sebagai", "sebuah", "seluruh", "semua", "serta",
    "setiap", "suatu", "sudah", "supaya", "tanpa", "tapi", "tentang", "tentu",
    "terhadap", "tiap", "untuk", "yaitu", "yakni", "yang",
}
# Words that are always kept, even though they also appear in STOPWORDS_ID.
IMPORTANT_WORDS = {"di", "ke", "dari", "untuk", "dengan", "yang"}


def preprocess_text(text: str) -> str:
    """Lowercase *text*, split it into word tokens and drop stopwords.

    Tokens are runs of word characters (letters, digits, underscore), so
    punctuation and whitespace act as separators. A token is dropped only
    when it is in STOPWORDS_ID and not in IMPORTANT_WORDS.

    Args:
        text: Input value; it is converted with `str()` first.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in re.findall(r"\w+", str(text).lower(), flags=re.UNICODE):
        if token in IMPORTANT_WORDS or token not in STOPWORDS_ID:
            kept.append(token)
    return " ".join(kept)

# Text each item is indexed by: description, category and city joined by spaces, then preprocessed.
df["gabungan"] = [
    preprocess_text(" ".join(parts))
    for parts in zip(
        df["place_description"].fillna(""),
        df["category"].fillna(""),
        df["city"].fillna(""),
    )
]

# Quick sanity check: combined text next to the columns it was built from.
df[["place_name", "category", "city", "gabungan"]].head(5)

Unnamed: 0,place_name,category,city,gabungan
0,Taman Nasional Gunung Leuser,"Budaya,Taman Nasional",Aceh,taman nasional gunung leuser salah satu dari e...
1,Desa Wisata Munduk,Desa Wisata,Bali,desa wisata munduk desa di pegunungan bali yan...
2,Desa Wisata Penglipuran,"Budaya,Desa Wisata",Bali,desa wisata penglipuran desa wisata yang terle...
3,Taman Nasional Bali Barat,"Taman Nasional,Cagar Alam",Bali,taman nasional bali barat kawasan konservasi a...
4,Bukit Jamur,Cagar Alam,Bandung,bukit jamur ciwidey satu dari sekian pesona wi...


## 5) Feature Extraction — TF‑IDF

In [8]:
# TF-IDF settings: unigrams only, at most 5000 features, min_df=2 / max_df=0.9
# document-frequency bounds, sublinear tf scaling and L2-normalised rows.
tfidf_params = {
    "max_features": 5000,
    "ngram_range": (1, 1),
    "min_df": 2,
    "max_df": 0.9,
    "sublinear_tf": True,
    "norm": "l2",
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit the vocabulary and build the item-by-term matrix in one step.
tfidf_matrix = vectorizer.fit_transform(df["gabungan"].fillna(""))
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6210 stored elements and shape (182, 795)>

## 6) Build NearestNeighbors (cosine, brute)

In [9]:
# Brute-force cosine neighbour index over the TF-IDF rows (50 neighbours per query by default).
nbrs = NearestNeighbors(algorithm="brute", metric="cosine", n_neighbors=50).fit(tfidf_matrix)
print(nbrs)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=50)


## 7) Save Artifacts

In [10]:
from scipy.sparse import csr_matrix

# Item metadata saved next to the feature matrix; rows are in the same order as tfidf_matrix.
items_cols = ["place_name","place_img","place_map","category","city","rating","price","gabungan"]
items = df[items_cols].copy()
items.to_csv(ART_DIR / "items.csv", index=False)

# Feature matrix, fitted vectorizer and neighbour index.
save_npz(ART_DIR / "tfidf_matrix.npz", csr_matrix(tfidf_matrix))
joblib.dump(vectorizer, ART_DIR / "vectorizer.joblib")
joblib.dump(nbrs, ART_DIR / "nbrs_cosine.joblib")

meta = {
    # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated); the
    # "+00:00" offset is rewritten to "Z" so the stored format is unchanged.
    "created_at": dt.datetime.now(dt.timezone.utc).isoformat().replace("+00:00", "Z"),
    "n_items": int(items.shape[0]),
    "n_features": int(tfidf_matrix.shape[1]),
    "vectorizer": "sklearn TfidfVectorizer",
    "neighbors": "sklearn NearestNeighbors(metric=cosine, algorithm=brute)",
}
# Explicit encoding so the file does not depend on the platform default.
with open(ART_DIR / "metadata.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print("[OK] Artifacts saved under ./artifacts")
list(ART_DIR.iterdir())

[OK] Artifacts saved under ./artifacts


  "created_at": dt.datetime.utcnow().isoformat() + "Z",


[PosixPath('artifacts/items.csv'),
 PosixPath('artifacts/tfidf_matrix.npz'),
 PosixPath('artifacts/metadata.json'),
 PosixPath('artifacts/nbrs_cosine.joblib'),
 PosixPath('artifacts/vectorizer.joblib')]

## 8) Fungsi Rekomendasi (tanpa login)

In [12]:
def recommend_place(
    input_text: str,
    keywords=None,
    category: str|None=None,
    city: str|None=None,
    max_price: float|None=None,
    top_n: int=10,
    threshold: float=0.3,
    diversify_per_category: int=3,
    serendipity_pct: int=20,  # 0..30
) -> list[dict]:
    """Recommend places whose text is similar to a free-text query.

    Relies on the module-level `items`, `vectorizer`, `tfidf_matrix` and
    `preprocess_text` built earlier in this notebook. All item references
    inside the function are row positions in `items` / `tfidf_matrix`.

    Steps:
      1. Apply the explicit filters (category, city, max_price).
      2. Rank the remaining items by cosine similarity to the query, fetching
         up to ``top_n * 4`` neighbours.
      3. Keep neighbours with similarity >= `threshold`, with at most
         `diversify_per_category` items per primary category (the first
         comma-separated entry of `category`). Each kept item scores
         ``similarity + 0.2 * keyword_hits``.
      4. If fewer than `top_n` items were kept, run one more pass with the
         threshold lowered by 0.1 (not below 0), adding only items that were
         not selected yet.
      5. Add a few random "serendipity" items from the filtered set with
         score 0.0, so they only surface when the list is not already full.
      6. Return the `top_n` highest-scoring items.

    Args:
        input_text: Free-text query; preprocessed like the item text.
        keywords: Optional extra keywords; each one found as a substring of an
            item's combined text adds 0.2 to that item's score (as does the
            whole preprocessed query).
        category: Optional case-insensitive filter on the `category` column.
            NOTE(review): passed to pandas `str.contains` with its default
            settings, so the value is interpreted as a regular expression.
        city: Optional case-insensitive filter on the `city` column (same
            matching as `category`).
        max_price: Optional upper price bound; items without a price are
            excluded when it is given.
        top_n: Maximum number of results; values <= 0 yield an empty list.
        threshold: Minimum cosine similarity for the first pass.
        diversify_per_category: Per-category cap; 0 disables it.
        serendipity_pct: Share of the filtered items (clamped to 0..30 %) that
            may be added as serendipity picks, further limited to
            ``max(1, top_n // 5)``.

    Returns:
        A list of dicts with keys `place_name`, `place_img`, `place_map`,
        `category`, `city`, `rating`, `price` and `combined_score`, ordered
        by descending score. Empty when no item passes the filters.
    """
    # NearestNeighbors rejects n_neighbors < 1, so there is nothing to return here.
    if top_n <= 0:
        return []
    if keywords is None:
        keywords = []
    processed_input_text = preprocess_text(input_text)
    input_tfidf = vectorizer.transform([processed_input_text])

    # Explicit filters, combined into one boolean mask over all items.
    mask = np.ones(items.shape[0], dtype=bool)
    if category:
        mask &= items["category"].str.contains(category, case=False, na=False).values
    if city:
        mask &= items["city"].str.contains(city, case=False, na=False).values
    if max_price is not None:
        # Missing prices become +inf so they never satisfy the bound.
        price_col = items["price"].fillna(np.inf).values.astype(float)
        mask &= price_col <= float(max_price)

    filtered_idx = np.flatnonzero(mask)
    if filtered_idx.size == 0:
        return []

    # Keyword hits: the whole preprocessed query plus each preprocessed keyword,
    # matched as substrings of the item's combined text. Empty strings are ignored.
    all_keywords = [
        kw
        for kw in [processed_input_text, *(preprocess_text(k) for k in keywords)]
        if kw
    ]
    item_text = items["gabungan"].iloc[filtered_idx]
    kcounts = [sum(1 for kw in all_keywords if kw in str(text)) for text in item_text]

    # Neighbour search restricted to the filtered rows; local index -> filtered_idx -> item position.
    sub_mat = tfidf_matrix[filtered_idx]
    sub_nbrs = NearestNeighbors(
        n_neighbors=min(top_n * 4, sub_mat.shape[0]),
        metric="cosine",
        algorithm="brute",
    )
    sub_nbrs.fit(sub_mat)
    distances, indices = sub_nbrs.kneighbors(input_tfidf)

    scores = {}    # item position -> combined score
    seen_cat = {}  # primary category -> number of selected items
    category_col = items["category"]

    def _collect(min_sim):
        """Select not-yet-chosen neighbours with similarity >= min_sim, honouring the category cap."""
        for dist, i_local in zip(distances[0], indices[0]):
            i_global = int(filtered_idx[i_local])
            # Skipping already selected items keeps the relaxed pass from counting
            # them against the per-category quota a second time.
            if i_global in scores:
                continue
            sim = 1.0 - float(dist)
            if sim < min_sim:
                continue
            cat_main = str(category_col.iloc[i_global]).split(",")[0].strip()
            if diversify_per_category and seen_cat.get(cat_main, 0) >= diversify_per_category:
                continue
            seen_cat[cat_main] = seen_cat.get(cat_main, 0) + 1
            scores[i_global] = sim + 0.2 * float(kcounts[i_local])

    _collect(threshold)
    # Relax the threshold once if the first pass did not fill the list.
    if len(scores) < top_n and threshold > 0.0:
        _collect(max(0.0, threshold - 0.1))

    # Serendipity: random unselected items from the filtered set, scored 0.0.
    # NOTE(review): the previous code sorted this pool by rating and then shuffled
    # it, so the sort had no effect on which items could be drawn; confirm whether
    # a rating-biased pick was intended.
    ser_pct = max(0, min(int(serendipity_pct), 30))
    n_ser = max(0, min(max(1, top_n // 5), int(len(filtered_idx) * ser_pct / 100)))
    if n_ser > 0:
        pool = [i for i in filtered_idx.tolist() if i not in scores]
        random.shuffle(pool)
        for i in pool[:n_ser]:
            scores[i] = 0.0

    # Sort by score (stable, so ties keep selection order) and take top_n.
    pairs = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)[:top_n]

    results = []
    for i, sc in pairs:
        row = items.iloc[i]
        results.append({
            "place_name": row["place_name"],
            "place_img": row["place_img"],
            "place_map": row["place_map"],
            "category": row["category"],
            "city": row["city"],
            "rating": float(row["rating"]) if not pd.isna(row["rating"]) else None,
            "price": float(row["price"]) if not pd.isna(row["price"]) else None,
            "combined_score": float(sc),
        })
    return results


# Smoke test: free-text query plus two extra keywords, no category/city/price filter.
demo = recommend_place("pantai aceh", keywords=["pantai","laut"], top_n=5, threshold=0.3)
# Display only the first two recommendations.
demo[:2]

[{'place_name': 'Pantai Indrayanti ',
  'place_img': 'https://storage.googleapis.com/travelee-capstone-projects.appspot.com/places/156.%20Pantai%20Indrayanti/main/20160709111429-transformed.jpeg',
  'place_map': 'https://www.google.com/maps/search/Pantai+Indrayanti+',
  'category': 'Bahari,Cagar Alam',
  'city': 'Yogyakarta',
  'rating': 4.5,
  'price': 10000.0,
  'combined_score': 0.7634276573233735},
 {'place_name': 'Pantai Timang',
  'place_img': 'https://storage.googleapis.com/travelee-capstone-projects.appspot.com/places/170.%20Pantai%20Timang/main/681.jpg',
  'place_map': 'https://www.google.com/maps/search/Pantai+Timang',
  'category': 'Bahari,Cagar Alam',
  'city': 'Yogyakarta',
  'rating': 4.5,
  'price': 10000.0,
  'combined_score': 0.7599838072425308}]

## 9) Selesai
Artefak siap dipakai oleh aplikasi **Streamlit** (`app.py`).