In [1]:
import pandas as pd
from datetime import datetime
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib # Untuk menyimpan/memuat model
import numpy as np # Untuk operasi array seperti argsort



In [2]:
# Untuk TensorFlow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


2025-06-03 23:29:27.872057: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748968167.885461  166633 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748968167.889397  166633 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748968167.899315  166633 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748968167.899329  166633 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748968167.899330  166633 computation_placer.cc:177] computation placer alr

In [3]:
# --- Inisialisasi NLTK Stopwords dan Sastrawi Stemmer ---
# Pastikan Anda telah mendownload stopwords. Jika belum, uncomment baris di bawah dan jalankan sekali:
import nltk
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words_id = set(stopwords.words('indonesian'))

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

print("Memulai proses persiapan data dan pelatihan model...")

Memulai proses persiapan data dan pelatihan model...


[nltk_data] Downloading package stopwords to /home/crxtan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# --- Fungsi Pra-pemrosesan Teks ---
def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [word for word in words if word not in stop_words_id]
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

# Fungsi Standarisasi Nama untuk Penggabungan
def standardize_name_for_merge(name):
    if pd.isna(name):
        return ''
    name = str(name).lower()
    name = re.sub(r'[^\w\s]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name


In [5]:
# --- BAGIAN 1: MUAT DAN PRA-PEMROSESAN DATASET ---

# 1.1 Muat Dataset Pertama (Cleaned_data.csv) menjadi df
try:
    df = pd.read_csv('Downloads/Cleaned_data.csv')
    print("\nDataset 'Cleaned_data.csv' berhasil dimuat ke df.")

    # Penanganan nilai hilang di kolom teks yang akan digabungkan
    df['name'].fillna('', inplace=True)
    df['description'].fillna('', inplace=True)
    df['review_keywords'].fillna('', inplace=True)
    df['address'].fillna('', inplace=True) # Untuk standarisasi place_name gabungan

    # Buat kolom 'content' gabungan awal untuk Content-Based Filtering
    df['content'] = df['name'] + ' ' + \
                            df['description'] + ' ' + \
                            df['review_keywords']
    df['processed_content'] = df['content'].apply(preprocess_text)

    print("Kolom konten awal untuk df berhasil diproses.")

except FileNotFoundError:
    print("Error: Pastikan file 'Cleaned_data.csv' ada di direktori yang sama.")
    exit()
except Exception as e:
    print(f"Terjadi kesalahan saat memproses 'Cleaned_data.csv': {e}")
    exit()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['name'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['description'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves


Dataset 'Cleaned_data.csv' berhasil dimuat ke df.
Kolom konten awal untuk df berhasil diproses.


In [6]:
# 1.2 Muat Dataset Kedua (SentimentReview.csv) menjadi df_review
try:
    df_review = pd.read_csv('Downloads/SentimentReview.csv')
    print("\nDataset 'SentimentReview.csv' berhasil dimuat ke df_review.")

    # Konversi 'published_at_date' ke datetime
    df_review['published_at_datetime'] = pd.to_datetime(df_review['published_at_date'])

    # Pastikan 'is_local_guide' menjadi integer (0/1)
    df_review['is_local_guide'] = df_review['is_local_guide'].fillna(0).astype(int)

    # Isi NaN pada 'text_akhir' dengan string kosong (teks yang sudah diproses)
    df_review['text_akhir'].fillna('', inplace=True)

    print("Kolom tanggal, is_local_guide, dan text_akhir di df_review berhasil diproses.")

except FileNotFoundError:
    print("Error: Pastikan file 'SentimentReview.csv' ada di direktori yang sama.")
    exit()
except Exception as e:
    print(f"Terjadi kesalahan saat memproses 'SentimentReview.csv': {e}")
    exit()

print("\nKedua dataset telah dimuat dan diproses secara awal.")


Dataset 'SentimentReview.csv' berhasil dimuat ke df_review.
Kolom tanggal, is_local_guide, dan text_akhir di df_review berhasil diproses.

Kedua dataset telah dimuat dan diproses secara awal.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_review['text_akhir'].fillna('', inplace=True)


In [7]:
# --- BAGIAN 2: PENGGABUNGAN DATASET BERDASARKAN place_name ---

print("\nMelakukan standarisasi 'place_name' dan membuat kunci gabungan untuk merge...")


Melakukan standarisasi 'place_name' dan membuat kunci gabungan untuk merge...


In [8]:
# 2.1 Standarisasi di df (Dataset Pertama)
df['name_standardized'] = df['name'].apply(standardize_name_for_merge)
df['address_standardized'] = df['address'].apply(standardize_name_for_merge)
df['merge_key'] = df['name_standardized'] + '_' + df['address_standardized']

In [9]:
# 2.2 Standarisasi di df_review (Dataset Kedua)
df_review['place_name_standardized'] = df_review['place_name'].apply(standardize_name_for_merge)
df_review['merge_key'] = df_review['place_name_standardized'] # Kunci gabungan hanya place_name

print("Standarisasi nama dan pembuatan kunci gabungan selesai.")


Standarisasi nama dan pembuatan kunci gabungan selesai.


In [10]:
# 2.3 Agregasi data ulasan dari df_review per tempat wisata
df_reviews_agg = df_review.groupby('merge_key').agg(
    avg_review_rating=('rating', 'mean'),
    total_reviews_from_raw=('rating', 'count'),
    avg_polarity_score=('polarity_score', 'mean'),
    positive_reviews_count=('polarity', lambda x: (x == 'positive').sum()),
    negative_reviews_count=('polarity', lambda x: (x == 'negative').sum()),
    all_review_texts=('text_akhir', lambda x: ' '.join(x.dropna().astype(str)))
).reset_index()

print("Dataset ulasan diagregasi per tempat wisata.")
print(df_reviews_agg.head())


Dataset ulasan diagregasi per tempat wisata.
                   merge_key  avg_review_rating  total_reviews_from_raw  \
0  101 nusa lima beach resto           4.084507                      71   
1         adhi pradana beach           4.428571                      14   
2             agal waterfall           4.818182                      44   
3     agro wisata dream land           3.412281                     114   
4          air terjun bintan           4.147239                     163   

   avg_polarity_score  positive_reviews_count  negative_reviews_count  \
0            1.436620                      53                      18   
1           -1.071429                       7                       7   
2           -1.363636                      31                      13   
3           -1.535088                      57                      57   
4           -0.822086                     104                      59   

                                    all_review_texts  
0  wisata 

In [11]:
# 2.4 Gabungkan df dengan df_reviews_agg
df_merged = pd.merge(df, df_reviews_agg, on='merge_key', how='left')

print("\nPenggabungan dataset selesai.")
print("Informasi df_merged setelah penggabungan:")
df_merged.info()


Penggabungan dataset selesai.
Informasi df_merged setelah penggabungan:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   place_id                2392 non-null   object 
 1   name                    2392 non-null   object 
 2   description             2392 non-null   object 
 3   reviews                 2392 non-null   int64  
 4   rating                  2392 non-null   float64
 5   featured_image          2391 non-null   object 
 6   address                 2392 non-null   object 
 7   review_keywords         2392 non-null   object 
 8   link                    2392 non-null   object 
 9   coordinates             2392 non-null   object 
 10  content                 2392 non-null   object 
 11  processed_content       2392 non-null   object 
 12  name_standardized       2392 non-null   object 
 13  address_standardized

In [12]:
# 2.5 Finalisasi Fitur Konten untuk Content-Based Model
# Isi NaN dari kolom baru setelah merge
df_merged['avg_review_rating'].fillna(df_merged['avg_review_rating'].mean(), inplace=True)
df_merged['total_reviews_from_raw'].fillna(0, inplace=True)
df_merged['avg_polarity_score'].fillna(0, inplace=True)
df_merged['positive_reviews_count'].fillna(0, inplace=True)
df_merged['negative_reviews_count'].fillna(0, inplace=True)
df_merged['all_review_texts'].fillna('', inplace=True)

# Perbarui 'final_processed_content' (teks dari df + teks dari ulasan yang sudah bersih)
df_merged['final_processed_content'] = df_merged['processed_content'] + ' ' + df_merged['all_review_texts']

print("\nFitur konten akhir (final_processed_content) untuk Content-Based model siap.")



Fitur konten akhir (final_processed_content) untuk Content-Based model siap.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_merged['avg_review_rating'].fillna(df_merged['avg_review_rating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_merged['total_reviews_from_raw'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work becau

In [13]:
# --- BAGIAN 3: PELATIHAN DAN PENYIMPANAN MODEL CONTENT-BASED ---

print("\nMelatih dan menyimpan model Content-Based...")

# 3.1 Inisialisasi dan latih TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.85)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_merged['final_processed_content'])



Melatih dan menyimpan model Content-Based...


In [14]:
# 3.2 Hitung Cosine Similarity Matrix
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [15]:
# 3.3 Buat mapping place_id ke index
indices = pd.Series(df_merged.index, index=df_merged['place_id']).drop_duplicates()


In [16]:
# 3.4 Simpan Model Content-Based dan metadata yang diperlukan
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(cosine_sim_matrix, 'cosine_sim_matrix.pkl')
joblib.dump(indices, 'place_indices_map.pkl')

['place_indices_map.pkl']

In [17]:
# --- PERBAIKAN DI SINI: HAPUS 'polarity' dari daftar kolom untuk df_places_metadata ---
df_merged_metadata = df_merged[['place_id', 'name', 'description', 'reviews', 'rating', 'featured_image',
                                'address', 'review_keywords', 'link', 'coordinates', 'avg_review_rating',
                                'total_reviews_from_raw', 'avg_polarity_score',
                                'positive_reviews_count', 'negative_reviews_count', 'final_processed_content', 'merge_key']]
df_merged_metadata.to_pickle('df_places_metadata.pkl')

print("Model Content-Based (TF-IDF Vectorizer, Cosine Similarity Matrix) dan metadata tempat wisata telah disimpan.")


Model Content-Based (TF-IDF Vectorizer, Cosine Similarity Matrix) dan metadata tempat wisata telah disimpan.


In [18]:
# --- BAGIAN 4: PERSIAPAN DAN PELATIHAN MODEL COLLABORATIVE FILTERING (TensorFlow Matrix Factorization) ---

print("\nMenyiapkan dan melatih model Collaborative Filtering (TensorFlow Matrix Factorization)...")

# 4.1 Siapkan data untuk TensorFlow
cf_data_tf = df_review[['user_id', 'merge_key', 'rating']].copy()
cf_data_tf.dropna(subset=['rating'], inplace=True)

# Map user_id dan item_id (merge_key) ke integer kontigu
user_ids = cf_data_tf['user_id'].unique()
item_ids = cf_data_tf['merge_key'].unique()

num_users = len(user_ids)
num_items = len(item_ids)

user_id_mapping = {id: i for i, id in enumerate(user_ids)}
item_id_mapping = {id: i for i, id in enumerate(item_ids)}

# Buat DataFrame dengan ID integer yang dipetakan
cf_data_tf['user_encoded'] = cf_data_tf['user_id'].map(user_id_mapping)
cf_data_tf['item_encoded'] = cf_data_tf['merge_key'].map(item_id_mapping)

# Pisahkan data menjadi fitur (input) dan target (rating)
x = cf_data_tf[['user_encoded', 'item_encoded']].values
y = cf_data_tf['rating'].values

# Pisahkan data menjadi training dan testing set
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)



Menyiapkan dan melatih model Collaborative Filtering (TensorFlow Matrix Factorization)...


In [19]:
# 4.2 Definisikan Model Matrix Factorization dengan Keras
embedding_size = 50 # Ukuran embedding (bisa disesuaikan)

user_input = keras.Input(shape=(1,), name='user_id')
item_input = keras.Input(shape=(1,), name='item_id')

# Embedding layer untuk user
user_embedding = layers.Embedding(
    input_dim=num_users,
    output_dim=embedding_size,
    input_length=1,
    name='user_embedding'
)(user_input)
user_vec = layers.Flatten(name='user_vector')(user_embedding)

# Embedding layer untuk item
item_embedding = layers.Embedding(
    input_dim=num_items,
    output_dim=embedding_size,
    input_length=1,
    name='item_embedding'
)(item_input)
item_vec = layers.Flatten(name='item_vector')(item_embedding)

# Dot product dari embedding user dan item untuk prediksi rating
dot_product = layers.Dot(axes=1, name='dot_product')([user_vec, item_vec])

# Optional: Tambahkan bias untuk user dan item (membantu akurasi)
user_bias = layers.Embedding(num_users, 1, name='user_bias')(user_input)
item_bias = layers.Embedding(num_items, 1, name='item_bias')(item_input)
dot_product_with_bias = layers.Add()([dot_product, layers.Flatten()(user_bias), layers.Flatten()(item_bias)])

# Output layer (aktivasi sigmoid untuk skala rating 1-5, lalu scaling)
# Jika rating asli 1-5, sigmoid output 0-1, maka dikalikan (MAX_RATING - MIN_RATING) dan ditambah MIN_RATING
min_rating = 1.0 # Sesuaikan dengan rating minimum Anda
max_rating = 5.0 # Sesuaikan dengan rating maksimum Anda

output = layers.Activation('sigmoid')(dot_product_with_bias) # Output range 0-1
output = layers.Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(output) # Scale to min_rating-max_rating

model_cf = keras.Model(inputs=[user_input, item_input], outputs=output)

# Kompilasi model (gunakan Mean Squared Error sebagai loss untuk rating)
model_cf.compile(optimizer='adam', loss='mean_squared_error')

I0000 00:00:1748968521.005213  166633 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8868 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6


In [20]:
# 4.3 Latih Model
history = model_cf.fit(
    [x_train[:, 0], x_train[:, 1]], # Input user_encoded, item_encoded
    y_train,
    batch_size=64,
    epochs=10, # Sesuaikan jumlah epoch
    validation_data=([x_val[:, 0], x_val[:, 1]], y_val),
    verbose=1
)

# 4.4 Simpan Model TensorFlow
model_cf.save('cf_tf_matrix_factorization_model.h5')

# Simpan juga mapping ID
joblib.dump(user_id_mapping, 'user_id_mapping.pkl')
joblib.dump(item_id_mapping, 'item_id_mapping.pkl')
joblib.dump(user_ids, 'unique_user_ids.pkl') # Simpan daftar user ID asli
joblib.dump(item_ids, 'unique_item_ids.pkl') # Simpan daftar item ID asli (merge_key)

print("Model Collaborative Filtering (TensorFlow) telah disimpan.")

Epoch 1/10


I0000 00:00:1748968536.900706  169800 service.cc:152] XLA service 0x7c5f64002280 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1748968536.900722  169800 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2025-06-03 23:35:36.928375: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1748968537.009292  169800 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  60/3762[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m9s[0m 3ms/step - loss: 2.8264

I0000 00:00:1748968537.308231  169800 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - loss: 2.4676 - val_loss: 1.6241
Epoch 2/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 1.1895 - val_loss: 1.2354
Epoch 3/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.3372 - val_loss: 1.1609
Epoch 4/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.0917 - val_loss: 1.1679
Epoch 5/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 0.0624 - val_loss: 1.0891
Epoch 6/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 0.0459 - val_loss: 1.1404
Epoch 7/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 0.0369 - val_loss: 1.0911
Epoch 8/10
[1m3762/3762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 0.0343 - val_loss: 1.1513
Epoch 9/10
[1m3762/3762[0



Model Collaborative Filtering (TensorFlow) telah disimpan.


In [1]:
import pandas as pd
from datetime import datetime
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib # Untuk menyimpan/memuat model
import numpy as np # Untuk operasi array seperti argsort

# Untuk TensorFlow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# --- Inisialisasi NLTK Stopwords dan Sastrawi Stemmer ---
# Pastikan Anda telah mendownload stopwords. Jika belum, uncomment baris di bawah dan jalankan sekali:
# import nltk
# import ssl
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context
# nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words_id = set(stopwords.words('indonesian'))

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

print("Memulai proses persiapan data dan pelatihan model...")

# --- Fungsi Pra-pemrosesan Teks ---
def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [word for word in words if word not in stop_words_id]
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

# Fungsi Standarisasi Nama untuk Penggabungan
def standardize_name_for_merge(name):
    if pd.isna(name):
        return ''
    name = str(name).lower()
    name = re.sub(r'[^\w\s]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

# --- BAGIAN 1: MUAT DAN PRA-PEMROSESAN DATASET ---

# 1.1 Muat Dataset Pertama (Cleaned_data.csv) menjadi df
try:
    df = pd.read_csv('Cleaned_data.csv')
    print("\nDataset 'Cleaned_data.csv' berhasil dimuat ke df.")

    # Penanganan nilai hilang di kolom teks yang akan digabungkan
    df['name'].fillna('', inplace=True)
    df['description'].fillna('', inplace=True)
    df['review_keywords'].fillna('', inplace=True)
    df['address'].fillna('', inplace=True) # Untuk standarisasi place_name gabungan

    # Buat kolom 'content' gabungan awal untuk Content-Based Filtering
    df['content'] = df['name'] + ' ' + \
                            df['description'] + ' ' + \
                            df['review_keywords']
    df['processed_content'] = df['content'].apply(preprocess_text)

    print("Kolom konten awal untuk df berhasil diproses.")

except FileNotFoundError:
    print("Error: Pastikan file 'Cleaned_data.csv' ada di direktori yang sama.")
    exit()
except Exception as e:
    print(f"Terjadi kesalahan saat memproses 'Cleaned_data.csv': {e}")
    exit()

# 1.2 Muat Dataset Kedua (SentimentReview.csv) menjadi df_review
try:
    df_review = pd.read_csv('SentimentReview.csv')
    print("\nDataset 'SentimentReview.csv' berhasil dimuat ke df_review.")

    # Konversi 'published_at_date' ke datetime
    df_review['published_at_datetime'] = pd.to_datetime(df_review['published_at_date'])

    # Pastikan 'is_local_guide' menjadi integer (0/1)
    df_review['is_local_guide'] = df_review['is_local_guide'].fillna(0).astype(int)

    # Isi NaN pada 'text_akhir' dengan string kosong (teks yang sudah diproses)
    df_review['text_akhir'].fillna('', inplace=True)

    print("Kolom tanggal, is_local_guide, dan text_akhir di df_review berhasil diproses.")

except FileNotFoundError:
    print("Error: Pastikan file 'SentimentReview.csv' ada di direktori yang sama.")
    exit()
except Exception as e:
    print(f"Terjadi kesalahan saat memproses 'SentimentReview.csv': {e}")
    exit()

print("\nKedua dataset telah dimuat dan diproses secara awal.")

# --- BAGIAN 2: PENGGABUNGAN DATASET BERDASARKAN place_name ---

print("\nMelakukan standarisasi 'place_name' dan membuat kunci gabungan untuk merge...")

# 2.1 Standarisasi di df (Dataset Pertama)
df['name_standardized'] = df['name'].apply(standardize_name_for_merge)
df['address_standardized'] = df['address'].apply(standardize_name_for_merge)
df['merge_key'] = df['name_standardized'] + '_' + df['address_standardized']

# 2.2 Standarisasi di df_review (Dataset Kedua)
df_review['place_name_standardized'] = df_review['place_name'].apply(standardize_name_for_merge)
df_review['merge_key'] = df_review['place_name_standardized'] # Kunci gabungan hanya place_name

print("Standarisasi nama dan pembuatan kunci gabungan selesai.")

# 2.3 Agregasi data ulasan dari df_review per tempat wisata
df_reviews_agg = df_review.groupby('merge_key').agg(
    avg_review_rating=('rating', 'mean'),
    total_reviews_from_raw=('rating', 'count'),
    avg_polarity_score=('polarity_score', 'mean'),
    positive_reviews_count=('polarity', lambda x: (x == 'positive').sum()),
    negative_reviews_count=('polarity', lambda x: (x == 'negative').sum()),
    all_review_texts=('text_akhir', lambda x: ' '.join(x.dropna().astype(str)))
).reset_index()

print("Dataset ulasan diagregasi per tempat wisata.")
print(df_reviews_agg.head())

# 2.4 Gabungkan df dengan df_reviews_agg
df_merged = pd.merge(df, df_reviews_agg, on='merge_key', how='left')

print("\nPenggabungan dataset selesai.")
print("Informasi df_merged setelah penggabungan:")
df_merged.info()

# 2.5 Finalisasi Fitur Konten untuk Content-Based Model
# Isi NaN dari kolom baru setelah merge
df_merged['avg_review_rating'].fillna(df_merged['avg_review_rating'].mean(), inplace=True)
df_merged['total_reviews_from_raw'].fillna(0, inplace=True)
df_merged['avg_polarity_score'].fillna(0, inplace=True)
df_merged['positive_reviews_count'].fillna(0, inplace=True)
df_merged['negative_reviews_count'].fillna(0, inplace=True)
df_merged['all_review_texts'].fillna('', inplace=True)

# Perbarui 'final_processed_content' (teks dari df + teks dari ulasan yang sudah bersih)
df_merged['final_processed_content'] = df_merged['processed_content'] + ' ' + df_merged['all_review_texts']

print("\nFitur konten akhir (final_processed_content) untuk Content-Based model siap.")

# --- BAGIAN 3: PELATIHAN DAN PENYIMPANAN MODEL CONTENT-BASED ---

print("\nMelatih dan menyimpan model Content-Based...")

# 3.1 Inisialisasi dan latih TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.85)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_merged['final_processed_content'])

# 3.2 Hitung Cosine Similarity Matrix
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 3.3 Buat mapping place_id ke index
indices = pd.Series(df_merged.index, index=df_merged['place_id']).drop_duplicates()

# 3.4 Simpan Model Content-Based dan metadata yang diperlukan
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(cosine_sim_matrix, 'cosine_sim_matrix.pkl')
joblib.dump(indices, 'place_indices_map.pkl')

# --- PERBAIKAN DI SINI: HAPUS 'polarity' dari daftar kolom untuk df_places_metadata ---
df_merged_metadata = df_merged[['place_id', 'name', 'description', 'reviews', 'rating', 'featured_image',
                                'address', 'review_keywords', 'link', 'coordinates', 'avg_review_rating',
                                'total_reviews_from_raw', 'avg_polarity_score',
                                'positive_reviews_count', 'negative_reviews_count', 'final_processed_content', 'merge_key']]
df_merged_metadata.to_pickle('df_places_metadata.pkl')

print("Model Content-Based (TF-IDF Vectorizer, Cosine Similarity Matrix) dan metadata tempat wisata telah disimpan.")


# --- BAGIAN 4: PERSIAPAN DAN PELATIHAN MODEL COLLABORATIVE FILTERING (TensorFlow Matrix Factorization) ---

print("\nMenyiapkan dan melatih model Collaborative Filtering (TensorFlow Matrix Factorization)...")

# 4.1 Siapkan data untuk TensorFlow
cf_data_tf = df_review[['user_id', 'merge_key', 'rating']].copy()
cf_data_tf.dropna(subset=['rating'], inplace=True)

# Map user_id dan item_id (merge_key) ke integer kontigu
user_ids = cf_data_tf['user_id'].unique()
item_ids = cf_data_tf['merge_key'].unique()

num_users = len(user_ids)
num_items = len(item_ids)

user_id_mapping = {id: i for i, id in enumerate(user_ids)}
item_id_mapping = {id: i for i, id in enumerate(item_ids)}

# Buat DataFrame dengan ID integer yang dipetakan
cf_data_tf['user_encoded'] = cf_data_tf['user_id'].map(user_id_mapping)
cf_data_tf['item_encoded'] = cf_data_tf['merge_key'].map(item_id_mapping)

# Pisahkan data menjadi fitur (input) dan target (rating)
x = cf_data_tf[['user_encoded', 'item_encoded']].values
y = cf_data_tf['rating'].values

# Pisahkan data menjadi training dan testing set
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

# 4.2 Definisikan Model Matrix Factorization dengan Keras
embedding_size = 50 # Ukuran embedding (bisa disesuaikan)

user_input = keras.Input(shape=(1,), name='user_id')
item_input = keras.Input(shape=(1,), name='item_id')

# Embedding layer untuk user
user_embedding = layers.Embedding(
    input_dim=num_users,
    output_dim=embedding_size,
    input_length=1,
    name='user_embedding'
)(user_input)
user_vec = layers.Flatten(name='user_vector')(user_embedding)

# Embedding layer untuk item
item_embedding = layers.Embedding(
    input_dim=num_items,
    output_dim=embedding_size,
    input_length=1,
    name='item_embedding'
)(item_input)
item_vec = layers.Flatten(name='item_vector')(item_embedding)

# Dot product dari embedding user dan item untuk prediksi rating
dot_product = layers.Dot(axes=1, name='dot_product')([user_vec, item_vec])

# Optional: Tambahkan bias untuk user dan item (membantu akurasi)
user_bias = layers.Embedding(num_users, 1, name='user_bias')(user_input)
item_bias = layers.Embedding(num_items, 1, name='item_bias')(item_input)
dot_product_with_bias = layers.Add()([dot_product, layers.Flatten()(user_bias), layers.Flatten()(item_bias)])

# Output layer (aktivasi sigmoid untuk skala rating 1-5, lalu scaling)
# Jika rating asli 1-5, sigmoid output 0-1, maka dikalikan (MAX_RATING - MIN_RATING) dan ditambah MIN_RATING
min_rating = 1.0 # Sesuaikan dengan rating minimum Anda
max_rating = 5.0 # Sesuaikan dengan rating maksimum Anda

output = layers.Activation('sigmoid')(dot_product_with_bias) # Output range 0-1
output = layers.Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(output) # Scale to min_rating-max_rating

model_cf = keras.Model(inputs=[user_input, item_input], outputs=output)

# Kompilasi model (gunakan Mean Squared Error sebagai loss untuk rating)
model_cf.compile(optimizer='adam', loss='mean_squared_error')

# 4.3 Latih Model
history = model_cf.fit(
    [x_train[:, 0], x_train[:, 1]], # Input user_encoded, item_encoded
    y_train,
    batch_size=64,
    epochs=10, # Sesuaikan jumlah epoch
    validation_data=([x_val[:, 0], x_val[:, 1]], y_val),
    verbose=1
)

# 4.4 Simpan Model TensorFlow
model_cf.save('cf_tf_matrix_factorization_model.h5')

# Simpan juga mapping ID
joblib.dump(user_id_mapping, 'user_id_mapping.pkl')
joblib.dump(item_id_mapping, 'item_id_mapping.pkl')
joblib.dump(user_ids, 'unique_user_ids.pkl') # Simpan daftar user ID asli
joblib.dump(item_ids, 'unique_item_ids.pkl') # Simpan daftar item ID asli (merge_key)

print("Model Collaborative Filtering (TensorFlow) telah disimpan.")


# --- BAGIAN 5: MENYIAPKAN DATA TAMBAHAN UNTUK DEPLOYMENT ---

# 5.1 Siapkan DataFrame Interaksi Pengguna-Item Lengkap (untuk lookup di API)
df_user_interactions = df_review[['user_id', 'merge_key', 'rating', 'published_at_datetime', 'polarity_score', 'polarity']].copy()
df_user_interactions.rename(columns={'merge_key': 'item_id'}, inplace=True)
joblib.dump(df_user_interactions, 'df_user_interactions.pkl')
print("\nDataFrame interaksi pengguna-item (df_user_interactions) telah disimpan.")


# 5.2 Siapkan DataFrame Fitur Pengguna (untuk lookup profil pengguna di API)
df_user_features = df_review.groupby('user_id').agg(
    total_reviews_given=('rating', 'count'),
    total_photos_uploaded=('total_number_of_photos_by_reviewer', 'max'),
    is_local_guide=('is_local_guide', 'max'),
    avg_user_rating=('rating', 'mean'),
    avg_user_polarity_score=('polarity_score', 'mean')
).reset_index()
joblib.dump(df_user_features, 'df_user_features.pkl')
print("DataFrame fitur pengguna (df_user_features) telah disimpan.")


print("\n--- Proses Persiapan Data dan Pelatihan Model Selesai ---")
print("Anda sekarang memiliki file-file berikut yang siap untuk deployment:")
print("  - tfidf_vectorizer.pkl (Content-Based)")
print("  - cosine_sim_matrix.pkl (Content-Based)")
print("  - place_indices_map.pkl (Mapping ID tempat asli ke index DataFrame)")
print("  - df_places_metadata.pkl (Metadata tempat wisata yang diperkaya)")
print("  - cf_tf_matrix_factorization_model.h5 (Collaborative Filtering TensorFlow Model)")
print("  - user_id_mapping.pkl (Mapping ID pengguna asli ke ID integer model TF)")
print("  - item_id_mapping.pkl (Mapping ID item asli ke ID integer model TF)")
print("  - unique_user_ids.pkl (Daftar ID pengguna unik asli)")
print("  - unique_item_ids.pkl (Daftar ID item unik asli - merge_key)")
print("  - df_user_interactions.pkl (Riwayat interaksi pengguna)")
print("  - df_user_features.pkl (Profil fitur pengguna)")

print("\nUntuk deployment, Anda akan memuat file-file ini ke dalam backend web Anda.")

2025-06-03 23:13:14.378666: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748967194.391513  156823 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748967194.395793  156823 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748967194.405558  156823 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748967194.405571  156823 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748967194.405572  156823 computation_placer.cc:177] computation placer alr

Memulai proses persiapan data dan pelatihan model...
Error: Pastikan file 'Cleaned_data.csv' ada di direktori yang sama.
Error: Pastikan file 'SentimentReview.csv' ada di direktori yang sama.

Kedua dataset telah dimuat dan diproses secara awal.

Melakukan standarisasi 'place_name' dan membuat kunci gabungan untuk merge...


NameError: name 'df' is not defined