In [38]:
import pandas as pd
import numpy as np
import nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import random
import ast

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Fsl\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [32]:
place_df = pd.read_csv('destinasi-wisata-indonesia.csv')
user_df = pd.read_csv('user.csv')

In [35]:
# Drop
place_df.drop(columns=["Tags"], inplace=True)

In [36]:
place_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Place_Id      437 non-null    int64  
 1   Place_Name    437 non-null    object 
 2   Description   437 non-null    object 
 3   Category      437 non-null    object 
 4   City          437 non-null    object 
 5   Price         437 non-null    int64  
 6   Rating        437 non-null    float64
 7   Coordinate    437 non-null    object 
 8   Rating_Count  437 non-null    int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 30.9+ KB


In [37]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   User_Id   300 non-null    int64 
 1   Location  300 non-null    object
 2   Age       300 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 7.2+ KB


## Synthesize Place Data

In [40]:
place_df['Description'] = place_df['Description'].str.lower()  # Lowercase

In [43]:
# Indonesian stopwords
stopwords_indonesia = set(stopwords.words('indonesian'))

In [44]:
# Custom tokenizer for Indonesian
def tokenize_indonesian(text):
    tokens = text.split()
    tokens = [t for t in tokens if t not in stopwords_indonesia]
    return tokens

# TF-IDF for keyword extraction
tfidf = TfidfVectorizer(
    tokenizer=tokenize_indonesian,  # Simple whitespace tokenizer
    max_features=50
)
tfidf_matrix = tfidf.fit_transform(place_df['Description'])
keywords = tfidf.get_feature_names_out()

# Assign top 3 keywords as tags
def extract_tags(description):
    words = description.split()
    words = [w for w in words if w in keywords]
    return ', '.join(list(set(words))[:5])

place_df['Tags'] = place_df['Description'].apply(extract_tags)

In [45]:
place_df.head(10)

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Coordinate,Rating_Count,Tags
0,1,Monumen Nasional,monumen nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,14.2,"{'lat': -6.1753924, 'lng': 106.8271528}",18,"indonesia, monumen, tanggal, jakarta, terletak"
1,2,Kota Tua,"kota tua di jakarta, yang juga bernama kota tu...",Budaya,Jakarta,0,14.2,"{'lat': -6.137644799999999, 'lng': 106.8171245}",25,"bangunan, museum, jakarta, desa, kota"
2,3,Dunia Fantasi,dunia fantasi atau disebut juga dufan adalah t...,Taman Hiburan,Jakarta,270000,14.2,"{'lat': -6.125312399999999, 'lng': 106.8335377}",19,"indonesia., tanggal, kawasan, jakarta, terletak"
3,4,Taman Mini Indonesia Indah (TMII),taman mini indonesia indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,13.2,"{'lat': -6.302445899999999, 'lng': 106.8951559}",21,"indonesia, wisata, kawasan, jakarta, terletak"
4,5,Atlantis Water Adventure,atlantis water adventure atau dikenal dengan a...,Taman Hiburan,Jakarta,94000,13.2,"{'lat': -6.12419, 'lng': 106.839134}",24,"air, wisata, kawasan, luas, jakarta"
5,6,Taman Impian Jaya Ancol,taman impian jaya ancol merupakan sebuah objek...,Taman Hiburan,Jakarta,25000,13.2,"{'lat': -6.117333200000001, 'lng': 106.8579951}",24,"objek, taman, jakarta, wisata"
6,7,Kebun Binatang Ragunan,kebun binatang ragunan adalah sebuah kebun bin...,Cagar Alam,Jakarta,4000,13.2,"{'lat': -6.3124593, 'lng': 106.8201865}",22,"indonesia., daerah, pasar, kebun, jakarta"
7,8,Ocean Ecopark,ocean ecopark salah satu zona rekreasi ancol y...,Taman Hiburan,Jakarta,180000,4.0,"{'lat': -6.125801699999999, 'lng': 106.8363249}",30,"salah, rumah"
8,9,Pelabuhan Marina,pelabuhan marina ancol berada di kawasan taman...,Bahari,Jakarta,175000,12.2,"{'lat': 1.07888, 'lng': 103.931398}",19,"air, wisata, kawasan, pantai, pusat"
9,10,Pulau Tidung,pulau tidung adalah salah satu kelurahan di ke...,Bahari,Jakarta,150000,13.2,"{'lat': -5.803205300000001, 'lng': 106.5237907}",14,"kabupaten, kecamatan, salah, indonesia."


In [46]:
place_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Place_Id      437 non-null    int64  
 1   Place_Name    437 non-null    object 
 2   Description   437 non-null    object 
 3   Category      437 non-null    object 
 4   City          437 non-null    object 
 5   Price         437 non-null    int64  
 6   Rating        437 non-null    float64
 7   Coordinate    437 non-null    object 
 8   Rating_Count  437 non-null    int64  
 9   Tags          437 non-null    object 
dtypes: float64(1), int64(3), object(6)
memory usage: 34.3+ KB


## Synthesize User Data

In [22]:
user_df.head(19)

Unnamed: 0,User_Id,Location,Age,Visited_Places,Travel_Preference
0,1,"Semarang, Jawa Tengah",20,"[379, 155, 372]","[pok, kreo, ngupasan, mata]"
1,2,"Bekasi, Jawa Barat",21,"[117, 102, 398, 106]","[pecinan, rajin, grote, masa]"
2,3,"Cirebon, Jawa Barat",23,"[170, 383, 101, 373, 130]","[binatang, teras, loket, drini]"
3,4,"Bekasi, Jawa Barat",21,"[315, 301, 316]","[honda, north, cibodas, kwan]"
4,5,"Lampung, Sumatera Selatan",20,"[356, 233]","[tebing, simulasi, glodok, geoforest]"
5,6,"Jakarta Utara, DKI Jakarta",18,"[110, 374]","[baya, la, nuarta, kerep]"
6,7,"Jakarta Selatan, DKI Jakarta",39,"[385, 381, 355, 51, 148]","[klinthing, kayu, tenun, resort]"
7,8,"Bandung, Jawa Barat",40,"[304, 185, 221]","[unit, mi, stone, of]"
8,9,"Surabaya, Jawa Timur",38,"[434, 328]","[im, gethuk, candi, pos]"
9,10,"Bekasi, Jawa Barat",39,"[236, 225, 49, 108]","[abang, ruang, ptt, digital]"


In [23]:
unique_categories = place_df["Category"].unique().tolist()
print(unique_categories)

['Budaya', 'Taman Hiburan', 'Cagar Alam', 'Bahari', 'Pusat Perbelanjaan', 'Tempat Ibadah']


In [24]:
user_df.to_csv('users_synthesized.csv')
place_df.to_csv('tourism_place_synthesized.csv')