In [15]:
import pandas as pd
import numpy as np
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import random

In [4]:
place_df = pd.read_csv('tourism_place.csv')
user_df = pd.read_csv('user.csv')

In [5]:
# Drop 'Time_Minutes'
place_df.drop(columns=["Time_Minutes"], inplace=True)

# Add 'Tags' to place_df (initially empty or based on simple extraction from Category/Description)
place_df["Tags"] = ""

# Add 'Visited_Places' and 'Travel_Preference' to user_df
user_df["Visited_Places"] = [[] for _ in range(len(user_df))]
user_df["Travel_Preference"] = [[] for _ in range(len(user_df))]

In [6]:
place_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Place_Id     437 non-null    int64  
 1   Place_Name   437 non-null    object 
 2   Description  437 non-null    object 
 3   Category     437 non-null    object 
 4   City         437 non-null    object 
 5   Price        437 non-null    int64  
 6   Rating       437 non-null    float64
 7   Coordinate   437 non-null    object 
 8   Lat          437 non-null    float64
 9   Long         437 non-null    object 
 10  Tags         437 non-null    object 
dtypes: float64(2), int64(2), object(7)
memory usage: 37.7+ KB


In [7]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   User_Id            300 non-null    int64 
 1   Location           300 non-null    object
 2   Age                300 non-null    int64 
 3   Visited_Places     300 non-null    object
 4   Travel_Preference  300 non-null    object
dtypes: int64(2), object(3)
memory usage: 11.8+ KB


In [11]:
# Initialize stemmer and stopword remover
stemmer = StemmerFactory().create_stemmer()
stop_factory = StopWordRemoverFactory()
stop_words = set(stop_factory.get_stop_words())

# Basic text cleaning
def clean_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    text = stemmer.stem(text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply to description
place_df["Clean_Description"] = place_df["Description"].astype(str).apply(clean_text)

In [12]:
# Initialize TF-IDF
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(place_df["Clean_Description"])

# Map feature names (words)
feature_names = tfidf.get_feature_names_out()

# Function to extract top N keywords (tags)
def extract_tags(row_index, top_n=3):
    row = tfidf_matrix[row_index]
    scores = row.toarray().flatten()
    top_indices = scores.argsort()[-top_n:][::-1]
    return [feature_names[i] for i in top_indices if scores[i] > 0]

# Apply to all descriptions
place_df["Tags"] = [extract_tags(i) for i in range(len(place_df))]

In [13]:
place_df.head(10)

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Coordinate,Lat,Long,Tags,Clean_Description
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,"{'lat': -6.1753924, 'lng': 106.8271528}",-617.539,1.068.272,"[monumen, monas, tugu]",monumen nasional populer singkat monas tugu mo...
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,"{'lat': -6.137644799999999, 'lng': 106.8171245}",-613.764,1.068.171,"[alunalun, tua, mi]",kota tua jakarta nama kota tua pusat alunalun ...
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta,270000,4.6,"{'lat': -6.125312399999999, 'lng': 106.8335377}",-612.531,1.068.335,"[dufan, fantasi, 1985]",dunia fantasi sebut dufan tempat hibur letak k...
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,4.5,"{'lat': -6.302445899999999, 'lng': 106.8951559}",-630.245,1.068.952,"[ls106, 68, 53]",taman mini indonesia indah rupa suatu kawasan ...
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta,94000,4.5,"{'lat': -6.12419, 'lng': 106.839134}",-612.419,1.068.391,"[atlantis, tualang, ancol]",atlantis water adventure kenal atlantis ancol ...
5,6,Taman Impian Jaya Ancol,Taman Impian Jaya Ancol merupakan sebuah objek...,Taman Hiburan,Jakarta,25000,4.5,"{'lat': -6.117333200000001, 'lng': 106.8579951}",-611.733,106.858,"[utarann, impi, jaya]",taman impi jaya ancol rupa buah objek wisata j...
6,7,Kebun Binatang Ragunan,Kebun Binatang Ragunan adalah sebuah kebun bin...,Cagar Alam,Jakarta,4000,4.5,"{'lat': -6.3124593, 'lng': 106.8201865}",-631.246,1.068.202,"[binatang, ragunan, kebun]",kebun binatang ragunan buah kebun binatang let...
7,8,Ocean Ecopark,Ocean Ecopark Salah satu zona rekreasi Ancol y...,Taman Hiburan,Jakarta,180000,4.0,"{'lat': -6.125801699999999, 'lng': 106.8363249}",-61.258,1.068.363,"[ocean, harga, outbondholic]",ocean ecopark salah satu zona rekreasi ancol t...
8,9,Pelabuhan Marina,Pelabuhan Marina Ancol berada di kawasan Taman...,Bahari,Jakarta,175000,4.4,"{'lat': 1.07888, 'lng': 103.931398}",107.888,1.039.314,"[ancol, marina, labuh]",labuh marina ancol kawasan taman impi jaya anc...
9,10,Pulau Tidung,Pulau Tidung adalah salah satu kelurahan di ke...,Bahari,Jakarta,150000,4.5,"{'lat': -5.803205300000001, 'lng': 106.5237907}",-580.321,1.065.238,"[pulau, ribu, tidung]",pulau tidung salah satu lurah camat pulau ribu...


In [21]:
place_ids = place_df["Place_Id"].tolist()

def generate_visited():
    return random.sample(place_ids, k=random.randint(1, 5))

# Use unique categories from your dataset
categories = place_df["Category"].unique().tolist()

# Flatten the tag list and remove duplicates
all_tags = [tag for sublist in place_df["Tags"] for tag in sublist]
unique_tags = list(set(all_tags))

# Combine category and tag pools
combined_preferences = categories + unique_tags

# Generate user preferences
def generate_user_preferences(k=4):
    return random.sample(combined_preferences, k)


user_df["Travel_Preference"] = user_df["User_Id"].apply(lambda _: generate_user_preferences())
user_df["Visited_Places"] = user_df["User_Id"].apply(lambda _: generate_visited())

In [22]:
user_df.head(19)

Unnamed: 0,User_Id,Location,Age,Visited_Places,Travel_Preference
0,1,"Semarang, Jawa Tengah",20,"[379, 155, 372]","[pok, kreo, ngupasan, mata]"
1,2,"Bekasi, Jawa Barat",21,"[117, 102, 398, 106]","[pecinan, rajin, grote, masa]"
2,3,"Cirebon, Jawa Barat",23,"[170, 383, 101, 373, 130]","[binatang, teras, loket, drini]"
3,4,"Bekasi, Jawa Barat",21,"[315, 301, 316]","[honda, north, cibodas, kwan]"
4,5,"Lampung, Sumatera Selatan",20,"[356, 233]","[tebing, simulasi, glodok, geoforest]"
5,6,"Jakarta Utara, DKI Jakarta",18,"[110, 374]","[baya, la, nuarta, kerep]"
6,7,"Jakarta Selatan, DKI Jakarta",39,"[385, 381, 355, 51, 148]","[klinthing, kayu, tenun, resort]"
7,8,"Bandung, Jawa Barat",40,"[304, 185, 221]","[unit, mi, stone, of]"
8,9,"Surabaya, Jawa Timur",38,"[434, 328]","[im, gethuk, candi, pos]"
9,10,"Bekasi, Jawa Barat",39,"[236, 225, 49, 108]","[abang, ruang, ptt, digital]"


In [23]:
unique_categories = place_df["Category"].unique().tolist()
print(unique_categories)

['Budaya', 'Taman Hiburan', 'Cagar Alam', 'Bahari', 'Pusat Perbelanjaan', 'Tempat Ibadah']


In [24]:
user_df.to_csv('users_synthesized.csv')
place_df.to_csv('tourism_place_synthesized.csv')