In [7]:
import pandas as pd
import numpy as np
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [3]:
# Read the two source tables (places, then users) from the working directory.
place_df, user_df = map(pd.read_csv, ('tourism_place.csv', 'user.csv'))

In [4]:
# Drop 'Time_Minutes'.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because the column has already been removed.
place_df = place_df.drop(columns=["Time_Minutes"], errors="ignore")

# Add a 'Tags' placeholder column to place_df (an empty string for every row).
place_df["Tags"] = ""

# Add 'Visited_Places' and 'Travel_Preference' to user_df.
# The comprehension creates a separate empty list per row, so mutating one
# user's list cannot affect another user's.
user_df["Visited_Places"] = [[] for _ in range(len(user_df))]
user_df["Travel_Preference"] = [[] for _ in range(len(user_df))]

In [5]:
# Sanity check: print place_df's columns, non-null counts, dtypes and memory usage.
place_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Place_Id     437 non-null    int64  
 1   Place_Name   437 non-null    object 
 2   Description  437 non-null    object 
 3   Category     437 non-null    object 
 4   City         437 non-null    object 
 5   Price        437 non-null    int64  
 6   Rating       437 non-null    float64
 7   Coordinate   437 non-null    object 
 8   Lat          437 non-null    float64
 9   Long         437 non-null    object 
 10  Tags         437 non-null    object 
dtypes: float64(2), int64(2), object(7)
memory usage: 37.7+ KB


In [6]:
# Sanity check: print user_df's columns, non-null counts, dtypes and memory usage.
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   User_Id            300 non-null    int64 
 1   Location           300 non-null    object
 2   Age                300 non-null    int64 
 3   Visited_Places     300 non-null    object
 4   Travel_Preference  300 non-null    object
dtypes: int64(2), object(3)
memory usage: 11.8+ KB


In [8]:
# Build the Sastrawi helpers used by the text-cleaning step:
# the stop-word collection (as a set, for fast membership tests) and the stemmer.
stop_factory = StopWordRemoverFactory()
stop_words = {*stop_factory.get_stop_words()}
stemmer = StemmerFactory().create_stemmer()

# Basic text cleaning
# Translation table mapping every ASCII punctuation character to a space, so
# that removing punctuation cannot glue neighbouring words together
# ("alam,budaya" -> "alam budaya" rather than "alambudaya").
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def clean_text(text):
    """Normalise one piece of text for keyword extraction.

    Steps: lowercase, replace punctuation with spaces, drop stop words,
    stem the remaining text with the module-level ``stemmer``, then drop
    stop words once more.

    Args:
        text: The raw text. Anything that is not a ``str`` (e.g. ``None`` or a
            float NaN from a missing cell) is treated as empty.

    Returns:
        A single space-separated string of cleaned tokens; ``""`` when nothing
        is left after cleaning.
    """
    if not isinstance(text, str):
        return ""

    tokens = text.lower().translate(_PUNCT_TO_SPACE).split()

    # Filter stop words *before* stemming: they are matched in their surface
    # form, and fewer tokens reach the stemmer (presumably the expensive step).
    tokens = [token for token in tokens if token not in stop_words]
    if not tokens:
        return ""

    stemmed = stemmer.stem(" ".join(tokens))

    # Filter again afterwards because a stem can itself be a stop word.
    return " ".join(word for word in stemmed.split() if word not in stop_words)

# Derive the cleaned-text column: cast each description to str, then run it
# through clean_text element by element.
place_df["Clean_Description"] = (
    place_df["Description"]
    .astype(str)
    .map(clean_text)
)

KeyboardInterrupt: 

In [None]:
# Initialize TF-IDF with default settings and fit it on the cleaned descriptions.
# Requires the "Clean_Description" column produced by the text-cleaning cell.
# tfidf_matrix has one row per entry of place_df["Clean_Description"].
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(place_df["Clean_Description"])

# Map feature names (words): feature_names[i] is the term belonging to
# column i of tfidf_matrix.
feature_names = tfidf.get_feature_names_out()

# Function to extract top N keywords (tags)
def extract_tags(row_index, top_n=3):
    """Return the highest-scoring TF-IDF terms for one row of ``tfidf_matrix``.

    Args:
        row_index: Positional index of the row in the module-level
            ``tfidf_matrix``.
        top_n: Maximum number of terms to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of terms taken from the module-level ``feature_names``, ordered
        from highest to lowest score. Terms with a score of 0 are omitted, so
        the list may contain fewer than ``top_n`` entries.
    """
    # Guard: `[-0:]` selects the *entire* array, so without this check
    # top_n == 0 would return every non-zero term instead of none, and a
    # negative top_n would slice from the wrong end.
    if top_n <= 0:
        return []

    row = tfidf_matrix[row_index]
    scores = row.toarray().flatten()
    # argsort is ascending: take the last top_n positions and reverse them.
    top_indices = scores.argsort()[-top_n:][::-1]
    return [feature_names[i] for i in top_indices if scores[i] > 0]

# Tag every place: call extract_tags once per row position and store the
# resulting keyword lists in the "Tags" column.
place_df["Tags"] = list(map(extract_tags, range(len(place_df))))