# Preprocessing Raw Datasets of Tourist Attractions and Users

## Import All Packages/Library

In [1]:
import os
import pandas as pd
from deep_translator import GoogleTranslator

## Data Preprocessing

#### **Step 1:** Displaying files in the "raw_data" folder and subfolders.
##### Objective: Identify the list of available datasets.

In [2]:
for filename in os.listdir("./raw_data"):
     print(filename)

dataset_pariwisata_yogyakarta
dataset_pariwisata_yogyakarta.zip
indonesia_tourism_destination
indonesia_tourism_destination.zip
yogyakarta_tourism_place
yogyakarta_tourism_place.zip


In [3]:
for filename in os.listdir("./raw_data/dataset_pariwisata_yogyakarta"):
     print(filename)

tour.csv
tour_rating.csv
user.csv


In [4]:
for filename in os.listdir("./raw_data/indonesia_tourism_destination"):
     print(filename)

package_tourism.csv
tourism_rating.csv
tourism_with_id.csv
user.csv


In [5]:
for filename in os.listdir("./raw_data/yogyakarta_tourism_place"):
     print(filename)

raw_data.csv


#### **Step 2:** Reading the data files.
##### Objective: Read the data files to ensure the data can be accessed in subsequent processes.

In [6]:
# Reading "dataset_pariwisata_yogyakarta"
## Reading "tour.csv", "tour_rating.csv", and "user.csv"
dpy_tourism = pd.read_csv("./raw_data/dataset_pariwisata_yogyakarta/tour.csv", delimiter=",")
dpy_rating = pd.read_csv("./raw_data/dataset_pariwisata_yogyakarta/tour_rating.csv", delimiter=",")
dpy_user = pd.read_csv("./raw_data/dataset_pariwisata_yogyakarta/user.csv", delimiter=",")

# Reading "indonesia_tourism_destination"
## Reading "package_tourism.csv", "tourism_rating.csv", "tourism_with_id", and "user.csv"
itd_package = pd.read_csv("./raw_data/indonesia_tourism_destination/package_tourism.csv", delimiter=",")
itd_rating = pd.read_csv("./raw_data/indonesia_tourism_destination/tourism_rating.csv", delimiter=",")
itd_tourism = pd.read_csv("./raw_data/indonesia_tourism_destination/tourism_with_id.csv", delimiter=",")
itd_user = pd.read_csv("./raw_data/indonesia_tourism_destination/user.csv", delimiter=",")

# Reading "yogyakarta_tourism_place"
## Reading "raw_data.csv"
ytp_data = pd.read_csv("./raw_data/yogyakarta_tourism_place/raw_data.csv", delimiter=",")

#### **Step 3:** Displaying a preview of the data files.
##### Objective: Understand the content of the data files to avoid errors in data usage during the analysis process.

Preview of **dataset_pariwisata_yogyakarta**

In [7]:
dpy_tourism.head()

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Latitude,Longitude
0,85.0,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,Yogyakarta,6000,4.5,120.0,"{'lat': -7.800671500000001, 'lng': 110.3676551}",-7.800671,110.367655
1,86.0,Keraton Yogyakarta,Keraton Ngayogyakarta Hadiningrat atau Keraton...,Budaya,Yogyakarta,15000,4.6,,"{'lat': -7.8052845, 'lng': 110.3642031}",-7.805284,110.364203
2,87.0,Sindu Kusuma Edupark (SKE),Sindu Kusuma Edupark (SKE) merupakan sebuah de...,Taman Hiburan,Yogyakarta,20000,4.2,120.0,"{'lat': -7.767297300000001, 'lng': 110.3542486}",-7.767297,110.354249
3,88.0,Museum Benteng Vredeburg Yogyakarta,Museum Benteng Vredeburg (bahasa Jawa: ???????...,Budaya,Yogyakarta,3000,4.6,120.0,"{'lat': -7.800201599999999, 'lng': 110.3663044}",-7.800202,110.366304
4,89.0,De Mata Museum Jogja,Museum De Mata merupakan salah satu museum yan...,Budaya,Yogyakarta,50000,4.4,,"{'lat': -7.816315599999999, 'lng': 110.3871442}",-7.816316,110.387144


In [8]:
dpy_rating.head()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,101,4
1,1,154,2
2,1,103,3
3,1,208,5
4,1,89,3


In [9]:
dpy_user.head()

Unnamed: 0,User_Id,Location,Age
0,1,"Semarang, Jawa Tengah",20.0
1,2,"Bekasi, Jawa Barat",21.0
2,3,"Cirebon, Jawa Barat",23.0
3,4,"Bekasi, Jawa Barat",21.0
4,5,"Lampung, Sumatera Selatan",20.0


Preview of **indonesia_tourism_destination**

In [10]:
itd_package.head()

Unnamed: 0,Package,City,Place_Tourism1,Place_Tourism2,Place_Tourism3,Place_Tourism4,Place_Tourism5
0,1,Jakarta,Pasar Tanah Abang,Taman Ayodya,Museum Tekstil,,
1,2,Jakarta,Pasar Tanah Abang,Pasar Taman Puring,Pasar Petak Sembilan,,
2,3,Jakarta,Perpustakaan Nasional,Monas,Masjid Istiqlal,,
3,4,Jakarta,Pulau Tidung,Pulau Bidadari,Pulau Pari,Pulau Pramuka,Pulau Pelangi
4,5,Jakarta,Museum Satria Mandala,Museum Wayang,Museum Bahari Jakarta,Museum Macan (Modern and Contemporary Art in N...,


In [11]:
itd_rating.head()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,179,3
1,1,344,2
2,1,5,5
3,1,373,3
4,1,101,4


In [12]:
itd_tourism.head()

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,Long,Unnamed: 11,Unnamed: 12
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,15.0,"{'lat': -6.1753924, 'lng': 106.8271528}",-6.175392,106.827153,,1
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,90.0,"{'lat': -6.137644799999999, 'lng': 106.8171245}",-6.137645,106.817125,,2
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta,270000,4.6,360.0,"{'lat': -6.125312399999999, 'lng': 106.8335377}",-6.125312,106.833538,,3
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,4.5,,"{'lat': -6.302445899999999, 'lng': 106.8951559}",-6.302446,106.895156,,4
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta,94000,4.5,60.0,"{'lat': -6.12419, 'lng': 106.839134}",-6.12419,106.839134,,5


In [13]:
itd_user.head()

Unnamed: 0,User_Id,Location,Age
0,1,"Semarang, Jawa Tengah",20
1,2,"Bekasi, Jawa Barat",21
2,3,"Cirebon, Jawa Barat",23
3,4,"Bekasi, Jawa Barat",21
4,5,"Lampung, Sumatera Selatan",20


Preview of **yogyakarta_tourism_place**

In [14]:
ytp_data.head()

Unnamed: 0,no,nama,vote_average,vote_count,type,htm_weekday,htm_weekend,latitude,longitude,description
0,9,Candi Borobudur,4.7,81922,Budaya dan Sejarah,50000.0,50000.0,-7.607087,110.203623,Candi yang pernah masuk sebagai salah satu dar...
1,10,Candi Prambanan,4.7,71751,Budaya dan Sejarah,50000.0,50000.0,-7.751835,110.491532,Candi Prambanan adalah kompleks candi Hindu te...
2,24,Tebing Breksi,4.4,51431,Alam,10000.0,10000.0,-7.781477,110.504576,Tebing Breksi merupakan tempat wisata yang ber...
3,343,Gembira Loka Zoo,4.5,36337,Buatan,20000.0,25000.0,-7.806234,110.396798,Gambira Loka adalah kebun binatang yang berada...
4,346,The Palace of Yogyakarta (Keraton Yogyakarta),4.6,30091,Budaya dan Sejarah,8000.0,8000.0,-7.805284,110.364203,Kompleks keraton merupakan museum yang menyimp...


#### **Step 4:** Preprocessing each dataset.
##### Objective: Ensure each dataset is consistent and usable before merging.

**4.1. Data Frame *dataset_pariwisata_yogyakarta***

4.1.1. Diplaying required data.

In [15]:
# Selecting only the required columns from the tourism dataset
tourism_dpy = dpy_tourism[["Place_Id", "Place_Name", "Description", "Category", "Price", "Rating", "Latitude", "Longitude"]]

# Renaming columns to ensure consistent naming conventions
tourism_dpy.rename(columns={
    "Place_Id": "place_id",
    "Place_Name": "place_name", 
    "Description": "description", 
    "Category": "category", 
    "Price": "price", 
    "Rating": "place_rating",
    "Latitude": "latitude", 
    "Longitude": "longitude"
    },inplace=True)

tourism_dpy.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tourism_dpy.rename(columns={


Unnamed: 0,place_id,place_name,description,category,price,place_rating,latitude,longitude
0,85.0,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655
1,86.0,Keraton Yogyakarta,Keraton Ngayogyakarta Hadiningrat atau Keraton...,Budaya,15000,4.6,-7.805284,110.364203
2,87.0,Sindu Kusuma Edupark (SKE),Sindu Kusuma Edupark (SKE) merupakan sebuah de...,Taman Hiburan,20000,4.2,-7.767297,110.354249
3,88.0,Museum Benteng Vredeburg Yogyakarta,Museum Benteng Vredeburg (bahasa Jawa: ???????...,Budaya,3000,4.6,-7.800202,110.366304
4,89.0,De Mata Museum Jogja,Museum De Mata merupakan salah satu museum yan...,Budaya,50000,4.4,-7.816316,110.387144


In [16]:
# Extracting user data, including their ratings for places
user_dpy = dpy_rating

# Renaming columns to ensure consistent naming conventions
user_dpy.rename(columns={
    "User_Id": "user_id", 
    "Place_Id": "place_id", 
    "Place_Ratings": "user_rating"
    },inplace=True)
user_dpy.head()

Unnamed: 0,user_id,place_id,user_rating
0,1,101,4
1,1,154,2
2,1,103,3
3,1,208,5
4,1,89,3


In [17]:
# Merging tourist attractions and users data frame
dpy_df = pd.merge(
    left = tourism_dpy,
    right = user_dpy,
    how = "inner",
    on = "place_id"
)
dpy_df.head()

Unnamed: 0,place_id,place_name,description,category,price,place_rating,latitude,longitude,user_id,user_rating
0,85.0,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,2,4
1,85.0,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,23,4
2,85.0,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,25,2
3,85.0,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,39,5
4,85.0,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,43,4


4.1.2. Assessing and cleaning data.

In [18]:
# Displaying information about the dataset to check data types, non-null counts, and potential missing values.
dpy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2870 entries, 0 to 2869
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   place_id      2870 non-null   float64
 1   place_name    2870 non-null   object 
 2   description   2870 non-null   object 
 3   category      2870 non-null   object 
 4   price         2870 non-null   int64  
 5   place_rating  2870 non-null   float64
 6   latitude      2870 non-null   float64
 7   longitude     2870 non-null   float64
 8   user_id       2870 non-null   int64  
 9   user_rating   2870 non-null   int64  
dtypes: float64(4), int64(3), object(3)
memory usage: 224.3+ KB


In [19]:
# Converting columns to appropriate data types for consistency across datasets.
dpy_df = dpy_df.astype({"place_id": "int64", "user_rating": "float64"})
dpy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2870 entries, 0 to 2869
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   place_id      2870 non-null   int64  
 1   place_name    2870 non-null   object 
 2   description   2870 non-null   object 
 3   category      2870 non-null   object 
 4   price         2870 non-null   int64  
 5   place_rating  2870 non-null   float64
 6   latitude      2870 non-null   float64
 7   longitude     2870 non-null   float64
 8   user_id       2870 non-null   int64  
 9   user_rating   2870 non-null   float64
dtypes: float64(4), int64(3), object(3)
memory usage: 224.3+ KB


In [20]:
# Checking summary statistics to identify potential inaccurate or outlier values.
dpy_df.describe()

Unnamed: 0,place_id,price,place_rating,latitude,longitude,user_id,user_rating
count,2870.0,2870.0,2870.0,2870.0,2870.0,2870.0,2870.0
mean,147.662021,19314.982578,4.465993,-7.891467,110.42458,152.132056,3.10453
std,36.950298,54103.679871,0.168172,0.166341,0.138182,86.489316,1.394187
min,85.0,0.0,4.0,-8.197894,110.019826,1.0,1.0
25%,115.0,2500.0,4.4,-8.017327,110.363751,78.0,2.0
50%,149.0,5000.0,4.5,-7.815963,110.418875,153.0,3.0
75%,179.0,10000.0,4.6,-7.79219,110.492909,227.0,4.0
max,210.0,500000.0,5.0,-7.58292,110.720854,300.0,5.0


In [21]:
# Subtracting 84 from place_id to restart from 1
dpy_df["place_id"] = (dpy_df["place_id"] - 84).astype("int64")
dpy_df.describe()

Unnamed: 0,place_id,price,place_rating,latitude,longitude,user_id,user_rating
count,2870.0,2870.0,2870.0,2870.0,2870.0,2870.0,2870.0
mean,63.662021,19314.982578,4.465993,-7.891467,110.42458,152.132056,3.10453
std,36.950298,54103.679871,0.168172,0.166341,0.138182,86.489316,1.394187
min,1.0,0.0,4.0,-8.197894,110.019826,1.0,1.0
25%,31.0,2500.0,4.4,-8.017327,110.363751,78.0,2.0
50%,65.0,5000.0,4.5,-7.815963,110.418875,153.0,3.0
75%,95.0,10000.0,4.6,-7.79219,110.492909,227.0,4.0
max,126.0,500000.0,5.0,-7.58292,110.720854,300.0,5.0


4.1.3. Insight
- The required columns for the model and application features are the place ID, name, description, category, price, average rating, latitude, and longitude, along with user data (ID and ratings).
- The final data frames containing the necessary information are derived from the original datasets.
- To ensure consistency, data types and values must be correct, so info() and describe() functions were used for validation.
- After reviewing the data information, I corrected the data types: int64 for IDs and price (since they should not contain decimals), object for text fields (name, description, category), and float64 for numerical fields like ratings, latitude, and longitude (since they are required for calculations).
- Additionally, after reviewing the data description, I adjusted the place ID values to start from 1 instead of 85, resulting the place ID range from 1-126. Meanwhile, the user ID range is 1-300.

**4.2. Data Frame *indonesia_tourism_destination***

4.2.1. Diplaying required data.

In [22]:
# Selecting only the required columns from the tourism dataset
tourism_itd = itd_tourism[["Place_Id", "Place_Name", "Description", "Category", "City", "Price", "Rating", "Lat", "Long"]]

# Renaming columns to ensure consistent naming conventions
tourism_itd.rename(columns={
    "Place_Id": "place_id",
    "Place_Name": "place_name", 
    "Description": "description", 
    "Category": "category", 
    "Price": "price", 
    "Rating": "place_rating",
    "Lat": "latitude", 
    "Long": "longitude"
    },inplace=True)

tourism_itd.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tourism_itd.rename(columns={


Unnamed: 0,place_id,place_name,description,category,City,price,place_rating,latitude,longitude
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,-6.175392,106.827153
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,-6.137645,106.817125
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta,270000,4.6,-6.125312,106.833538
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,4.5,-6.302446,106.895156
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta,94000,4.5,-6.12419,106.839134


In [23]:
# Extracting user data, including their ratings for places
user_itd = itd_rating

# Renaming columns to ensure consistent naming conventions
user_itd.rename(columns={
    "User_Id": "user_id", 
    "Place_Id": "place_id", 
    "Place_Ratings": "user_rating"
    },inplace=True)
user_itd.head()

Unnamed: 0,user_id,place_id,user_rating
0,1,179,3
1,1,344,2
2,1,5,5
3,1,373,3
4,1,101,4


In [24]:
# Merging tourist attractions and users data frame
itd_df = pd.merge(
    left = tourism_itd,
    right = user_itd,
    how = "inner",
    on = "place_id"
)
itd_df.head()

Unnamed: 0,place_id,place_name,description,category,City,price,place_rating,latitude,longitude,user_id,user_rating
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,-6.175392,106.827153,36,4
1,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,-6.175392,106.827153,38,2
2,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,-6.175392,106.827153,64,2
3,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,-6.175392,106.827153,74,2
4,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,-6.175392,106.827153,86,4


4.2.2. Assessing and cleaning data.

In [25]:
# The required data is only from Daerah Istimewa Yogyakarta, so checking the available cities is necessary.
itd_unique_cities = itd_df["City"].unique()
itd_unique_cities

array(['Jakarta', 'Yogyakarta', 'Bandung', 'Semarang', 'Surabaya'],
      dtype=object)

In [26]:
# The only city from Daerah Istimewa Yogyakarta is Yogyakarta, so the data should be filtered to include only tourist attractions from Yogyakarta city.
itd_df_yk = itd_df[itd_df["City"] == "Yogyakarta"]
itd_df_yk.head()

Unnamed: 0,place_id,place_name,description,category,City,price,place_rating,latitude,longitude,user_id,user_rating
1920,85,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,Yogyakarta,6000,4.5,-7.800671,110.367655,2,4
1921,85,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,Yogyakarta,6000,4.5,-7.800671,110.367655,23,4
1922,85,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,Yogyakarta,6000,4.5,-7.800671,110.367655,25,2
1923,85,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,Yogyakarta,6000,4.5,-7.800671,110.367655,39,5
1924,85,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,Yogyakarta,6000,4.5,-7.800671,110.367655,43,4


In [27]:
# Updating the columns to ensure consistency across datasets.
itd_df_yk = itd_df_yk[["place_id", "place_name", "description", "category", "price", "place_rating", "latitude", "longitude", "user_id", "user_rating"]]
itd_df_yk = itd_df_yk.reset_index(drop=True) # Restarting the index to start from 0.
itd_df_yk.head()

Unnamed: 0,place_id,place_name,description,category,price,place_rating,latitude,longitude,user_id,user_rating
0,85,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,2,4
1,85,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,23,4
2,85,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,25,2
3,85,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,39,5
4,85,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,43,4


In [28]:
# Displaying information about the dataset to check data types, non-null counts, and potential missing values.
itd_df_yk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2871 entries, 0 to 2870
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   place_id      2871 non-null   int64  
 1   place_name    2871 non-null   object 
 2   description   2871 non-null   object 
 3   category      2871 non-null   object 
 4   price         2871 non-null   int64  
 5   place_rating  2871 non-null   float64
 6   latitude      2871 non-null   float64
 7   longitude     2871 non-null   float64
 8   user_id       2871 non-null   int64  
 9   user_rating   2871 non-null   int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 224.4+ KB


In [29]:
# Converting columns to appropriate data types for consistency across datasets.
itd_df_yk = itd_df_yk.astype({"user_rating": "float64"})
itd_df_yk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2871 entries, 0 to 2870
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   place_id      2871 non-null   int64  
 1   place_name    2871 non-null   object 
 2   description   2871 non-null   object 
 3   category      2871 non-null   object 
 4   price         2871 non-null   int64  
 5   place_rating  2871 non-null   float64
 6   latitude      2871 non-null   float64
 7   longitude     2871 non-null   float64
 8   user_id       2871 non-null   int64  
 9   user_rating   2871 non-null   float64
dtypes: float64(4), int64(3), object(3)
memory usage: 224.4+ KB


In [30]:
# Checking summary statistics to identify potential inaccurate or outlier values.
itd_df_yk.describe()

Unnamed: 0,place_id,price,place_rating,latitude,longitude,user_id,user_rating
count,2871.0,2871.0,2871.0,2871.0,2871.0,2871.0,2871.0
mean,147.672936,19334.378265,4.46604,-7.891425,110.424602,152.079415,3.104493
std,36.94849,54104.235431,0.168162,0.166328,0.138163,86.520236,1.393946
min,85.0,0.0,4.0,-8.197894,110.019826,1.0,1.0
25%,115.0,2500.0,4.4,-8.017327,110.363751,78.0,2.0
50%,149.0,5000.0,4.5,-7.815963,110.418875,153.0,3.0
75%,179.0,10000.0,4.6,-7.79219,110.492909,227.0,4.0
max,210.0,500000.0,5.0,-7.58292,110.720854,300.0,5.0


In [31]:
# Since place and user data might come from different sources compared to the previous datasets, we treat the IDs individually. 
# Therefore, we need to restart the place and user IDs from the next number following the latest IDs from the previous datasets.

# Adding 42 to place_id to restart from 127.
itd_df_yk["place_id"] = (itd_df_yk["place_id"] + 42).astype("int64")
itd_df_yk.describe()

# Adding 300 to user_id to restart from 301.
itd_df_yk["user_id"] = (itd_df_yk["user_id"] + 300).astype("int64")
itd_df_yk.describe()

Unnamed: 0,place_id,price,place_rating,latitude,longitude,user_id,user_rating
count,2871.0,2871.0,2871.0,2871.0,2871.0,2871.0,2871.0
mean,189.672936,19334.378265,4.46604,-7.891425,110.424602,452.079415,3.104493
std,36.94849,54104.235431,0.168162,0.166328,0.138163,86.520236,1.393946
min,127.0,0.0,4.0,-8.197894,110.019826,301.0,1.0
25%,157.0,2500.0,4.4,-8.017327,110.363751,378.0,2.0
50%,191.0,5000.0,4.5,-7.815963,110.418875,453.0,3.0
75%,221.0,10000.0,4.6,-7.79219,110.492909,527.0,4.0
max,252.0,500000.0,5.0,-7.58292,110.720854,600.0,5.0


4.2.3. Insight
- The required columns for the model and application features are the place ID, name, description, category, price, average rating, latitude, and longitude, along with user data (ID and ratings).
- The final data frames containing the necessary information are derived from the original datasets.
- Since this dataset includes various city data, I added the city information to the dataframe.
- During the data assessment and cleaning process, I filtered out cities other than "Yogyakarta," as only Yogyakarta-related data is needed.
- To ensure consistency, data types and values must be correct, so info() and describe() functions were used for validation.
- After reviewing the data information, I corrected the data types: int64 for IDs and price (since they should not contain decimals), object for text fields (name, description, category), and float64 for numerical fields like ratings, latitude, and longitude (since they are required for calculations).
- Additionally, after reviewing the data description, I adjusted the place ID values to start from 127 instead of 85, resulting the place ID range from 127-252. I also adjusted the user ID values to start from 301 instead of 1, resulting the user ID range from 301-600.

**4.3. Data Frame *yogyakarta_tourism_place***

4.3.1. Diplaying required data.

In [32]:
ytp_data.head()

Unnamed: 0,no,nama,vote_average,vote_count,type,htm_weekday,htm_weekend,latitude,longitude,description
0,9,Candi Borobudur,4.7,81922,Budaya dan Sejarah,50000.0,50000.0,-7.607087,110.203623,Candi yang pernah masuk sebagai salah satu dar...
1,10,Candi Prambanan,4.7,71751,Budaya dan Sejarah,50000.0,50000.0,-7.751835,110.491532,Candi Prambanan adalah kompleks candi Hindu te...
2,24,Tebing Breksi,4.4,51431,Alam,10000.0,10000.0,-7.781477,110.504576,Tebing Breksi merupakan tempat wisata yang ber...
3,343,Gembira Loka Zoo,4.5,36337,Buatan,20000.0,25000.0,-7.806234,110.396798,Gambira Loka adalah kebun binatang yang berada...
4,346,The Palace of Yogyakarta (Keraton Yogyakarta),4.6,30091,Budaya dan Sejarah,8000.0,8000.0,-7.805284,110.364203,Kompleks keraton merupakan museum yang menyimp...


In [33]:
# Selecting only the required columns from the tourism dataset
ytp_df = ytp_data[["no", "nama", "description", "type", "htm_weekday", "htm_weekend", "vote_average", "latitude", "longitude"]]
# This dataset doesn't have user data.

# Renaming columns to ensure consistent naming conventions
ytp_df.rename(columns={
    "no": "place_id",
    "nama": "place_name", 
    "type": "category",
    "htm_weekday": "price_1",
    "htm_weekend": "price_2",
    "vote_average": "place_rating",
    },inplace=True)

ytp_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ytp_df.rename(columns={


Unnamed: 0,place_id,place_name,description,category,price_1,price_2,place_rating,latitude,longitude
0,9,Candi Borobudur,Candi yang pernah masuk sebagai salah satu dar...,Budaya dan Sejarah,50000.0,50000.0,4.7,-7.607087,110.203623
1,10,Candi Prambanan,Candi Prambanan adalah kompleks candi Hindu te...,Budaya dan Sejarah,50000.0,50000.0,4.7,-7.751835,110.491532
2,24,Tebing Breksi,Tebing Breksi merupakan tempat wisata yang ber...,Alam,10000.0,10000.0,4.4,-7.781477,110.504576
3,343,Gembira Loka Zoo,Gambira Loka adalah kebun binatang yang berada...,Buatan,20000.0,25000.0,4.5,-7.806234,110.396798
4,346,The Palace of Yogyakarta (Keraton Yogyakarta),Kompleks keraton merupakan museum yang menyimp...,Budaya dan Sejarah,8000.0,8000.0,4.6,-7.805284,110.364203


4.3.2. Assessing and cleaning data.

In [34]:
# Only one final price value is needed for analysis.
# Calculating the mean of price_1 and price_2 to get the final price value.
ytp_df["price"] = ytp_df[["price_1", "price_2"]].mean(axis=1)
ytp_df = ytp_df[["place_id", "place_name", "description", "category", "price", "place_rating", "latitude", "longitude"]]
ytp_df.head()

Unnamed: 0,place_id,place_name,description,category,price,place_rating,latitude,longitude
0,9,Candi Borobudur,Candi yang pernah masuk sebagai salah satu dar...,Budaya dan Sejarah,50000.0,4.7,-7.607087,110.203623
1,10,Candi Prambanan,Candi Prambanan adalah kompleks candi Hindu te...,Budaya dan Sejarah,50000.0,4.7,-7.751835,110.491532
2,24,Tebing Breksi,Tebing Breksi merupakan tempat wisata yang ber...,Alam,10000.0,4.4,-7.781477,110.504576
3,343,Gembira Loka Zoo,Gambira Loka adalah kebun binatang yang berada...,Buatan,22500.0,4.5,-7.806234,110.396798
4,346,The Palace of Yogyakarta (Keraton Yogyakarta),Kompleks keraton merupakan museum yang menyimp...,Budaya dan Sejarah,8000.0,4.6,-7.805284,110.364203


In [35]:
# Displaying information about the dataset to check data types, non-null counts, and potential missing values.
ytp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476 entries, 0 to 475
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   place_id      476 non-null    int64  
 1   place_name    476 non-null    object 
 2   description   144 non-null    object 
 3   category      147 non-null    object 
 4   price         134 non-null    float64
 5   place_rating  476 non-null    float64
 6   latitude      473 non-null    float64
 7   longitude     473 non-null    float64
dtypes: float64(4), int64(1), object(3)
memory usage: 29.9+ KB


In [36]:
# Converting columns to appropriate data types for consistency across datasets.

# Filling missing values in the price column with 0 to handle potential NaN values and ensure no errors during type conversion.
# Note: Other missing values in the dataset will be addressed after merging datasets.
ytp_df["price"] = ytp_df["price"].fillna(0)
ytp_df = ytp_df.astype({"price": "int64"})

ytp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476 entries, 0 to 475
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   place_id      476 non-null    int64  
 1   place_name    476 non-null    object 
 2   description   144 non-null    object 
 3   category      147 non-null    object 
 4   price         476 non-null    int64  
 5   place_rating  476 non-null    float64
 6   latitude      473 non-null    float64
 7   longitude     473 non-null    float64
dtypes: float64(3), int64(2), object(3)
memory usage: 29.9+ KB


In [37]:
# Checking summary statistics to identify potential inaccurate or outlier values.
ytp_df.describe()

Unnamed: 0,place_id,price,place_rating,latitude,longitude
count,476.0,476.0,476.0,473.0,473.0
mean,252.64916,6128.151261,4.445588,-7.822201,110.395391
std,146.79256,33377.085829,0.400055,0.139608,0.121214
min,1.0,0.0,1.0,-8.200347,109.913062
25%,125.75,0.0,4.3,-7.877766,110.353794
50%,252.5,0.0,4.5,-7.805448,110.398507
75%,379.25,3000.0,4.6,-7.748134,110.464673
max,510.0,500000.0,5.0,-7.565783,110.710377


In [38]:
# Since place and user data might come from different sources compared to the previous datasets, we treat the IDs individually. 
# Therefore, we need to restart the place and user IDs from the next number following the latest IDs from the previous datasets.

# Adding 252 to place_id to restart from 253.
ytp_df["place_id"] = (ytp_df["place_id"] + 252).astype("int64")
ytp_df.describe()

Unnamed: 0,place_id,price,place_rating,latitude,longitude
count,476.0,476.0,476.0,473.0,473.0
mean,504.64916,6128.151261,4.445588,-7.822201,110.395391
std,146.79256,33377.085829,0.400055,0.139608,0.121214
min,253.0,0.0,1.0,-8.200347,109.913062
25%,377.75,0.0,4.3,-7.877766,110.353794
50%,504.5,0.0,4.5,-7.805448,110.398507
75%,631.25,3000.0,4.6,-7.748134,110.464673
max,762.0,500000.0,5.0,-7.565783,110.710377


4.3.3. Insight
- The required columns for the model and application features are the place ID, name, description, category, price, average rating, latitude, and longitude, along with user data (ID and ratings).
- The final data frames containing the necessary information are derived from the original datasets.
- Since this dataset doesn't include user data, I didn't include it in the data frame.
- To ensure consistency, data types and values must be correct, so info() and describe() functions were used for validation.
- After reviewing the data information, I corrected the data types: int64 for ID and price (since they should not contain decimals), object for text fields (name, description, category), and float64 for numerical fields like ratings, latitude, and longitude (since they are required for calculations).
- Additionally, after reviewing the data description, I adjusted the place ID values to start from 253 instead of 1, resulting the place ID range from 253-762.

#### **Step 5:** Merging all cleaned datasets from step 4.
##### Goal: Combining all datasets into a unified structure to ensure a comprehensive and consistent dataset for analysis and application development.

In [39]:
# To ensure clarity and avoid confusion, the datasets are combined one by one in sequence.
# Merging dpy_df and itd_df_yk datasets row-wise since both have the same columns.
df = pd.concat([dpy_df, itd_df_yk], ignore_index=True)
df.head()

Unnamed: 0,place_id,place_name,description,category,price,place_rating,latitude,longitude,user_id,user_rating
0,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,2,4.0
1,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,23,4.0
2,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,25,2.0
3,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,39,5.0
4,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,43,4.0


In [40]:
# Ensure both datasets have the same columns by adding missing columns with NaN values if necessary
ytp_df = ytp_df.reindex(columns=df.columns)

# Now concatenate them row-wise
df = pd.concat([df, ytp_df], ignore_index=True)
df.head()

Unnamed: 0,place_id,place_name,description,category,price,place_rating,latitude,longitude,user_id,user_rating
0,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,2.0,4.0
1,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,23.0,4.0
2,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,25.0,2.0
3,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,39.0,5.0
4,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,4.5,-7.800671,110.367655,43.0,4.0


#### **Step 6:** Preprocessing merged dataset.
##### Goal: Clean the merged dataset by addressing any incorrect values to ensure consistency and readiness for modeling.

In [41]:
# Checking for duplicated rows in the dataset.
df.duplicated().sum()

46

In [42]:
# Handling duplicated rows by dropping the duplicates.
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [43]:
# Displaying information about the dataset to check data types, non-null counts, and potential missing or innacurate values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6171 entries, 0 to 6216
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   place_id      6171 non-null   int64  
 1   place_name    6171 non-null   object 
 2   description   5839 non-null   object 
 3   category      5842 non-null   object 
 4   price         6171 non-null   int64  
 5   place_rating  6171 non-null   float64
 6   latitude      6168 non-null   float64
 7   longitude     6168 non-null   float64
 8   user_id       5695 non-null   float64
 9   user_rating   5695 non-null   float64
dtypes: float64(5), int64(2), object(3)
memory usage: 530.3+ KB


In [44]:
df.describe()

Unnamed: 0,place_id,price,place_rating,latitude,longitude,user_id,user_rating
count,6171.0,6171.0,6171.0,6168.0,6168.0,5695.0,5695.0
mean,155.843461,18369.470102,4.464382,-7.886243,110.42246,302.040211,3.105004
std,129.421868,53096.784216,0.196374,0.165508,0.137398,173.17677,1.393901
min,1.0,0.0,1.0,-8.200347,109.913062,1.0,1.0
25%,70.0,2000.0,4.4,-8.004554,110.363666,153.0,2.0
50%,136.0,5000.0,4.5,-7.815278,110.418137,301.0,3.0
75%,207.0,10000.0,4.6,-7.791624,110.492651,453.0,4.0
max,762.0,500000.0,5.0,-7.565783,110.720854,600.0,5.0


In [45]:
# The incorrect data type in user_id cannot be handled unless the missing values are addressed first.
# If not, it will raise an error during type conversion.

# Counting the number of missing values in each column of the dataset.
df.isna().sum()

place_id          0
place_name        0
description     332
category        329
price             0
place_rating      0
latitude          3
longitude         3
user_id         476
user_rating     476
dtype: int64

In [46]:
# Missing text values cannot be handled computationally, so they need to be filled with specific values.
df["category"] = df["category"].fillna("Tidak diketahui")
df["description"] = df["description"].fillna("Deskripsi tidak tersedia.")
df.isna().sum()

place_id          0
place_name        0
description       0
category          0
price             0
place_rating      0
latitude          3
longitude         3
user_id         476
user_rating     476
dtype: int64

In [47]:
# For latitude and longitude, since there are only 3 missing values, the rows with the missing values will be dropped.
df.dropna(subset=["latitude", "longitude"], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6168 entries, 0 to 6216
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   place_id      6168 non-null   int64  
 1   place_name    6168 non-null   object 
 2   description   6168 non-null   object 
 3   category      6168 non-null   object 
 4   price         6168 non-null   int64  
 5   place_rating  6168 non-null   float64
 6   latitude      6168 non-null   float64
 7   longitude     6168 non-null   float64
 8   user_id       5695 non-null   float64
 9   user_rating   5695 non-null   float64
dtypes: float64(5), int64(2), object(3)
memory usage: 530.1+ KB


In [48]:
df.isna().sum()

place_id          0
place_name        0
description       0
category          0
price             0
place_rating      0
latitude          0
longitude         0
user_id         473
user_rating     473
dtype: int64

In [49]:
# Since user IDs are unique, their missing values need to be assigned by new values.

# Step 1: Find the current maximum value of user_id
current_max_user_id = int(df["user_id"].max())
# Step 2: Identify rows with missing user_id
missing_user_ids = df[df["user_id"].isna()]
# Step 3: Generate sequential IDs as floats, starting from max_place_id + 1
seq_user_ids = [float(i) for i in range(current_max_user_id + 1, current_max_user_id + 1 + len(missing_user_ids))]
# Step 4: Fill the missing user_id values with the sequential IDs
df.loc[missing_user_ids.index, "user_id"] = seq_user_ids

df.describe()

Unnamed: 0,place_id,price,place_rating,latitude,longitude,user_id,user_rating
count,6168.0,6168.0,6168.0,6168.0,6168.0,6168.0,5695.0
mean,155.781453,18373.216602,4.464446,-7.886243,110.42246,343.064202,3.105004
std,129.422774,53109.253519,0.196348,0.165508,0.137398,222.230647,1.393901
min,1.0,0.0,1.0,-8.200347,109.913062,1.0,1.0
25%,70.0,2000.0,4.4,-8.004554,110.363666,164.0,2.0
50%,136.0,5000.0,4.5,-7.815278,110.418137,326.0,3.0
75%,207.0,10000.0,4.6,-7.791624,110.492651,488.0,4.0
max,762.0,500000.0,5.0,-7.565783,110.720854,1073.0,5.0


In [50]:
df.isna().sum()

place_id          0
place_name        0
description       0
category          0
price             0
place_rating      0
latitude          0
longitude         0
user_id           0
user_rating     473
dtype: int64

In [51]:
# With all missing values in user_id handled, the data type can now be safely converted to int64.
df = df.astype({"user_id": "int64"})
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6168 entries, 0 to 6216
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   place_id      6168 non-null   int64  
 1   place_name    6168 non-null   object 
 2   description   6168 non-null   object 
 3   category      6168 non-null   object 
 4   price         6168 non-null   int64  
 5   place_rating  6168 non-null   float64
 6   latitude      6168 non-null   float64
 7   longitude     6168 non-null   float64
 8   user_id       6168 non-null   int64  
 9   user_rating   5695 non-null   float64
dtypes: float64(4), int64(3), object(3)
memory usage: 659.1+ KB


In [52]:
# Assigning arbitrary values to missing user ratings may distort the accuracy of place ratings.

# To maintain accuracy, place ratings are calculated using the mean of available user ratings for each place_id.
# Calculate mean of user_rating for each place_id, ignoring NaN.
average_ratings = df.groupby("place_id")["user_rating"].mean()
# Map the average user rating to place_rating.
new_df = df.copy()
new_df["place_rating"] = new_df["place_id"].map(average_ratings)

new_df.head()

Unnamed: 0,place_id,place_name,description,category,price,place_rating,latitude,longitude,user_id,user_rating
0,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,2,4.0
1,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,23,4.0
2,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,25,2.0
3,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,39,5.0
4,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,43,4.0


In [53]:
# Now that the place ratings are handled, missing user ratings can be filled using the place ratings.

new_df.isna().sum()

place_id          0
place_name        0
description       0
category          0
price             0
place_rating    473
latitude          0
longitude         0
user_id           0
user_rating     473
dtype: int64

In [54]:
# The recent missing values indicate that some places have not yet been rated by users. 
# However, the previous dataframe contains place ratings, which can be used to fill the missing values.
new_df["place_rating"] = new_df["place_rating"].fillna(df["place_rating"]) 
new_df["user_rating"] = new_df["user_rating"].fillna(new_df["place_rating"])
new_df.isna().sum()

place_id        0
place_name      0
description     0
category        0
price           0
place_rating    0
latitude        0
longitude       0
user_id         0
user_rating     0
dtype: int64

In [55]:
# Since the application's features require two languages (Bahasa Indonesia and English), 
# the current place descriptions and categories need to be translated into English.

# ID descriptions and categories.
new_df["description_id"] = new_df["description"]
new_df["category_id"] = new_df["description"]
new_df.head()

Unnamed: 0,place_id,place_name,description,category,price,place_rating,latitude,longitude,user_id,user_rating,description_id,category_id
0,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,2,4.0,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...
1,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,23,4.0,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...
2,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,25,2.0,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...
3,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,39,5.0,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...
4,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,43,4.0,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...


In [56]:
# Function to handle error when translating.
def translate_batch(batch_new_df, column_name):
    translated_texts = []
    for text in batch_new_df[column_name]:
        try:
            translated = GoogleTranslator(source="id", target="en").translate(text)
            translated_texts.append(translated)  # Directly append translated text
        except Exception as e:
            print(f"Error translating: {text}. Error: {e}")
            translated_texts.append(text)  # Saving original text if error occurs
    return translated_texts

In [57]:
# Dividing dataset into batches.
batch_size = 1000
batches = [new_df.iloc[i:i + batch_size] for i in range(0, len(new_df), batch_size)]

In [58]:
# Columns to translate.
columns_to_translate = ["description_id", "category_id"]
translated_columns = {col: [] for col in columns_to_translate}

In [59]:
# Processing each batch
for i, batch in enumerate(batches):
    print(f"Translating batch {i + 1} of {len(batches)}")
    for col in columns_to_translate:
        translated_columns[col].extend(translate_batch(batch, col))

Translating batch 1 of 7
Translating batch 2 of 7
Translating batch 3 of 7
Translating batch 4 of 7
Translating batch 5 of 7
Translating batch 6 of 7
Translating batch 7 of 7


In [60]:
# EN descriptions and categories.
for col in columns_to_translate:
    new_col_name = col.replace("_id", "_en")
    new_df[new_col_name] = translated_columns[col]
    
new_df.head()

Unnamed: 0,place_id,place_name,description,category,price,place_rating,latitude,longitude,user_id,user_rating,description_id,category_id,description_en,category_en
0,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,2,4.0,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,"Taman Pintar Yogyakarta (Javanese: Hanacaraka,...","Taman Pintar Yogyakarta (Javanese: Hanacaraka,..."
1,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,23,4.0,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,"Taman Pintar Yogyakarta (Javanese: Hanacaraka,...","Taman Pintar Yogyakarta (Javanese: Hanacaraka,..."
2,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,25,2.0,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,"Taman Pintar Yogyakarta (Javanese: Hanacaraka,...","Taman Pintar Yogyakarta (Javanese: Hanacaraka,..."
3,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,39,5.0,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,"Taman Pintar Yogyakarta (Javanese: Hanacaraka,...","Taman Pintar Yogyakarta (Javanese: Hanacaraka,..."
4,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,6000,2.72,-7.800671,110.367655,43,4.0,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,"Taman Pintar Yogyakarta (Javanese: Hanacaraka,...","Taman Pintar Yogyakarta (Javanese: Hanacaraka,..."


In [61]:
# Since the dataframe already contains the "description_id", "category_id", "description_en", and "category_en" columns, 
# the "description" and "category" columns are no longer needed.

new_df = new_df[[
    "place_id", "place_name", 
    "description_id", "category_id", 
    "description_en", "category_en", 
    "price", "place_rating", 
    "latitude", "longitude", 
    "user_id", "user_rating"
]]
new_df.head()

Unnamed: 0,place_id,place_name,description_id,category_id,description_en,category_en,price,place_rating,latitude,longitude,user_id,user_rating
0,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,"Taman Pintar Yogyakarta (Javanese: Hanacaraka,...","Taman Pintar Yogyakarta (Javanese: Hanacaraka,...",6000,2.72,-7.800671,110.367655,2,4.0
1,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,"Taman Pintar Yogyakarta (Javanese: Hanacaraka,...","Taman Pintar Yogyakarta (Javanese: Hanacaraka,...",6000,2.72,-7.800671,110.367655,23,4.0
2,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,"Taman Pintar Yogyakarta (Javanese: Hanacaraka,...","Taman Pintar Yogyakarta (Javanese: Hanacaraka,...",6000,2.72,-7.800671,110.367655,25,2.0
3,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,"Taman Pintar Yogyakarta (Javanese: Hanacaraka,...","Taman Pintar Yogyakarta (Javanese: Hanacaraka,...",6000,2.72,-7.800671,110.367655,39,5.0
4,1,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,"Taman Pintar Yogyakarta (Javanese: Hanacaraka,...","Taman Pintar Yogyakarta (Javanese: Hanacaraka,...",6000,2.72,-7.800671,110.367655,43,4.0


In [62]:
# Last check of data information.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6168 entries, 0 to 6216
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   place_id        6168 non-null   int64  
 1   place_name      6168 non-null   object 
 2   description_id  6168 non-null   object 
 3   category_id     6168 non-null   object 
 4   description_en  6168 non-null   object 
 5   category_en     6168 non-null   object 
 6   price           6168 non-null   int64  
 7   place_rating    6168 non-null   float64
 8   latitude        6168 non-null   float64
 9   longitude       6168 non-null   float64
 10  user_id         6168 non-null   int64  
 11  user_rating     6168 non-null   float64
dtypes: float64(4), int64(3), object(5)
memory usage: 755.5+ KB


In [63]:
# Last check of data description.
new_df.describe()

Unnamed: 0,place_id,price,place_rating,latitude,longitude,user_id,user_rating
count,6168.0,6168.0,6168.0,6168.0,6168.0,6168.0,6168.0
mean,155.781453,18373.216602,3.207863,-7.886243,110.42246,343.064202,3.207863
std,129.422774,53109.253519,0.465079,0.165508,0.137398,222.230647,1.390554
min,1.0,0.0,1.0,-8.200347,109.913062,1.0,1.0
25%,70.0,2000.0,2.933333,-8.004554,110.363666,164.0,2.0
50%,136.0,5000.0,3.125,-7.815278,110.418137,326.0,3.0
75%,207.0,10000.0,3.333333,-7.791624,110.492651,488.0,4.4
max,762.0,500000.0,5.0,-7.565783,110.720854,1073.0,5.0


## Exporting the Preprocessed Data

From the last check, the final dataframe is ensured to have correct values, be consistent, and be ready for use in a machine learning model.  
Hence, it's time to export it to a CSV file.

In [64]:
# Exporting the cleaned dataframe to a CSV file.
new_df.to_csv("data.csv", index=False)