In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from tabulate import tabulate

In [None]:
# Read the destinations CSV into a DataFrame and preview the first rows.
DATASET_PATH = '/content/dataset_dengan_kategori_diperbaiki (2).csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,nama_destinasi,alamat,website,google_link,longitude,latitude,review_total,average_rating,letak_provinsi,kategori
0,Pantai Sambolo Anyer,"Jl. Raya Anyer-Sirih, Bandulu, Banten 42112",Tidak Tersedia,https://www.google.com/maps/place/Pantai+Sambo...,105.882,-6.099,4721.0,4.2,Banten,Wisata Alam
1,Tanjung Pasir Beach,"Tanjung Pasir, Teluknaga, Tangerang Regency, B...",Tidak Tersedia,https://www.google.com/maps/place/Tanjung+Pasi...,106.679,-6.014,6502.0,4.0,Banten,Wisata Alam
2,Pantai Bagedur,"Sukamanah, Malingping, Lebak Regency, Banten 4...",Tidak Tersedia,https://www.google.com/maps/place/Pantai+Baged...,105.992,-6.814,2139.0,4.3,Banten,Wisata Alam
3,Tanjung Lesung Beach,Banten,Tidak Tersedia,https://www.google.com/maps/place/Tanjung+Lesu...,105.656,-6.479,2187.0,4.4,Banten,Wisata Alam
4,Pantai Pasir Putih Sirih,"Jl. Raya Karang Bolong, Kamasan, Kec. Cinangka...",Tidak Tersedia,https://www.google.com/maps/place/Pantai+Pasir...,105.879,-6.112,3739.0,4.3,Banten,Wisata Alam


In [None]:
# CLEANING DATA

# Inspect dtypes / non-null counts, per-column missing values, and duplicate rows.
df.info()
# Only the last bare expression of a cell is rendered, so the null counts are
# printed explicitly; as a bare mid-cell expression the result was discarded.
print(df.isna().sum())
print("jumlah duplikasi: ", df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1843 entries, 0 to 1842
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   nama_destinasi  1843 non-null   object 
 1   alamat          1843 non-null   object 
 2   website         1843 non-null   object 
 3   google_link     1843 non-null   object 
 4   longitude       1842 non-null   float64
 5   latitude        1842 non-null   object 
 6   review_total    1832 non-null   float64
 7   average_rating  1832 non-null   float64
 8   letak_provinsi  1843 non-null   object 
 9   kategori        1843 non-null   object 
dtypes: float64(3), object(7)
memory usage: 144.1+ KB
jumlah duplikasi:  0


Terdapat missing value pada kolom `longitude`, `latitude`, `review_total`, dan `average_rating`. Selain itu, kolom `latitude` bertipe `object`, padahal seharusnya numerik (`float64`).

In [None]:
# Drop every row that has a missing value in any column. Per the df.info()
# output above, the nulls are in longitude, latitude, review_total and
# average_rating (website has none).
df = df.dropna(axis=0)
# Print the remaining null counts explicitly: a bare expression that is not the
# last line of the cell is evaluated but never displayed.
print(df.isna().sum())
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 1830 entries, 0 to 1842
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   nama_destinasi  1830 non-null   object 
 1   alamat          1830 non-null   object 
 2   website         1830 non-null   object 
 3   google_link     1830 non-null   object 
 4   longitude       1830 non-null   float64
 5   latitude        1830 non-null   object 
 6   review_total    1830 non-null   float64
 7   average_rating  1830 non-null   float64
 8   letak_provinsi  1830 non-null   object 
 9   kategori        1830 non-null   object 
dtypes: float64(3), object(7)
memory usage: 157.3+ KB


In [None]:
# Convert latitude from object dtype to float, keeping the column in place.
df = df.assign(latitude=df["latitude"].astype(float))
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1830 entries, 0 to 1842
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   nama_destinasi  1830 non-null   object 
 1   alamat          1830 non-null   object 
 2   website         1830 non-null   object 
 3   google_link     1830 non-null   object 
 4   longitude       1830 non-null   float64
 5   latitude        1830 non-null   float64
 6   review_total    1830 non-null   float64
 7   average_rating  1830 non-null   float64
 8   letak_provinsi  1830 non-null   object 
 9   kategori        1830 non-null   object 
dtypes: float64(4), object(6)
memory usage: 157.3+ KB


In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,nama_destinasi,alamat,website,google_link,longitude,latitude,review_total,average_rating,letak_provinsi,kategori
0,Pantai Sambolo Anyer,"Jl. Raya Anyer-Sirih, Bandulu, Banten 42112",Tidak Tersedia,https://www.google.com/maps/place/Pantai+Sambo...,105.882,-6.099,4721.0,4.2,Banten,Wisata Alam
1,Tanjung Pasir Beach,"Tanjung Pasir, Teluknaga, Tangerang Regency, B...",Tidak Tersedia,https://www.google.com/maps/place/Tanjung+Pasi...,106.679,-6.014,6502.0,4.0,Banten,Wisata Alam
2,Pantai Bagedur,"Sukamanah, Malingping, Lebak Regency, Banten 4...",Tidak Tersedia,https://www.google.com/maps/place/Pantai+Baged...,105.992,-6.814,2139.0,4.3,Banten,Wisata Alam
3,Tanjung Lesung Beach,Banten,Tidak Tersedia,https://www.google.com/maps/place/Tanjung+Lesu...,105.656,-6.479,2187.0,4.4,Banten,Wisata Alam
4,Pantai Pasir Putih Sirih,"Jl. Raya Karang Bolong, Kamasan, Kec. Cinangka...",Tidak Tersedia,https://www.google.com/maps/place/Pantai+Pasir...,105.879,-6.112,3739.0,4.3,Banten,Wisata Alam


In [None]:
# LINEAR REGRESSION MODEL FOR PROVINCE-BASED DESTINATION RECOMMENDATIONS

# Prepare the training data: add integer-coded columns for province and category.
df['kategori_encoded'] = df['kategori'].astype('category').cat.codes
df['provinsi_encoded'] = df['letak_provinsi'].astype('category').cat.codes

# Select the predictor columns. The target must NOT be one of them: with
# 'average_rating' on both sides the regression just copies its input and
# reports a meaningless perfect score (R² = 1.0). 'review_total' is used as
# the numeric signal instead.
# NOTE(review): the category codes are arbitrary integers; a linear model will
# treat them as ordered values - consider one-hot encoding if that matters.
features = ['provinsi_encoded', 'kategori_encoded', 'review_total']
target = 'average_rating'

In [None]:
# Separate predictors from the target, then hold out 20% of the rows for
# evaluation (fixed seed so the split is reproducible).
X, y = df[features], df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit a linear regression on the training split (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and compute both metrics before reporting.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R² Score:", r2)

Mean Squared Error: 1.8697727739869611e-31
R² Score: 1.0


In [None]:
def recommend_places(province, category, data=None):
    """Return the destinations of one province and category, best-rated first.

    Args:
        province: Value compared for equality against the ``letak_provinsi`` column.
        category: Value compared for equality against the ``kategori`` column.
        data: DataFrame to search. When omitted, the notebook-level ``df`` is
            used; passing a frame makes the call independent of global state.

    Returns:
        A DataFrame with the columns ``nama_destinasi``, ``kategori`` and
        ``average_rating``, ordered by ``average_rating`` and then
        ``review_total``, both descending. The frame is empty when nothing
        matches, so callers can always check ``.empty``.
    """
    source = df if data is None else data

    # Keep only the rows that match both the province and the category.
    matches = source[(source['letak_provinsi'] == province) & (source['kategori'] == category)]

    # No special case for "nothing found": sorting and projecting an empty
    # selection yields an empty frame with the same columns. Returning a string
    # here would break callers that test `.empty` on the result.
    ranked = matches.sort_values(by=['average_rating', 'review_total'], ascending=[False, False])

    # Expose only the columns needed for display.
    return ranked[['nama_destinasi', 'kategori', 'average_rating']]

In [None]:
# Look up recommendations for one province / category pair.
# NOTE: recommend_places filters and sorts df directly; the fitted regression
# model is not consulted here.
province, category = "Jawa Timur", "Wisata Alam"
recommendations = recommend_places(province, category)

# Show the ten best matches, or a notice when the lookup comes back empty.
if not recommendations.empty:
    print("Recommended places (most popular):")
    print(recommendations.head(10))
else:
    print("No matching places found.")

Recommended places (most popular):
                          nama_destinasi     kategori  average_rating
1842                     3SECOND JOMBANG  Wisata Alam             5.0
1698                  Desa Wisata Keling  Wisata Alam             5.0
1678  Bromo Tengger Semeru National Park  Wisata Alam             4.8
1746                Alun - Alun Surabaya  Wisata Alam             4.8
1637               Tumpak Sewu Waterfall  Wisata Alam             4.8
1811                   Tunjungan Plaza 4  Wisata Alam             4.8
1730                   Tunjungan Romansa  Wisata Alam             4.8
1652     Wisata Alam Sumber Jiput Kediri  Wisata Alam             4.8
1733         Kota Wisata Batu jawa timur  Wisata Alam             4.8
1771                     Tunjungan Plaza  Wisata Alam             4.7
