In [None]:
#Import Library
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the hotel dataset; the path is relative to the notebook's working directory
DATASET_PATH = "Hotel Saudi Arabia.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

FileNotFoundError: ignored

In [None]:
# Analyse the dataset
# info() prints its report directly, so it can run first ...
df.info()
# ... while describe() only renders when it is the cell's last expression
# (previously its result was silently discarded).
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2375 entries, 0 to 2374
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   hotel_name   2375 non-null   object 
 1   description  2375 non-null   object 
 2   review       2186 non-null   object 
 3   score        2186 non-null   float64
 4   country      2375 non-null   object 
 5   city         2375 non-null   object 
dtypes: float64(1), object(5)
memory usage: 111.5+ KB


In [None]:
# Function to display a hotel's description, name and city
def print_description(index):
    """Print the description, name and city of the hotel whose row label is ``index``.

    Reads the module-level ``df``, which must still expose ``description``,
    ``hotel_name`` and ``city`` as columns. When no row carries the requested
    label nothing is printed.

    Args:
        index: Row label to look up in ``df.index``.
    """
    matches = df.loc[df.index == index, ['description', 'hotel_name', 'city']]
    # Test for a match *before* taking the first row: indexing an empty result
    # raises IndexError, and checking the length of a row only counts columns.
    if matches.empty:
        return
    description, hotel_name, city = matches.iloc[0]
    print(description)
    print('Nama:', hotel_name)
    print('Kota:', city)

In [None]:
# Check: print the description, name and city for row label 1
print_description(1)

نزل مينا 110 has well-equipped accommodations featuring free WiFi in Riyadh, 2.2 miles from Riyadh Gallery Mall and 2.6 miles from Panorama Mall.
Nama: نزل مينا 110
Kota: Riyadh


In [None]:
# Check: print the description, name and city for row label 50
print_description(50)

Located in Riyadh and with King Khalid Grand Mosque reachable within 5 miles, Marriott Riyadh Diplomatic Quarter provides express check-in and check-out, allergy-free rooms, an outdoor swimming pool,...
Nama: Marriott Riyadh Diplomatic Quarter
Kota: Riyadh


In [None]:
# Preprocessing
# (`re` and `nltk` are imported in the import cell at the top of the notebook.)
# Fetch the NLTK English stop-word corpus used below.
nltk.download('stopwords')
# Raw strings: `\[`, `\]` and `\|` are regex escapes, not Python string escapes;
# without the r-prefix modern Python warns about invalid escape sequences.
clean_spcl = re.compile(r'[/(){}\[\]\|@,;]')    # punctuation that becomes a space
clean_symbol = re.compile(r'[^0-9a-z #+_]')     # any other character is removed
stopworda = set(stopwords.words('english'))     # set for O(1) membership tests

def clean_text(text):
    """Normalise a free-text description for TF-IDF vectorisation.

    Steps: lowercase, replace the punctuation matched by ``clean_spcl`` with a
    space, delete every character matched by ``clean_symbol``, then drop the
    words contained in ``stopworda``.

    Args:
        text: The raw text. Non-string values (e.g. NaN for a missing cell)
            are treated as empty text instead of raising AttributeError.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()  # convert to lowercase
    text = clean_spcl.sub(' ', text)
    text = clean_symbol.sub('', text)
    # remove stop words
    return ' '.join(word for word in text.split() if word not in stopworda)

# Add a column holding the cleaned version of each description
cleaned_descriptions = df['description'].apply(clean_text)
df['desc_clean'] = cleaned_descriptions

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Second description function (after preprocessing)
def print_description_clean(index):
    """Print the cleaned description, name and city of the hotel at row label ``index``.

    Reads the module-level ``df``, which must expose ``desc_clean``,
    ``hotel_name`` and ``city`` as columns. When no row carries the requested
    label nothing is printed.

    Args:
        index: Row label to look up in ``df.index``.
    """
    matches = df.loc[df.index == index, ['desc_clean', 'hotel_name', 'city']]
    # Test for a match *before* taking the first row: indexing an empty result
    # raises IndexError, and checking the length of a row only counts columns.
    if matches.empty:
        return
    desc_clean, hotel_name, city = matches.iloc[0]
    print(desc_clean)
    print('Nama:', hotel_name)
    print('Kota:', city)

In [None]:
# Use TF-IDF and cosine similarity to turn the cleaned descriptions into a numeric matrix
# Guard the re-index so the cell can be re-run: once 'hotel_name' is the index it
# is no longer a column and a second set_index() would raise KeyError.
if df.index.name != 'hotel_name':
    df.set_index('hotel_name', inplace=True)
# min_df=1 keeps every term, exactly like the former min_df=0, but is accepted by
# scikit-learn's parameter validation (an integer min_df must be >= 1).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['desc_clean'])
# Pairwise similarity between all descriptions; row i / column j follow df's row order.
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_sim

array([[1.00000000e+00, 3.63352306e-03, 1.00963091e-02, ...,
        4.75578236e-03, 8.15719055e-03, 7.23093794e-04],
       [3.63352306e-03, 1.00000000e+00, 1.04918507e-02, ...,
        3.15183127e-03, 1.60300920e-02, 3.94651586e-03],
       [1.00963091e-02, 1.04918507e-02, 1.00000000e+00, ...,
        4.05285515e-03, 0.00000000e+00, 0.00000000e+00],
       ...,
       [4.75578236e-03, 3.15183127e-03, 4.05285515e-03, ...,
        1.00000000e+00, 7.72778650e-02, 7.12976957e-02],
       [8.15719055e-03, 1.60300920e-02, 0.00000000e+00, ...,
        7.72778650e-02, 1.00000000e+00, 1.41175201e-01],
       [7.23093794e-04, 3.94651586e-03, 0.00000000e+00, ...,
        7.12976957e-02, 1.41175201e-01, 1.00000000e+00]])

In [None]:
# Build a position -> hotel name lookup from the DataFrame index for the recommender
indices = pd.Series(df.index)
indices.head(50)

0                     Reef Al Malaz Hotel International
1                                          نزل مينا 110
2                      Hilton Riyadh Hotel & Residences
3                               Executives Hotel / KAFD
4                                          OYO 359 Amar
5                         Capital O 419 Al Safeer Hotel
6                                     Rose Garden Hotel
7     Sama Al Nakheel Furnished Apartments-Families ...
8                                 Centro Waha by Rotana
9                      OYO 150 Al Hamra Palace Al Aswaq
10                   Courtyard by Marriott Riyadh Olaya
11                        الغرف الهادئة للوحدات السكنية
12                                    Almakan Hotel 108
13                                          Awrad Royal
14                                 Al Waha Palace Hotel
15                           Capital O 162 Brzeen Hotel
16                                    Almakan Hotel 105
17                        Elite Suites Hotel - A

In [None]:
#Modelling
def recommendations(name, cos_sim=None, top_n=10):
    """Return the hotels whose descriptions are most similar to ``name``.

    Args:
        name: Hotel name to look up in the module-level ``indices`` Series.
            If the name occurs more than once, the first occurrence is used.
        cos_sim: Square similarity matrix whose rows/columns follow the order
            of ``indices``. When omitted, the notebook's current global
            ``cos_sim`` is looked up at call time, so a recomputed matrix is
            picked up without redefining this function.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A list of hotel names, most similar first, never containing the
        queried row itself.

    Raises:
        IndexError: If ``name`` does not occur in ``indices``.
    """
    if cos_sim is None:
        # The parameter shadows the global of the same name, hence globals().
        cos_sim = globals()['cos_sim']

    # Find the row position of the hotel via the `indices` lookup
    matches = indices[indices == name].index
    if len(matches) == 0:
        raise IndexError(f"hotel {name!r} not found in indices")
    idx = matches[0]

    # Build a series of similarity scores. Drop the queried row explicitly
    # rather than assuming it sorts first: another hotel with an identical
    # description also scores 1.0 and may be ordered ahead of it.
    # mergesort is stable, so tied scores keep their original row order.
    score_series = (
        pd.Series(cos_sim[idx])
        .drop(idx)
        .sort_values(ascending=False, kind='mergesort')
    )

    # Take the best `top_n` positions and map them back to hotel names
    top_indexes = score_series.iloc[:top_n].index
    return [indices.iloc[i] for i in top_indexes]

In [None]:
# Check the hotel recommendations
recommendations('Hilton Riyadh Hotel & Residences')

['Jeddah Hilton',
 'Rahty Home Hotel Apartments',
 'Ray Kady Hotel Apartment 1',
 'Dorar Darea Hotel Apartments - Al Mughrizat',
 'Goot Resorts',
 'Rayatna For Furnished Apartments 3',
 'Taleen Granada hotel apartments',
 'Duset Hotel Suites',
 'Hawraa Ishbeelyah (Families Only)',
 'Narcissus Hotel and SPA Riyadh']

In [None]:
# Evaluation
# The function is spelled `classification_report` (the extra "c" caused the ImportError).
from sklearn.metrics import classification_report

# `recommended_hotel` only exists inside recommendations(); capture the returned list here.
recommended_hotel = recommendations('Hilton Riyadh Hotel & Residences')

# classification_report(y_true, y_pred) compares predicted labels with ground-truth
# labels. A single list of recommended names is not a valid input, and this dataset
# has no relevance labels to serve as y_true.
# TODO: build a labelled relevance set, then report with
#       classification_report(y_true, y_pred).
recommended_hotel

ImportError: ignored