In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
import random

In [3]:
data = pd.read_csv('hotel.csv')
data.head()

Unnamed: 0,nama,alamat,deskripsi
0,Capital O 253 Topas Galeria Hotel,"Jl. Dr. Djundjunan No. 153, 40173 Bandung, Ind...",Berjarak 10 menit berkendara dari Bandara Inte...
1,Sheraton Bandung Hotel & Towers,"Jl. Ir H Juanda 390, 40135 Bandung, Indonesia",Sheraton Hotel & Towers menawarkan akomodasi b...
2,OYO 794 Ln 9 Bandung Residence,"Jalan Lemahnendeut No 9, Sukajadi, 40164 Bandu...","Berlokasi nyaman di Sukajadi, Bandung, OYO 794..."
3,OYO 226 LJ hotel,"Jl. Malabar No.2, Malabar, Lengkong, Dago, Asi...","OYO 226 LJ hotel di Bandung, Jawa Barat, tepat..."
4,OYO 230 Maleo Residence,"JI. Dangeur Indah II No. 15, Sukagalih, Sukaja...",OYO 230 Maleo Residence menawarkan akomodasi b...


In [4]:
data.describe()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   nama       90 non-null     object
 1   alamat     90 non-null     object
 2   deskripsi  90 non-null     object
dtypes: object(3)
memory usage: 2.2+ KB


In [5]:
clean_spcl = re.compile('[/(){}\[\]\|@,;]')
clean_symbol = re.compile('[^0-9a-z #+_]')
stopworda = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() 
    text = clean_spcl.sub(' ', text)
    text = clean_symbol.sub('', text)
    text = ' '.join(word for word in text.split() if word not in stopworda) # hapus stopword dari kolom deskripsi
    return text

# Buat kolom tambahan untuk data description yang telah dibersihkan   
data['deskripsi_new'] = data['deskripsi'].apply(clean_text)

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Danny\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
def pt_desc(index):
    example = data[data.index == index][['deskripsi_new', 'nama', 'alamat']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Nama:', example[1])
        print('Alamat:', example[2])   

In [8]:
pt_desc(1)

sheraton hotel towers menawarkan akomodasi bintang 5 di tengah lanskap hijau di bandung semua kamar yang luas dilengkapi tv kabel layar datar hotel ini menawarkan kolam renang outdoor pusat spa dan restoran dengan pemandangan gunung akses wifi tersedia gratis di seluruh area hotel kamarkamar elegan memiliki interior modern perabotan dari kayu ringan dan jendelajendela besar masingmasing menyediakan area tempat duduk yang nyaman pemutar dvd dan kamar mandi pribadi dengan shower anda dapat berolahraga di gym atau menikmati perawatan tubuh di spa staf resepsionis siap melayani kebutuhan anda selama 24 jam aneka hidangan internasional dan asia ditawarkan di feast restaurant sementara minuman ringan disajikan di samsara lounge berbagai koktail dan makanan ringan juga tersedia di poolside terrace sheraton bandung hotel towers berselang 10 menit berkendara dari juanda culture park dan daerah dago tempat berbagai factory outlet bandara husein sastranegara dapat ditempuh dengan 30 menit berkend

In [9]:
data.set_index('nama', inplace=True)
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(data['deskripsi_new'])
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_sim

array([[1.        , 0.07106689, 0.03075961, ..., 0.07474134, 0.0732575 ,
        0.01680878],
       [0.07106689, 1.        , 0.03508807, ..., 0.05947269, 0.08705608,
        0.01986701],
       [0.03075961, 0.03508807, 1.        , ..., 0.09113962, 0.05879732,
        0.06808138],
       ...,
       [0.07474134, 0.05947269, 0.09113962, ..., 1.        , 0.06321301,
        0.02205802],
       [0.0732575 , 0.08705608, 0.05879732, ..., 0.06321301, 1.        ,
        0.02245328],
       [0.01680878, 0.01986701, 0.06808138, ..., 0.02205802, 0.02245328,
        1.        ]])

In [10]:
indices = pd.Series(data.index)
indices[:50]

0                Capital O 253 Topas Galeria Hotel
1                  Sheraton Bandung Hotel & Towers
2                   OYO 794 Ln 9 Bandung Residence
3                                 OYO 226 LJ hotel
4                          OYO 230 Maleo Residence
5                        OYO 167 Dago's Hill Hotel
6                   OYO 794 Ln 9 Bandung Residence
7                       OYO 196 Horizone Residence
8     OYO 483 Flagship Tamansari Panoramic Bandung
9               OYO 295 Grha Ciumbuleuit Residence
10                            OYO 193 SM Residence
11              Capital O 874 Hotel Nyland Pasteur
12                            OYO 352 Hotel Sabang
13                                  Hilton Bandung
14             InterContinental Bandung Dago Pakar
15                                Aryaduta Bandung
16               Art Deco Luxury Hotel & Residence
17                            Crowne Plaza Bandung
18          Best Western Premier La Grande Bandung
19                         éL H

In [19]:
def rekomendasi(nama, cos_sim = cos_sim):
    
    rec = []
    
    idx = indices[indices == nama].index[0]

    score_series = pd.Series(cos_sim[idx]).sort_values(ascending = False)

    top_10_indexes = list(score_series.iloc[1:11].index)
    
    for i in top_10_indexes:
        recommended_news.append(list(data.index)[i])
        
    return rec

In [20]:
rekomendasi('kolam renang')

IndexError: index 0 is out of bounds for axis 0 with size 0