# **Preprocessing Data**

In [420]:
#import library
import pandas as pd
import numpy as np
import re
import ast
from haversine import haversine, Unit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib


In [None]:
#load data
data = ("./data/Dataset_Resto_Unair.csv")
df = pd.read_csv(data)

In [422]:
df.head()

Unnamed: 0,title,address,totalScore,categories,location
0,Bakso CITRA SATU PUTRA,"Jl. Raya Mulyosari No.177-179, Kalisari, Kec. ...",4.7,['Restoran Bakso'],"{'lat': -7.2707337, 'lng': 112.7969433}"
1,Bakso asli solo Pak Tono,"Jl. Gebang Lor No.49, Gebang Putih, Kec. Sukol...",4.5,['Restoran Bakso'],"{'lat': -7.2812233, 'lng': 112.7873977}"
2,Bakso Pak Anton,"Jl. Wisma Permai Tengah I No.CC-21, Kejawaan P...",4.8,"['Restoran Bakso', 'Rumah Makan']","{'lat': -7.2741205, 'lng': 112.7868231}"
3,Bakso Kepala Sapi,"Jl. Klampis Jaya No.47, Klampis Ngasem, Kec. S...",4.3,['Restoran Bakso'],"{'lat': -7.28232, 'lng': 112.7763487}"
4,Warung Bakso Dan Mie Wijaya,"Jl. Mulyorejo Tengah No.73, Mulyorejo, Kec. Mu...",4.5,"['Restoran Bakso', 'Restoran Mie']","{'lat': -7.2678847, 'lng': 112.7780222}"


In [423]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681 entries, 0 to 680
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       681 non-null    object 
 1   address     681 non-null    object 
 2   totalScore  681 non-null    float64
 3   categories  681 non-null    object 
 4   location    681 non-null    object 
dtypes: float64(1), object(4)
memory usage: 26.7+ KB


In [424]:
df.isnull().sum()

title         0
address       0
totalScore    0
categories    0
location      0
dtype: int64

In [425]:
df.duplicated().sum()

0

In [426]:
df = df.drop_duplicates()

In [427]:
df.describe()

Unnamed: 0,totalScore
count,681.0
mean,4.569457
std,0.366551
min,1.8
25%,4.4
50%,4.6
75%,4.8
max,5.0


# Preprocessing Data Categories

In [428]:
df['categories']

0                      ['Restoran Bakso']
1                      ['Restoran Bakso']
2       ['Restoran Bakso', 'Rumah Makan']
3                      ['Restoran Bakso']
4      ['Restoran Bakso', 'Restoran Mie']
                      ...                
676                              ['Kafe']
677                        ['Kedai Kopi']
678                        ['Kedai Kopi']
679                        ['Kedai Kopi']
680                        ['Kedai Kopi']
Name: categories, Length: 681, dtype: object

In [429]:
def remove_symbol(text):
    joined = ' '.join(text)
    cleaned_text = re.sub(r'[^A-Za-z\s]','',text)
    return cleaned_text

In [430]:
df['categories'] = df['categories'].apply(remove_symbol)

In [431]:
df['title'].tolist()

['Bakso CITRA SATU PUTRA',
 'Bakso asli solo Pak Tono',
 'Bakso Pak Anton',
 'Bakso Kepala Sapi',
 'Warung Bakso Dan Mie Wijaya',
 'Bakso & Mie Bondowoso',
 'Bakso Pangsit Mie Ayam Sensasi Rasa',
 'Bakso Dan Mie Ayam "Barokah"',
 'Bakso Solo Goyang Lidah (GOLI)',
 'Bakso Bem',
 'Bakso Solo Cak Awi',
 'Bakso jumbo sinar baru 2',
 'Bakso bratang Pindahan (ITATS)',
 'Warung LA',
 'Warung Kampus',
 'Warung Mbak Khol',
 'Warung Pak No Lamongan. ( Penyetan & Nasi Goreng )',
 'Warung Bagus',
 'Warung Makan Bu Yati',
 'Warung Sultan',
 'Warung Emak Murah Enak',
 'Warung Cak Nari Munos',
 'Warung Mbak Sieh',
 'Warung Penyetan Pak Kumis',
 'Warung Cak Di',
 'Warung Bu Warni Surabaya ( Nasi Petis-Specialist Cumi Hitam)',
 'Warung Satria',
 'Warung Mami 88',
 'Warung LA 2',
 'Bakso Daging Sapi Pak Sabar',
 'Bakso Bonnet',
 'Bakso Pak Djo',
 'Bakso Daging Sapi P.Sabar',
 'Bakso Pak War',
 'Bakso Pak Pek',
 'Mie Ayam & Bakso Solo',
 'Warung Tegal Gebang',
 'Warung Fadhila',
 'Warung Pak Hasyim',
 'W

# preprocessing data jarak

In [432]:
#Install library untuk hitung jarak dari kampus
!pip install haversine



You should consider upgrading via the 'C:\Users\LENOVO\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [433]:
df['location'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 681 entries, 0 to 680
Series name: location
Non-Null Count  Dtype 
--------------  ----- 
681 non-null    object
dtypes: object(1)
memory usage: 5.4+ KB


In [434]:
df['location'] = df['location'].apply(ast.literal_eval)

In [435]:
df['location'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 681 entries, 0 to 680
Series name: location
Non-Null Count  Dtype 
--------------  ----- 
681 non-null    object
dtypes: object(1)
memory usage: 5.4+ KB


In [436]:
#bujur dan lintang kampus a, b, dan c unair
kampus_a = (-7.2755,112.7634)
kampus_b = (-7.2713, 112.7688)
kampus_c = (-7.2697,112.7848)


In [437]:
#hitung jarak dalam satuan km dari kampus c
df['Jarak_KampusC'] = df['location'].apply(lambda x : haversine(kampus_c,( x['lat'], x['lng']), unit = Unit.KILOMETERS), 2)

  df['Jarak_KampusC'] = df['location'].apply(lambda x : haversine(kampus_c,( x['lat'], x['lng']), unit = Unit.KILOMETERS), 2)


In [438]:
#hitung jarak dalam satuan km dari kampus b
df['Jarak_KampusB'] = df['location'].apply(lambda x : haversine(kampus_b,( x['lat'], x['lng']), unit = Unit.KILOMETERS),2)

  df['Jarak_KampusB'] = df['location'].apply(lambda x : haversine(kampus_b,( x['lat'], x['lng']), unit = Unit.KILOMETERS),2)


In [439]:
#hitung jarak dalam satuan km dari kampus a
df['Jarak_KampusA'] = df['location'].apply(lambda x : haversine(kampus_a,( x['lat'], x['lng']), unit = Unit.KILOMETERS),2)

  df['Jarak_KampusA'] = df['location'].apply(lambda x : haversine(kampus_a,( x['lat'], x['lng']), unit = Unit.KILOMETERS),2)


In [440]:
df

Unnamed: 0,title,address,totalScore,categories,location,Jarak_KampusC,Jarak_KampusB,Jarak_KampusA
0,Bakso CITRA SATU PUTRA,"Jl. Raya Mulyosari No.177-179, Kalisari, Kec. ...",4.7,Restoran Bakso,"{'lat': -7.2707337, 'lng': 112.7969433}",1.344342,3.104870,3.737606
1,Bakso asli solo Pak Tono,"Jl. Gebang Lor No.49, Gebang Putih, Kec. Sukol...",4.5,Restoran Bakso,"{'lat': -7.2812233, 'lng': 112.7873977}",1.312979,2.329260,2.722356
2,Bakso Pak Anton,"Jl. Wisma Permai Tengah I No.CC-21, Kejawaan P...",4.8,Restoran Bakso Rumah Makan,"{'lat': -7.2741205, 'lng': 112.7868231}",0.539819,2.012544,2.588117
3,Bakso Kepala Sapi,"Jl. Klampis Jaya No.47, Klampis Ngasem, Kec. S...",4.3,Restoran Bakso,"{'lat': -7.28232, 'lng': 112.7763487}",1.684682,1.481480,1.617075
4,Warung Bakso Dan Mie Wijaya,"Jl. Mulyorejo Tengah No.73, Mulyorejo, Kec. Mu...",4.5,Restoran Bakso Restoran Mie,"{'lat': -7.2678847, 'lng': 112.7780222}",0.774372,1.085798,1.821619
...,...,...,...,...,...,...,...,...
676,WARKOP 3000,"Jl. Semolowaru No.44, Semolowaru, Kec. Sukolil...",4.4,Kafe,"{'lat': -7.3006098, 'lng': 112.7743827}",3.623989,3.316764,3.043539
677,Warkop 78 & TOKO IVA,"Depan parkiran kampus unair B, Jl. Srikana No....",5.0,Kedai Kopi,"{'lat': -7.2741453, 'lng': 112.7584342}",2.949874,1.186320,0.568064
678,Warkop R-11,"Jl. Semolowaru Elok No.11 blok R, Semolowaru, ...",4.8,Kedai Kopi,"{'lat': -7.307254, 'lng': 112.775239}",4.306918,4.060499,3.764608
679,Warkop DKN,"Jl. Medokan Semampir AWS No.22, RT.001/RW.006,...",4.6,Kedai Kopi,"{'lat': -7.3097995, 'lng': 112.7772405}",4.536154,4.381010,4.108095


In [441]:
df.to_csv('data_resto_with_jarak.csv', index= False)

# **Feature Extraction**

In [442]:
#tf idf untuk  sama catgories
data = df.copy()
tfidf = TfidfVectorizer()
data['combined'] = data['categories'] +' '+ data['title']
data['combined'] = data['combined'].apply(remove_symbol)

joblib.dump(tfidf, 'tfidf.pkl')

tfidf_categories = tfidf.fit_transform(data['combined'])

joblib.dump(tfidf_categories, 'tfidf_matrix.pkl')

['tfidf_matrix.pkl']

In [443]:
data['combined']

0                  Restoran Bakso Bakso CITRA SATU PUTRA
1                Restoran Bakso Bakso asli solo Pak Tono
2             Restoran Bakso Rumah Makan Bakso Pak Anton
3                       Restoran Bakso Bakso Kepala Sapi
4      Restoran Bakso Restoran Mie Warung Bakso Dan M...
                             ...                        
676                                         Kafe WARKOP 
677                         Kedai Kopi Warkop   TOKO IVA
678                                  Kedai Kopi Warkop R
679                                Kedai Kopi Warkop DKN
680                            Kedai Kopi Warkop Lestari
Name: combined, Length: 681, dtype: object

In [444]:
cosin = cosine_similarity(tfidf_categories)
cosin_df = pd.DataFrame(
    cosin,
    index= data['combined'],
    columns= data['combined']
)

joblib.dump(cosin_df, 'cosinesimilarity.pkl')

['cosinesimilarity.pkl']

In [445]:
cosin_df.dtypes

combined
Restoran Bakso Bakso CITRA SATU PUTRA                      float64
Restoran Bakso Bakso asli solo Pak Tono                    float64
Restoran Bakso Rumah Makan Bakso Pak Anton                 float64
Restoran Bakso Bakso Kepala Sapi                           float64
Restoran Bakso Restoran Mie Warung Bakso Dan Mie Wijaya    float64
                                                            ...   
Kafe WARKOP                                                float64
Kedai Kopi Warkop   TOKO IVA                               float64
Kedai Kopi Warkop R                                        float64
Kedai Kopi Warkop DKN                                      float64
Kedai Kopi Warkop Lestari                                  float64
Length: 681, dtype: object

In [446]:
cosin_df.columns = cosin_df.columns.str.strip().str.lower()
cosin_df.index = cosin_df.index.str.strip().str.lower()

cosin_df.index

Index(['restoran bakso bakso citra satu putra',
       'restoran bakso bakso asli solo pak tono',
       'restoran bakso rumah makan bakso pak anton',
       'restoran bakso bakso kepala sapi',
       'restoran bakso restoran mie warung bakso dan mie wijaya',
       'restoran bakso restoran mie bakso  mie bondowoso',
       'restoran mie restoran bakso bakso pangsit mie ayam sensasi rasa',
       'restoran bakso restoran mie bakso dan mie ayam barokah',
       'restoran bakso restoran mie bakso solo goyang lidah goli',
       'restoran bakso bakso bem',
       ...
       'kedai kopi kafe warkop gresik semolowaru', 'kafe warkop gayam',
       'kafe seni warkop  safaraz', 'kedai kopi warkop adi',
       'kedai kopi warkop cak kasnun', 'kafe warkop',
       'kedai kopi warkop   toko iva', 'kedai kopi warkop r',
       'kedai kopi warkop dkn', 'kedai kopi warkop lestari'],
      dtype='object', name='combined', length=681)

In [447]:
def display_name(category_query, similarity_df, item_data,base,  k=5):
    category_query = category_query.strip().lower()
    
    # Cari kolom similarity yang cocok berdasarkan string
    matched_cols = [col for col in similarity_df.columns if category_query in col.lower()]
    
    if not matched_cols:
        print(f"Tidak ditemukan hasil untuk kategori: '{category_query}'")
        return pd.DataFrame()

    selected_col = matched_cols[0]
    similarity = similarity_df[selected_col]

    top_k = similarity.nlargest(k)
    top_k = top_k.groupby(level= 0).max()
    ## print(top_k)

    # Persiapan data
    item_data['categories_cleaned'] = item_data['categories'].apply(remove_symbol)
    item_data['title_cleaned'] = item_data['title'].apply(remove_symbol)
    item_data['combined'] = item_data['categories_cleaned'] + ' ' + item_data['title_cleaned']
    item_data['combined'] = item_data['combined'].str.strip().str.lower()
    ##print(top_k.index.tolist())

    #print(item_data['combined'].tolist()[:10])  # tampilkan 10 data awal

    # Ambil yang cocok berdasarkan index di top_k
    recom = item_data[item_data['combined'].isin(top_k.index)].copy()

    ##print(recom[['combined']])
    #missing_items = set(top_k.index) - set(item_data['combined'])
    #print("Tidak ditemukan di item_data:", missing_items)
    recom['similarity'] = recom['combined'].map(top_k)

    #hitung jarak dari kampus mana
    base_column = f'Jarak_Kampus{base.upper()}'
    if base_column not in recom.columns:
        print(f"Masukkan dari UNAIR Kampus Mana")
    else:
        recom = recom.sort_values(by= [base_column], ascending= True)
    #ambil berdasarkan rating tertinggi dan similarity tertinggi 
    return recom.sort_values(by=[f'{base_column}','totalScore', 'similarity'], ascending=[True, False, False])

    


In [450]:
display_name(
    category_query= 'sate',
    similarity_df= cosin_df,
    item_data= df,
    base= 'C',
    k= 5
)



Unnamed: 0,title,address,totalScore,categories,location,Jarak_KampusC,Jarak_KampusB,Jarak_KampusA,categories_cleaned,title_cleaned,combined,similarity
540,SATE HALIM,"Jl. Manyar Kertoarjo No.114, RT.001/RW.06, Man...",4.7,Restoran Sate,"{'lat': -7.2806375, 'lng': 112.7711573}",1.934818,1.070345,1.028803,Restoran Sate,SATE HALIM,restoran sate sate halim,1.0
576,Sate Madurasa,"Jl. Raya Menur No.5, Mojo, Kec. Gubeng, Suraba...",4.1,Restoran Sate,"{'lat': -7.2753229, 'lng': 112.7625344}",2.534247,0.823236,0.097485,Restoran Sate,Sate Madurasa,restoran sate sate madurasa,0.549985
613,Sate Ayam Sate Kambing Pak Buser,"Jl. Manyar Rejo I No.2, RT.001/RW.05, Menur Pu...",4.8,Restoran Sate,"{'lat': -7.2940424, 'lng': 112.7642162}",3.532852,2.578888,2.063788,Restoran Sate,Sate Ayam Sate Kambing Pak Buser,restoran sate sate ayam sate kambing pak buser,0.551117
541,Raja Sate,"Jl. Semolowaru Tengah I No.62, Semolowaru, Kec...",5.0,Restoran Sate,"{'lat': -7.3039676, 'lng': 112.7787338}",3.868686,3.794123,3.588939,Restoran Sate,Raja Sate,restoran sate raja sate,0.576401
581,Sate Babi Halim,"Jl. Raya Manyar No.84, Baratajaya, Kec. Gubeng...",4.3,Restoran Sate,"{'lat': -7.2979585, 'lng': 112.7618416}",4.035567,3.06204,2.503183,Restoran Sate,Sate Babi Halim,restoran sate sate babi halim,0.854884
