In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import re 
from datetime import datetime
import ast
import requests
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm
2024-10-14 15:18:53.633322: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-14 15:18:53.634510: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-14 15:18:53.637415: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-14 15:18:53.646187: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-14 15:18:53.660782: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register f

#### Functions

In [2]:
# Function to convert 12-hour time to 24-hour format
def convert_to_24hr(time_str):
    try:
        return datetime.strptime(time_str, '%I:%M %p').strftime('%H:%M')
    except ValueError:
        return None

# Function to infer AM/PM when missing
def infer_am_pm(open_time, close_time):
    # If open_time is missing AM/PM, infer based on close_time's AM/PM
    if "AM" in close_time and int(close_time.split(':')[0]) < 12:
        # Likely morning (e.g., close at 11:30 AM, so open is likely AM)
        return open_time + " AM"
    elif "PM" in close_time:
        # Likely afternoon/evening (e.g., close at 11:30 PM, so open is likely PM)
        return open_time + " PM"
    return open_time

# Function to extract opening and closing times in 24-hour format
def extract_open_close_times(hours):
    if pd.isna(hours):  # Check for NaN or missing values
        return pd.Series([None, None])
    
    if 'Closed' in hours:  # Handle "Closed" entries
        return pd.Series([None, None])
    
    # Regular expression to match time ranges (e.g., 5:00 PM–1:00 AM)
    time_pattern = r'(\d{1,2}:\d{2}\s?[APMapm]{2})\s?[–-]\s?(\d{1,2}:\d{2}\s?[APMapm]{2})'

    # Find all time ranges in the string
    times = re.findall(time_pattern, hours)
    
    if times:
        open_time = times[0][0]
        close_time = times[0][1]
        
        # If the open time is missing AM/PM, infer it from the close time
        if not re.search(r'[APMapm]{2}', open_time):  # If AM/PM is missing in open time
            open_time = infer_am_pm(open_time, close_time)
        
        # Convert times to 24-hour format
        open_time_24 = convert_to_24hr(open_time)
        close_time_24 = convert_to_24hr(close_time)
        
        return pd.Series([open_time_24, close_time_24])
    
    # Regular expression go match time ranges (e.g., 5:00–11:00 PM)
    time_pattern2 = r'(\d{1,2}:\d{2})\s?[–-]\s?(\d{1,2}:\d{2}\s?[APMapm]{2})'
    times2 = re.findall(time_pattern2, hours)

    if times2:
        open_time = times2[0][0]
        close_time = times2[0][1]
        
        # If the open time is missing AM/PM, infer it from the close time
        if not re.search(r'[APMapm]{2}', open_time):
            open_time = infer_am_pm(open_time, close_time)
        
        # Convert times to 24-hour format
        open_time_24 = convert_to_24hr(open_time)
        close_time_24 = convert_to_24hr(close_time)
        
        return pd.Series([open_time_24, close_time_24])
        
    return pd.Series([None, None])

def get_location_info(address):
    url = 'https://nominatim.openstreetmap.org/search'
    params = {
        'q': address,
        'format': 'json',
        'addressdetails': 1,  # Get detailed address components
        'limit': 1,  # Only return the top result
    }
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        data = response.json()
        if data:
            address_components = data[0]['address']
            
            # Extract district and locality (if available)
            district = address_components.get('city_district') or address_components.get('suburb') or address_components.get('county')
            
            return district
        else:
            return None
    else:
        return None

# compute distances between restaurants using latitude and longitude 
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in kilometers

    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat/2) * np.sin(dlat/2) + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon/2) * np.sin(dlon/2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c

    return distance

def check_closed(row):
    if row['visit_day'] in row:
        if 'Closed' in row[row['visit_day']]:
            return True
    return False

# Function to clean and convert review ratings
def process_reviews(review_string):
    try:
        # Remove unwanted characters and split by commas
        cleaned_ratings = review_string.replace('[', '').replace(']', '').split(',')
        # Convert strings to floats
        numeric_ratings = [float(i.strip()) for i in cleaned_ratings]
        # Sort, take the top 3, and calculate the mean
        return round(np.mean(sorted(numeric_ratings, reverse=True)[:3]),2)
    except ValueError:
        # Handle cases where conversion fails
        return np.nan

In [3]:
# read csv data and convert columns to lower case
historical_data = pd.read_csv('datasets/random_user_historical_data.csv')
historical_data.columns = historical_data.columns.str.lower()
historical_data.columns = historical_data.columns.str.replace(' ', '_')

In [4]:
historical_data.head()

Unnamed: 0,user_id,place_id,name,summary,price_level,address,latitude,longitude,overall_rating,opening_hours,total_reviews,reviews,review_ratings,user_ratings,visit_date,visit_time
0,1,ChIJLT1gFNtRqEcRUIdXWAGRLB8,Restaurant Austernbank,"Upscale seafood plates presented in a grand, s...",,"Behrenstraße 42, Berlin",52.515929,13.391821,4.3,"Monday: Closed, Tuesday: 5:00 PM – 1:00 AM, We...",323,"[""Was ok, I guess...\nI'd say everything was a...","[4, 5, 2]",1,2023-12-10 20:34:25.536555,14:02:46
1,1,ChIJOeDFuFtOqEcRE8S2p2Wcu-c,Trattoria Libau,Unassuming operation with an old-school vibe d...,2.0,"Libauer Straße 10, Berlin-Bezirk Friedrichshai...",52.508665,13.452836,4.5,"Monday: 4:00 PM – 12:00 AM, Tuesday: 4:00 PM –...",1116,['Good place for a quick Italian food. The piz...,"[4, 5, 5]",5,2024-07-27 20:34:25.536559,17:40:19
2,1,ChIJQUz4iklQqEcRoR2HY1JwkQE,Fadi Food,Unpretentious restaurant serving shawarma wrap...,1.0,"Potsdamer Straße 117, Berlin-Bezirk Tempelhof-...",52.498779,13.362772,4.2,"Monday: 7:00 AM – 11:00 PM, Tuesday: 7:00 AM –...",840,['Amazingly decent food that tastes exactly li...,"[5, 5, 5]",4,2023-09-24 20:34:25.536540,18:57:03
3,1,ChIJrfRNQslPqEcRPuCFqF0-DMI,St. Bart,Bar & kitchen with simple tiled décor plating ...,2.0,"Graefestraße 71, Berlin-Bezirk Friedrichshain-...",52.49192,13.417182,4.5,"Monday: 6:00 – 11:30 PM, Tuesday: 6:00 – 11:30...",664,['Really tasty meal and superb experience! All...,"[5, 4, 3]",3,2024-06-25 20:34:25.536563,10:54:32
4,2,ChIJ4Vof-khQqEcR1CPHIeU2ud4,Nafis Restaurant,"This small, modest Persian spot with tradition...",2.0,"Winterfeldtstraße 11, Berlin-Bezirk Tempelhof-...",52.49614,13.359165,4.3,"Monday: Closed, Tuesday: 11:30 AM – 10:30 PM, ...",639,"[""Super fair prices for tasty, authentic Irani...","[5, 4, 5]",5,2023-10-07 20:34:25.536566,13:08:21


In [5]:
# check column data types and missing values
historical_data.info()

# check for number of rows 
print(historical_data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356 entries, 0 to 355
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   user_id         356 non-null    int64  
 1   place_id        356 non-null    object 
 2   name            356 non-null    object 
 3   summary         356 non-null    object 
 4   price_level     321 non-null    float64
 5   address         356 non-null    object 
 6   latitude        356 non-null    float64
 7   longitude       356 non-null    float64
 8   overall_rating  356 non-null    float64
 9   opening_hours   346 non-null    object 
 10  total_reviews   356 non-null    int64  
 11  reviews         356 non-null    object 
 12  review_ratings  356 non-null    object 
 13  user_ratings    356 non-null    int64  
 14  visit_date      356 non-null    object 
 15  visit_time      356 non-null    object 
dtypes: float64(4), int64(3), object(9)
memory usage: 44.6+ KB
(356, 16)


In [6]:
# check for missing values
historical_data.isnull().sum()

user_id            0
place_id           0
name               0
summary            0
price_level       35
address            0
latitude           0
longitude          0
overall_rating     0
opening_hours     10
total_reviews      0
reviews            0
review_ratings     0
user_ratings       0
visit_date         0
visit_time         0
dtype: int64

In [7]:
# no. of unique users and values 
print('no of unique users: ', historical_data['user_id'].nunique())
print('no of unique restaurants: ', historical_data['place_id'].nunique())

# no. of restaurant duplicates
print('no. of restaurant duplicates: ', historical_data.duplicated(subset=['place_id']).sum())


no of unique users:  100
no of unique restaurants:  177
no. of restaurant duplicates:  179


In [8]:
# convert visit date to datetime format
historical_data['visit_date'] = pd.to_datetime(historical_data['visit_date'])

# extract day of visit
historical_data['visit_day'] = historical_data['visit_date'].dt.day_name()

# convert visit date to date format
historical_data['visit_date'] = historical_data['visit_date'].dt.date


In [9]:
historical_data.head()

Unnamed: 0,user_id,place_id,name,summary,price_level,address,latitude,longitude,overall_rating,opening_hours,total_reviews,reviews,review_ratings,user_ratings,visit_date,visit_time,visit_day
0,1,ChIJLT1gFNtRqEcRUIdXWAGRLB8,Restaurant Austernbank,"Upscale seafood plates presented in a grand, s...",,"Behrenstraße 42, Berlin",52.515929,13.391821,4.3,"Monday: Closed, Tuesday: 5:00 PM – 1:00 AM, We...",323,"[""Was ok, I guess...\nI'd say everything was a...","[4, 5, 2]",1,2023-12-10,14:02:46,Sunday
1,1,ChIJOeDFuFtOqEcRE8S2p2Wcu-c,Trattoria Libau,Unassuming operation with an old-school vibe d...,2.0,"Libauer Straße 10, Berlin-Bezirk Friedrichshai...",52.508665,13.452836,4.5,"Monday: 4:00 PM – 12:00 AM, Tuesday: 4:00 PM –...",1116,['Good place for a quick Italian food. The piz...,"[4, 5, 5]",5,2024-07-27,17:40:19,Saturday
2,1,ChIJQUz4iklQqEcRoR2HY1JwkQE,Fadi Food,Unpretentious restaurant serving shawarma wrap...,1.0,"Potsdamer Straße 117, Berlin-Bezirk Tempelhof-...",52.498779,13.362772,4.2,"Monday: 7:00 AM – 11:00 PM, Tuesday: 7:00 AM –...",840,['Amazingly decent food that tastes exactly li...,"[5, 5, 5]",4,2023-09-24,18:57:03,Sunday
3,1,ChIJrfRNQslPqEcRPuCFqF0-DMI,St. Bart,Bar & kitchen with simple tiled décor plating ...,2.0,"Graefestraße 71, Berlin-Bezirk Friedrichshain-...",52.49192,13.417182,4.5,"Monday: 6:00 – 11:30 PM, Tuesday: 6:00 – 11:30...",664,['Really tasty meal and superb experience! All...,"[5, 4, 3]",3,2024-06-25,10:54:32,Tuesday
4,2,ChIJ4Vof-khQqEcR1CPHIeU2ud4,Nafis Restaurant,"This small, modest Persian spot with tradition...",2.0,"Winterfeldtstraße 11, Berlin-Bezirk Tempelhof-...",52.49614,13.359165,4.3,"Monday: Closed, Tuesday: 11:30 AM – 10:30 PM, ...",639,"[""Super fair prices for tasty, authentic Irani...","[5, 4, 5]",5,2023-10-07,13:08:21,Saturday


In [10]:
# Extract Opening Hours By Day 
historical_data[['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']] = historical_data['opening_hours'].str.split(r'[a-zA-Z]+:', expand=True).drop(0, axis=1).replace(',', ' ')
historical_data.drop('opening_hours', axis=1, inplace=True)

for col in ['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']:
    historical_data[col] = historical_data[col].str.replace(',', '')

In [11]:
for col in ['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']:
    historical_data[[col + '_open', col + '_close']] = historical_data[col].apply(extract_open_close_times)

historical_data.drop(['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun'], axis=1, inplace=True)


In [12]:
# assign closed to the missing values in opening_hours
for col in ['mon_open', 'tues_open', 'wed_open', 'thurs_open', 'fri_open', 'sat_open', 'sun_open', 'mon_close', 'tues_close', 'wed_close', 'thurs_close', 'fri_close', 'sat_close', 'sun_close']:
    historical_data[col] = np.where(historical_data[col].isna(), 'closed', historical_data[col])

In [13]:
# using OpenStreetMaps to determine the district of each restaurant
unique_address = set(historical_data['address'])
unique_address_list = list(unique_address)
district_list = []

for restaurant in unique_address_list:
    district = get_location_info(restaurant)
    district_list.append(district)

address_district_dict = {address: district for address, district in zip(unique_address_list, district_list)}

# if the word 'Berlin-Bezirk' exists in the key of address_district_dict and v is empty, extract the word after 'Berlin-Bezirk' and assign it to the value of the key
for k,v in address_district_dict.items():    
    if v is None:
        if 'Berlin-Bezirk' in k:
            address_district_dict[k] = k.split('Berlin-Bezirk')[1].strip()

# assign the district to the address in the historical_data dataframe
historical_data['district'] = historical_data['address'].map(address_district_dict)

# if you see 'Berlin Hauptbahnhof' in the address, assign 'Mitte' to the district
historical_data.loc[historical_data['address'].str.contains('Greifswalder Straße'), 'district'] = 'Pankow'

# fill remaining missing values with 'Mitte'
historical_data['district'].fillna('Mitte', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  historical_data['district'].fillna('Mitte', inplace=True)


In [14]:
historical_data['visit_day'] = historical_data['visit_day'].astype('string')

In [15]:
# extract cuisine type from summary column 
summary_list = historical_data['summary'].tolist()
summary_list = [text for text in summary_list if text]

In [None]:
# List of possible cuisines
candidate_labels = ["Italian", "Middle Eastern", "Vietnamese", "Turkish", "Persian", "Argentinian", "Bar", "Asian", "Breakfast", "Cafe", "Balkan", "Korean", "Chinese", "Mediterranean", "Thai", "Mexican", "German", "American", "French", "Japanese", "Indian"]

# Load the zero-shot classification pipeline
zeroshot_classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0")
output_list = []
for text in summary_list:
    try:
        output = zeroshot_classifier(text, candidate_labels, multi_label=True)
        output_list.append(output)
    except Exception as e:
        print(f"Error processing text: {text}\nError: {e}")

2024-10-14 15:20:41.323008: E tensorflow/core/util/util.cc:131] oneDNN supports DT_HALF only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.
All PyTorch model weights were used when initializing TFDebertaV2ForSequenceClassification.

All the weights of TFDebertaV2ForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2ForSequenceClassification for predictions without further training.


In [None]:
# print top two labels and summary and score 
for output, summary in zip(output_list, summary_list):
    print(f"Summary: {summary}")
    for i in range(2):
        print(f"Label: {output['labels'][i]}, Score: {output['scores'][i]}")
    print("\n")

In [None]:
# extract the top two labels and their scores
summary_cuisine_dict = {}

for output, summary in zip(output_list, summary_list):
    if output['scores'][0] >= 0.7 or output['scores'][1] >= 0.7:
        if output['scores'][0] > output['scores'][1]:
            summary_cuisine_dict[output['sequence']] = output['labels'][0]
            # summary_cuisine_dict[output['sequence']] = [output['labels'][0], round(output['scores'][0],3)]
        elif output['scores'][1] > output['scores'][0]:
            summary_cuisine_dict[output['sequence']] = output['labels'][1]
    else:
        summary_cuisine_dict[output['sequence']] = 'None'

summary_cuisine_dict


In [85]:
historical_data['cuisine'] = historical_data['summary'].map(summary_cuisine_dict)

In [86]:
historical_data.to_csv('new_cleaned_data.csv', index=False)