In [None]:
import pandas as pd
import re
pd.options.display.max_rows = 600

# Load csv files (one file per star level)
df1 = pd.read_csv("one-star-michelin-restaurants.csv")
df2 = pd.read_csv("two-stars-michelin-restaurants.csv")
df3 = pd.read_csv("three-stars-michelin-restaurants.csv")

# Add rating column for each data frame
df1["rating"], df2["rating"], df3["rating"] = 1, 2, 3

# Merge data frames into single data frame.
# ignore_index=True gives every row a unique label; without it the three
# source frames contribute overlapping labels (0, 1, 2, ...), which makes any
# index-aligned operation below ambiguous. The exported CSV is unaffected
# because it is written with index=False.
df = pd.concat([df1, df2, df3], ignore_index=True)

# Remove year and zip code
df = df.drop(columns=["year", "zipCode"])


def _to_ascii(text_column):
    """Return the string column with non-ASCII characters stripped.

    NFKD normalization splits accented letters into a base letter plus
    combining marks; encoding to ASCII with errors='ignore' then drops every
    character that has no ASCII representation. Missing values stay missing.
    """
    return (
        text_column
        .str.normalize("NFKD")
        .str.encode("ascii", errors="ignore")
        .str.decode("utf-8")
    )


def _clean_city(city):
    """Strip non-letter characters from a city name and keep the part after "/".

    Everything except ASCII letters, spaces, "/" and "-" is removed, trailing
    spaces/hyphens are trimmed, and if a "/" remains only the segment right
    after the first "/" is kept. Non-string values (e.g. NaN when both city
    and region are missing) are returned unchanged instead of raising.
    """
    if not isinstance(city, str):
        return city
    city = re.sub(r"[^A-Za-z /-]+", "", city).rstrip(" -")
    return city.split("/")[1] if "/" in city else city


# Translate accent characters into English characters in name and city column
df["name"] = _to_ascii(df["name"])
df["city"] = _to_ascii(df["city"])

# Default empty city values to its region.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: with pandas Copy-on-Write the in-place call operates on a
# temporary object and leaves df untouched.
df["city"] = df["city"].fillna(df["region"])

# Remove numeric portion of city from Rio de Janeiro & Sao Paulo and remove substring before "/" in city
df["city"] = df["city"].map(_clean_city)

# Replace missing prices with the placeholder string "na"
df["price"] = df["price"].fillna("na")
# Convert price to its string length (original note: integer 1-5) or "na"
df["price"] = df["price"].map(lambda price: "na" if price == "na" else len(str(price)))

# Drop unnecessary/filler words in cuisine column.
# The '|'-joined lists are regex alternations, so regex=True must be passed
# explicitly: pandas >= 2.0 defaults to regex=False and would look for the
# literal text "a|b|c" (and therefore replace nothing).
postfix_pattern = '|'.join([' cuisine', ' contemporary', ' Roast Meats', ' influences', ' and grills', ' and congee'])
prefix_pattern = '|'.join(['Modern ', 'Traditional ', 'Creative ', 'Classic ', ' Contemporary', 'Southern '])
df["cuisine"] = (
    df["cuisine"]
    .str.replace(postfix_pattern, "", regex=True)
    .str.replace(prefix_pattern, "", regex=True)
)

# Convert column values to lowercase and remove punctuations
for column in ["name", "city", "region", "cuisine"]:
    df[column] = (
        df[column]
        # commas and periods are removed outright (literal match, not regex)
        .str.replace(",", "", regex=False)
        .str.replace(".", "", regex=False)
        # any other non-alphanumeric character (except newline) becomes a space
        .str.replace(r"[^a-zA-Z0-9\n]", " ", regex=True)
        # collapse each run of whitespace into a single space
        .str.replace(r"\s+", " ", regex=True)
        # spaces become underscores, then everything is lowercased
        .str.replace(" ", "_", regex=False)
        .str.lower()
    )

# Export data frame to csv
df.to_csv("all-michelin-restaurants-cleaned.csv", index=False)


Read in CSV files of Michelin Star ratings, convert data into Prolog facts, and write as a Prolog file.
Prolog Fact Structure:

restaurant(Name, Lat, Long, City, Region, Cuisine, Price, Url, Rating).
Example:

restaurant(kilian_stuba,47.34858,10.17114,kleinwalsertal,austria,creative,5,'https://guide.michelin.com/at/en/vorarlberg/kleinwalsertal/restaurant/kilian-stuba',1).

In [None]:
# Load the cleaned restaurant table and emit one Prolog fact per row:
#   restaurant(name,latitude,longitude,city,region,cuisine,price,'url',rating).
# Only the url argument is wrapped in single quotes; every other value is
# written as its plain str() form.

df = pd.read_csv("Datasets/all-michelin-restaurants-cleaned-v2.csv")

# Columns written unquoted, in the order they appear before the url argument.
unquoted_fields = ("name", "latitude", "longitude", "city", "region", "cuisine", "price")

with open('michelin-restaurants.pl', 'w') as prolog_file:
    for record in df.itertuples():
        bare_args = ",".join(str(getattr(record, field)) for field in unquoted_fields)
        prolog_file.write(f"restaurant({bare_args},'{record.url!s}',{record.rating!s}).\n")