## Restaurant Review Project

#### My project analyses restaurant items, prices, and other descriptive elements to find patterns in vegan/vegetarian offerings.

In [202]:
# Import via condas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Datafiniti Vegetarian and Vegan Restaurants.csv

In [203]:
# Load and examine the dataset.
# The path is relative, so the CSV is expected in the notebook's working directory.
Restaurant_data_df = pd.read_csv("Datafiniti_Vegetarian_and_Vegan_Restaurants.csv")
# Listing the column labels shows which fields are available before any cleaning.
Restaurant_data_df.columns


Index(['id', 'dateAdded', 'dateUpdated', 'address', 'categories',
       'primaryCategories', 'city', 'claimed', 'country', 'cuisines',
       'descriptions.dateSeen', 'descriptions.sourceURLs',
       'descriptions.value', 'facebookPageURL', 'features.key',
       'features.value', 'hours.day', 'hours.dept', 'hours.hour', 'imageURLs',
       'isClosed', 'keys', 'languagesSpoken', 'latitude', 'longitude',
       'menuPageURL', 'menus.amountMax', 'menus.amountMin', 'menus.category',
       'menus.currency', 'menus.dateSeen', 'menus.description', 'menus.name',
       'menus.sourceURLs', 'name', 'paymentTypes', 'phones', 'postalCode',
       'priceRangeCurrency', 'priceRangeMin', 'priceRangeMax', 'province',
       'sic', 'sourceURLs', 'twitter', 'websites', 'yearOpened'],
      dtype='object')

### Removing Columns

In [204]:
# Drop columns for URLs
def drop_urls(dataframe):
    """Remove every column whose label contains the substring 'URL'.

    The match is case-sensitive, so a label such as 'imageURLs' is dropped
    while 'websites' is kept. Labels that are not strings (for example
    integer labels) are never treated as URL columns, instead of raising a
    TypeError from the substring test.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to trim. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so the result can be reassigned
        or chained.
    """
    url_columns = [
        col for col in dataframe.columns
        if isinstance(col, str) and 'URL' in col
    ]
    dataframe.drop(columns=url_columns, inplace=True)
    return dataframe

# drop_urls modifies the frame in place and returns that same object, so the
# reassignment simply keeps the name bound to the trimmed frame.
Restaurant_data_df = drop_urls(Restaurant_data_df)
# Show the column labels that remain after the URL columns were removed.
Restaurant_data_df.columns

Index(['id', 'dateAdded', 'dateUpdated', 'address', 'categories',
       'primaryCategories', 'city', 'claimed', 'country', 'cuisines',
       'descriptions.dateSeen', 'descriptions.value', 'features.key',
       'features.value', 'hours.day', 'hours.dept', 'hours.hour', 'isClosed',
       'keys', 'languagesSpoken', 'latitude', 'longitude', 'menus.amountMax',
       'menus.amountMin', 'menus.category', 'menus.currency', 'menus.dateSeen',
       'menus.description', 'menus.name', 'name', 'paymentTypes', 'phones',
       'postalCode', 'priceRangeCurrency', 'priceRangeMin', 'priceRangeMax',
       'province', 'sic', 'twitter', 'websites', 'yearOpened'],
      dtype='object')

In [205]:
# Per-column share of missing cells: the mean of the boolean null mask
missing_fraction = Restaurant_data_df.isna().mean()

# Keep the labels of the columns that are missing in more than 80% of rows
is_mostly_missing = missing_fraction.gt(0.8)
columns_to_drop = missing_fraction.loc[is_mostly_missing].index

columns_to_drop


Index(['claimed', 'descriptions.dateSeen', 'descriptions.value',
       'features.key', 'features.value', 'hours.day', 'hours.dept',
       'hours.hour', 'isClosed', 'languagesSpoken', 'sic', 'twitter',
       'yearOpened'],
      dtype='object')

In [206]:
# Drop columns with more than 80% missing values.
# The result is assigned back instead of using inplace=True, and
# errors="ignore" skips labels that are already gone, so re-running this cell
# on an already-trimmed frame no longer raises a KeyError.
Restaurant_data_df = Restaurant_data_df.drop(columns=columns_to_drop, errors="ignore")

# Check the remaining columns
Restaurant_data_df.columns


Index(['id', 'dateAdded', 'dateUpdated', 'address', 'categories',
       'primaryCategories', 'city', 'country', 'cuisines', 'keys', 'latitude',
       'longitude', 'menus.amountMax', 'menus.amountMin', 'menus.category',
       'menus.currency', 'menus.dateSeen', 'menus.description', 'menus.name',
       'name', 'paymentTypes', 'phones', 'postalCode', 'priceRangeCurrency',
       'priceRangeMin', 'priceRangeMax', 'province', 'websites'],
      dtype='object')

### Exploring the remaining data
#### Payment Types Standardization - one of the messiest columns

In [207]:
# List the distinct raw payment strings to see how inconsistently they are spelled.
Restaurant_data_df['paymentTypes'].unique()

array([nan, 'American Express,VISA', 'Cash', 'American Express', 'AMEX',
       'Mastercard,Visa,American Express,Diners Club',
       'American Express,Mastercard,Visa', 'Visa,MasterCard',
       'master card,amex,visa',
       'American Express,Diners Club,Mastercard,Visa',
       'Visa,American Express',
       'amex,discover,master card,visa,Mastercard,Discover,Visa,American Express',
       'American Express,Visa,Discover,Cash,Mastercard',
       'discover,visa,amex,mastercard,MasterCard',
       'Mastercard,Discover,Visa,American Express,Check',
       'Mastercard,Visa', 'Mastercard,Discover,Visa,American Express',
       'Discover,Visa',
       'American Express,Diners Club,Discover,Mastercard,Visa',
       'Mastercard,Visa,American Express',
       'Mastercard,Discover,Visa,American Express,Diners Club,diners club',
       'amex', 'American Express,Mastercard,Visa,discover', 'mastercard',
       'Mastercard',
       'Mastercard,Discover,Visa,American Express,Diners Club',
     

In [208]:
#function to clean the paymentTypes column
def clean_multi_value_column(Restaurant_data_df, paymentTypes):
    """Clean a comma-separated multi-value text column of a DataFrame.

    Steps:
    1. Lowercase all values.
    2. Standardize known spelling variations (substring replacement).
    3. Split each value on commas into a list.
    4. Strip whitespace, drop empty entries, remove duplicates and sort.
    5. Build a ', '-joined string version for easy viewing.

    Parameters
    ----------
    Restaurant_data_df : pandas.DataFrame
        Frame that holds the column. It is modified in place: the source
        column is overwritten with its lowercased/standardized text and two
        new columns are added.
    paymentTypes : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same frame, with '<column>_list' (sorted list of unique entries)
        and '<column>_clean' (those entries joined with ', ') added. Cells
        that are missing in the source column stay missing in both.
    """
    list_column = paymentTypes + '_list'
    clean_column = paymentTypes + '_clean'

    # Step 1: Lowercase everything
    Restaurant_data_df[paymentTypes] = Restaurant_data_df[paymentTypes].str.lower()

    # Step 2: Standardize common variations. Only spellings that actually
    # change are listed; mapping a value to itself has no effect.
    standardization_dict = {
        'amex': 'american express',
        'master card': 'mastercard',
    }
    Restaurant_data_df[paymentTypes] = Restaurant_data_df[paymentTypes].replace(standardization_dict, regex=True)

    # Step 3: Split the string into lists
    Restaurant_data_df[list_column] = Restaurant_data_df[paymentTypes].str.split(',')

    # Step 4: Remove duplicates and sort alphabetically within each cell.
    # Empty entries (from doubled or trailing commas) are discarded so they do
    # not show up as a blank payment type.
    def _unique_sorted(items):
        if not isinstance(items, list):
            return items  # missing value: pass through untouched
        return sorted({item.strip() for item in items if item.strip()})

    Restaurant_data_df[list_column] = Restaurant_data_df[list_column].apply(_unique_sorted)

    # Step 5: Create a cleaned string version for easy viewing
    Restaurant_data_df[clean_column] = Restaurant_data_df[list_column].apply(
        lambda x: ', '.join(x) if isinstance(x, list) else x
    )

    return Restaurant_data_df

# Apply the function
Restaurant_data_df = clean_multi_value_column(Restaurant_data_df, 'paymentTypes')

# View the cleaned results. A fixed random_state makes the preview show the
# same rows on every run, so the displayed output is reproducible.
Restaurant_data_df[['paymentTypes', 'paymentTypes_clean']].sample(10, random_state=42)


Unnamed: 0,paymentTypes,paymentTypes_clean
6204,"mastercard,visa","mastercard, visa"
1403,"mastercard,visa,american express,diners club","american express, diners club, mastercard, visa"
2044,,
6561,"american express,discover,mastercard,visa,amer...","american express, discover, mastercard, visa"
8648,"american express,debit card,discover,mastercard","american express, debit card, discover, master..."
2545,,
335,,
669,cash,cash
1739,,
6751,"american express,discover,mastercard,visa","american express, discover, mastercard, visa"


## Break down the further exploration needs of the dataset

In [209]:
# Export the dataframe to a new CSV so it can be checked in Excel.
# The export is switched off by default: a later cell reads
# 'Restaurant_data_cleaned.csv' after it was edited by hand in Excel, and
# writing the file again would overwrite those manual edits.
EXPORT_INTERMEDIATE_CSV = False
if EXPORT_INTERMEDIATE_CSV:
    Restaurant_data_df.to_csv('Restaurant_data_cleaned.csv', index=False)

Restaurant_data_df.columns


Index(['id', 'dateAdded', 'dateUpdated', 'address', 'categories',
       'primaryCategories', 'city', 'country', 'cuisines', 'keys', 'latitude',
       'longitude', 'menus.amountMax', 'menus.amountMin', 'menus.category',
       'menus.currency', 'menus.dateSeen', 'menus.description', 'menus.name',
       'name', 'paymentTypes', 'phones', 'postalCode', 'priceRangeCurrency',
       'priceRangeMin', 'priceRangeMax', 'province', 'websites',
       'paymentTypes_list', 'paymentTypes_clean'],
      dtype='object')

In [210]:
''' Renaming columns in excel and notes
'id', -- Capitalized in excel, clean
'dateAdded', -- would like to shorten format to date only in python
'dateUpdated', -- would like to shorten format to date only in python
'address', -- Capitalized in excel, clean
'categories', -- Would like to split into individual columns or sub columns in python
'primaryCategories', -- Renamed to "Service Category" in excel, wouuld like to split into individual columns or sub columns in python
'city', -- Capitalized in excel, clean
'country', -- deleted in excel
'cuisines', Capitalized in excel, seperate columns in python?
'keys', -- Capitalized in excel, clean
'latitude' -- decimal formatted in excel
'longitude', decimal formatted in excel
'menus.amountMax', -- decimal formatted in excel($)
'menus.amountMin', -- decimal formatted in excel($)
'menus.category', -- Renamed to "Menu Category" in excel, needs cleaning in python
'menus.currency', -- deleted in excel
'menus.dateSeen', Capitalized in excel,
'menus.description', Capitalized in excel,
'menus.name', Renamed to "Menu Item Name" in excel, clean
'name', -- Renamed to "Restaurant Name" in excel, clean
'paymentTypes', --renamed to "Payment Types" in excel, clean (seperate columns in python?)
'phones', --renamed to "Phone Number", change to boolean in python
'postalCode', -- Capitalized in excel, mostly clean
'priceRangeCurrency', -- deleted in excel
'priceRangeMin', -- deleted in excel
'priceRangeMax', -- decimal formatted in excel($), Renamed "MenuItemPrice"
'province', -- Renamed to "State" in excel, clean
'websites', -- Change to boolean in python
'paymentTypes_list', -- my columns
'paymentTypes_clean' -- my columns
'''
# Three columns deleted in excel: country, priceRangeCurrency, menus.currency
## Legend: () = identifier, [] = online related, $ = ready to use, * = explore/clean, ^ = boolean
# New Names -(ID) [DateAdded] [DateUpdated]
# Address$	Categories*	Service Category*	
# City$ 	Cuisines*	(Key)	
# Latitude$ 	Longitude$	 MenuItemPrice$ 	
# MenuCategory*	[MenuDateSeen]	MenuDescription* MenuItemName*	
# RestaurantName$	PaymentTypes*  PhoneNumber$	PostalCode$	
# PriceRangeMin$	PriceRangeMax$	State$	Website^
# paymentTypes_list	paymentTypes_clean	


' Renaming columns in excel and notes\n\'id\', -- Capitalized in excel, clean\n\'dateAdded\', -- would like to shorten format to date only in python\n\'dateUpdated\', -- would like to shorten format to date only in python\n\'address\', -- Capitalized in excel, clean\n\'categories\', -- Would like to split into individual columns or sub columns in python\n\'primaryCategories\', -- Renamed to "Service Category" in excel, wouuld like to split into individual columns or sub columns in python\n\'city\', -- Capitalized in excel, clean\n\'country\', -- deleted in excel\n\'cuisines\', Capitalized in excel, seperate columns in python?\n\'keys\', -- Capitalized in excel, clean\n\'latitude\' -- decimal formatted in excel\n\'longitude\', decimal formatted in excel\n\'menus.amountMax\', -- decimal formatted in excel($)\n\'menus.amountMin\', -- decimal formatted in excel($)\n\'menus.category\', -- Renamed to "Menu Category" in excel, needs cleaning in python\n\'menus.currency\', -- deleted in excel\

## At this point I have - 
Explored the data, done preliminary cleaning, renamed columns, deleted unusable or unnecessary columns, and assessed further steps.
From here I will - 
- Clean string columns
- Convert dates to date only
- Clean multi-value columns 
- Switch some data to Boolean
- Validate latitude/longitude
- Remove duplicates, and 
- export the file again.
- *some of this has already been done with our payment info. 

In [211]:
# Reload the intermediate CSV after its columns were renamed/removed by hand in Excel.
# NOTE(review): the column listing shows some labels padded with spaces
# (' MenuItemPrice ', ' PriceRangeMin ', ' PriceRangeMax '); they must be
# referenced with the padding unless the labels are stripped - TODO confirm
# whether stripping is wanted.
Restaurant_data_cleaned_df = pd.read_csv("Restaurant_data_cleaned.csv")
Restaurant_data_cleaned_df.columns
# The remaining columns are ready for further cleaning.

Index(['ID', 'DateAdded', 'DateUpdated', 'Address', 'Categories',
       'Service Category', 'City', 'Cuisines', 'Key', 'Latitude', 'Longitude',
       ' MenuItemPrice ', 'MenuCategory', 'MenuDateSeen', 'MenuDescription',
       'MenuItemName', 'RestaurantName', 'PaymentTypes', 'PhoneNumber',
       'PostalCode', ' PriceRangeMin ', ' PriceRangeMax ', 'State', 'Website',
       'paymentTypes_list', 'paymentTypes_clean'],
      dtype='object')

In [212]:
# Turn the Phone Number and Website columns into yes/no flags so they can be
# assessed as "has one / has none": True where a value is present, False
# where the cell is missing.
for flag_column, source_column in (("HasPhone", "PhoneNumber"), ("HasWebsite", "Website")):
    Restaurant_data_cleaned_df[flag_column] = Restaurant_data_cleaned_df[source_column].notna()

In [213]:
# Validate Latitude and Longitude columns: keep only rows whose coordinates
# fall inside the geographic ranges (bounds included).
# NOTE(review): a missing coordinate does not pass between(), so rows without
# a latitude or longitude are removed as well - confirm that is intended.
latitude_ok = Restaurant_data_cleaned_df["Latitude"].between(-90, 90)
longitude_ok = Restaurant_data_cleaned_df["Longitude"].between(-180, 180)
Restaurant_data_cleaned_df = Restaurant_data_cleaned_df[latitude_ok & longitude_ok]


In [214]:
# Remove exact duplicates: rows that repeat an earlier row in every column.
# drop_duplicates() is called with its defaults, so all columns are compared.
Restaurant_data_cleaned_df = Restaurant_data_cleaned_df.drop_duplicates()

In [215]:
# Convert datetime to date only: parse each timestamp column (values that
# cannot be parsed are coerced to NaT) and keep just the calendar date.
for date_column in ("DateAdded", "DateUpdated"):
    parsed = pd.to_datetime(Restaurant_data_cleaned_df[date_column], errors="coerce")
    Restaurant_data_cleaned_df[date_column] = parsed.dt.date



In [216]:
# Cleaning further string columns
object_columns = Restaurant_data_cleaned_df.select_dtypes(include="object").columns

# Object dtype is not only text: columns converted with .dt.date hold
# datetime.date objects. Casting those with astype(str) would turn the dates
# back into text and missing dates into the literal string "NaT", so only
# columns whose non-missing values are all real strings are cleaned.
string_columns = pd.Index([
    col for col in object_columns
    if Restaurant_data_cleaned_df[col].dropna().map(lambda value: isinstance(value, str)).all()
])

for col in string_columns:
    # Remember which cells were missing before astype(str) turns them into text.
    was_missing = Restaurant_data_cleaned_df[col].isna()
    Restaurant_data_cleaned_df[col] = (
        Restaurant_data_cleaned_df[col]
        .astype(str)
        .str.replace("ampamp", "&", regex=False)  # leaked, punctuation-stripped entity
        .str.replace("&amp;", "&", regex=False)   # regular HTML-escaped ampersand
        .str.replace(r"\s+", " ", regex=True)     # collapse runs of whitespace
        .str.strip()
        .mask(was_missing, pd.NA)                 # restore the original missing cells
    )


In [217]:
def normalize_multivalue_column(series):
    """Normalize a comma-separated multi-value text column.

    Each non-missing value is lowercased and split on commas. Every entry is
    stripped of surrounding whitespace, empty entries (from doubled, leading
    or trailing commas) are dropped, and duplicates are removed while keeping
    first-seen order. The entries are then joined back with ', '.

    Parameters
    ----------
    series : pandas.Series
        Text column; it must support the .str accessor.

    Returns
    -------
    pandas.Series
        The cleaned strings. Missing values are passed through unchanged.
    """
    def _dedupe_entries(value):
        if pd.isna(value):
            return value
        entries = (entry.strip() for entry in value.split(","))
        # dict.fromkeys keeps the first occurrence of each entry, in order.
        return ", ".join(dict.fromkeys(entry for entry in entries if entry))

    return series.str.lower().apply(_dedupe_entries)


# For every multi-value field, add a normalized ', '-joined string column
# ('<prefix>_clean') and a matching list column ('<prefix>_list').
# Each pair is (source column, prefix used for the two new columns).
multivalue_columns = (
    ("Categories", "Categories"),
    ("Service Category", "ServiceCategory"),
    ("Cuisines", "Cuisines"),
    ("PaymentTypes", "PaymentTypes"),
)

for source_column, prefix in multivalue_columns:
    normalized = normalize_multivalue_column(Restaurant_data_cleaned_df[source_column])
    Restaurant_data_cleaned_df[prefix + "_clean"] = normalized
    Restaurant_data_cleaned_df[prefix + "_list"] = (
        Restaurant_data_cleaned_df[prefix + "_clean"].str.split(", ")
    )


Rechecking work in Excel - deleted excess columns and keeping cleaned, list columns.
Seeing further issues in - 'menu description' and 'menu item name' with encoding leakage

In [218]:
# Load the CSV that was trimmed by hand in Excel.
# NOTE(review): no visible cell writes 'Restaurant_data_cleaned_final.csv', so
# it has to exist already in the working directory - confirm how it is produced.
final_cleaned_df = pd.read_csv("Restaurant_data_cleaned_final.csv")

In [219]:

def decode_text(series):
    """Repair leaked HTML-entity fragments and tidy punctuation spacing.

    The text contains entity names whose punctuation was lost (for example
    'ampapos' where an apostrophe was meant). Known fragments are mapped back
    to their character, leftover stand-alone 'amp' tokens are removed,
    spacing around commas, quotes, apostrophes and ampersands is normalized,
    and runs of whitespace are collapsed.

    Parameters
    ----------
    series : pandas.Series
        Column to clean; values are cast to str before processing.

    Returns
    -------
    pandas.Series
        The cleaned text. Cells that were missing in the input are returned
        as pd.NA.
    """
    # Remember the missing cells up front: astype(str) turns them into text,
    # and matching on that text afterwards could also hit genuine values.
    was_missing = series.isna()

    cleaned = (
        series.astype(str)
        # Fix common HTML-style encodings
        .str.replace("ampcomma", ",", regex=False)
        .str.replace("ampapos", "'", regex=False)
        .str.replace("amp39", "'", regex=False)
        .str.replace("ampquot", '"', regex=False)
        .str.replace("ampamp", "&", regex=False)
        # Remove leftover 'amp' only when it stands alone as a word; a blanket
        # substring removal would mangle real words such as "Champagne".
        .str.replace(r"\bamp\b", "", regex=True)
        # Normalize punctuation spacing
        .str.replace(r"\s*,\s*", ", ", regex=True)
        .str.replace(r'\s*"\s*', '"', regex=True)
        .str.replace(r"\s*'\s*", "'", regex=True)
        .str.replace(r"\s*&\s*", " & ", regex=True)
        # Collapse extra spaces
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    # Put the missing values back as pd.NA
    return cleaned.mask(was_missing, pd.NA)

# Apply to your DataFrame: decode both free-text menu columns, replacing each
# column with its cleaned version.
for text_column in ("MenuDescription", "MenuItemName"):
    final_cleaned_df[text_column] = decode_text(final_cleaned_df[text_column])


In [220]:
# Spot-check the decoded item names. A fixed random_state makes the preview
# show the same rows on every run, so the displayed output is reproducible.
final_cleaned_df["MenuItemName"].sample(15, random_state=42)

7790                    Tokyo Supergreens Wrap Tofu
1277                       Jacopo Poli Di Secca 40%
2791                         Sesame Ginger Dressing
619                                           Humus
5123                       12 oz Can San Pellegrino
111                       Baked Macaroni and Cheese
5225                                      Pepperoni
7693                              Chia Seed Pudding
2120                                          Juice
6829                              Vegetable Samosas
4090                                     Lean Green
6522                                     Dal Makhni
8224                                    Mango Lassi
2843    Bowl of Soup w toasted organic 9grain bread
3011                                       Two Eggs
Name: MenuItemName, dtype: object

In [223]:
# NOTE: this bare expression is not the last statement of the cell, so its
# value is not displayed.
final_cleaned_df.columns
# Export the final cleaned DataFrame to a new CSV (index=False leaves the row
# index out of the file).
final_cleaned_df.to_csv('Vegan_Vegetarian_Restaurants.csv', index=False)