In [1]:
# Import libraries
import pandas as pd
import numpy as np
import scipy.stats as st

In [2]:
# Load the data
# Yelp business dataset
df = pd.read_json('yelp_academic_dataset_business.json', lines=True)
df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [3]:
# Filter rows where the 'categories' column contains the word 'Restaurants'
restaurant_df = df[df['categories'].str.contains('Restaurants', case=False, na=False)]

# Display the first few rows of the DataFrame to verify that only restaurants are included
restaurant_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."
8,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,"{'Caters': 'True', 'Alcohol': 'u'full_bar'', '...","Pubs, Restaurants, Italian, Bars, American (Tr...",
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,1,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '..."
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."


In [5]:
# Iterate over the 'attributes' column and collect all unique keys
unique_attributes = set()
for attributes in df['attributes'].dropna():
    unique_attributes.update(attributes.keys())

# Print the unique attribute keys, one per line
unique_attributes_list = list(unique_attributes)
for attribute in unique_attributes_list:
    print(attribute)

BusinessParking
CoatCheck
RestaurantsTableService
BusinessAcceptsBitcoin
GoodForKids
DietaryRestrictions
Open24Hours
DogsAllowed
GoodForDancing
RestaurantsAttire
BYOB
HasTV
RestaurantsGoodForGroups
BestNights
Corkage
AcceptsInsurance
DriveThru
HairSpecializesIn
Ambience
RestaurantsDelivery
ByAppointmentOnly
BusinessAcceptsCreditCards
BikeParking
OutdoorSeating
Smoking
Alcohol
RestaurantsTakeOut
AgesAllowed
BYOBCorkage
NoiseLevel
RestaurantsPriceRange2
RestaurantsCounterService
Caters
RestaurantsReservations
WiFi
Music
WheelchairAccessible
GoodForMeal
HappyHour


In [6]:
# Create a function to interpret the p-value
def p_value_reader(p_value, alpha):
    if p_value < alpha:
        print("Reject the Null Hypothesis")
    else:
        print("Fail to reject the Null Hypothesis")

## Hypothesis 1: Restaurants that are open have better ratings that those that are not
**Null Hypothesis**: Open restaurant ratings <= Closed Restaurant ratings
**Alternative Hypothesis**: Open restaurants ratings  > Closed restaurant ratings

In [7]:
# Extract the 'stars' ratings for open and closed restaurants
open_rest_stars = restaurant_df[restaurant_df['is_open'] == 1]['stars']
closed_rest_stars = restaurant_df[restaurant_df['is_open'] == 0]['stars']

In [8]:
# Compare the mean
print(f"The star rating mean for open restaurants is {open_rest_stars.mean()}")
print(f"The star rating mean for closed restaurants is {closed_rest_stars.mean()}")

The star rating mean for open restaurants is 3.523894589418927
The star rating mean for closed restaurants is 3.4975117180718707


In [9]:
# Build a function that performs a 2-sample test based on the outcome of Levene's test
def test_2sample(sample1, sample2, alpha, alternative):
    # Perform Levene's test to check for equal variances
    stat, p_value = st.levene(sample1, sample2)

    # Interpret the result of Levene's test
    if p_value < alpha:
        equal_var = False
        print("Reject the Null Hypothesis. Variances are unequal. Perform Welch's Test")
    else:
        equal_var = True
        print("Fail to reject the Null Hypothesis. Variances are equal. Perform 2-sample T-test")

    # Perform the 2-sample t-test
    t_statist, p_value = st.ttest_ind(sample1,
                                      sample2,
                                      equal_var = equal_var,
                                      alternative = alternative)

    # Print the p-value from the t-test
    print(f"The p-value is {p_value}")
    # Call a function to interpret the p-value (assumes p_value_reader is defined elsewhere)
    p_value_reader(p_value, alpha)

In [11]:
# Apply the function
test_2sample(open_rest_stars, closed_rest_stars, 0.05, 'greater')

Reject the Null Hypothesis. Variances are unequal. Perform Welch's Test
The p-value is 0.00017650010900391988
Reject the Null Hypothesis


# Hypothesis 2: Restaurants that deliver food have worse ratings
**Null Hypothesis**: Restaurants that deliver food ratings >= restaurants that don't deliver food ratings **Alternative Hypothesis**: Restaurants that deliver food ratings < restaurants that don't deliver food ratings

In [12]:
# Create a copy of the df
df_h2 = restaurant_df.copy()

In [14]:
# Define a function to extract the 'RestaurantsDelivery' value
def is_delivery(attributes):
    if attributes and 'RestaurantsDelivery' in attributes:
        return attributes['RestaurantsDelivery'] == 'True'
    return False

# Apply the function to the 'attributes' column and create a new column 'delivers_food'
df_h2['delivers_food'] = df_h2['attributes'].apply(is_delivery)

In [15]:
# Extract the 'stars' ratings for delivery and non-delivery restaurants
delivery_stars = df_h2[df_h2['delivers_food'] == True]['stars']
non_delivery_stars = df_h2[df_h2['delivers_food'] == False]['stars']

In [16]:
# Apply the function
test_2sample(delivery_stars, non_delivery_stars, 0.05, 'less')

Reject the Null Hypothesis. Variances are unequal. Perform Welch's Test
The p-value is 3.1154302069405886e-227
Reject the Null Hypothesis


# Hypothesis 3: Restaurants that allow smoking are less likely to be open
**Null Hypothesis**: There is no relationship between the variables **Alternative Hypothesis**: Restaurants that allow smoking are less likely to be open

In [17]:
# Create a copy for the df
df_h3 = restaurant_df.copy()

In [18]:
# Create a new column 'allows_smoking' to indicate if smoking is allowed
# Check if 'attributes' column is not None and if 'Smoking' key exists with value 'True'
df_h3['allows_smoking'] = df_h3['attributes'].apply(lambda x: x.get('Smoking') == 'True' if x else False)

In [19]:
# Create a contingency table to observe the frequency distribution between the smoking
contingency_table = pd.crosstab(df_h3['allows_smoking'], df_h3['is_open'])
print(contingency_table)

is_open             0      1
allows_smoking              
False           17281  34987


In [21]:
_, p_value, _, _= st.chi2_contingency(observed = contingency_table)
p_value

1.0