# About
Contingency table and chi-square analysis of restaurants in the Team 7 dataset.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
import pingouin
import numpy as np

# Load dataset

In [None]:
restaurant_df = pd.read_json('yelp_team7_dataset_restaurant.json')

In [None]:
restaurant_df.shape

In [None]:
categories_df = pd.read_json('yelp_team7_dataset_category.json')
categories_df.shape

In [None]:
restaurant_df = restaurant_df.merge(categories_df, on = 'business_id', how = 'left')

In [None]:
restaurant_df.shape

# Preliminary context

In [None]:
# How many restaurants were open?
restaurant_df.is_open.value_counts(dropna = False)

In [None]:
restaurant_df.is_open.value_counts(normalize = True).plot(kind = 'bar')
plt.show()

# Does restaurant survival in our sample depend on city?

In [None]:
city_contingency_table = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['city'])
city_contingency_table

In [None]:
ax = sns.heatmap(city_contingency_table, cmap="Blues")
ax.set_title("Restaurant survival status vs. City (counts)")

In [None]:
city_contingency_norm_columns = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['city'],
                                          normalize = 'columns') * 100
city_contingency_norm_columns

In [None]:
ax = sns.heatmap(city_contingency_norm_columns, cmap="Blues")
ax.set_title("Restaurant survival status vs. City (column normalized)")
#plt.show()
plt.savefig('heatmap_city_col.png', bbox_inches='tight')

In [None]:
city_contingency_norm_index = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['city'],
                                          normalize = 'index') * 100
city_contingency_norm_index

In [None]:
ax = sns.heatmap(city_contingency_norm_index, cmap="Blues")
ax.set_title("Restaurant survival status vs. City (index normalized)")
#plt.show()
plt.savefig('heatmap_city_index.png', bbox_inches='tight')

$H_0$ : `is_open` is independent of `city`

$H_1$ : `is_open` is not independent of `city`

In [None]:
# chi-square test of independence on the city counts table;
# element [1] of the result is the p-value, shown to two decimals
city_p_value = chi2_contingency(city_contingency_table)[1]
round(city_p_value, 2)

Reject the null hypothesis. Restaurant survival status is not independent of city. 

# Does restaurant survival depend on cuisine type?

In [None]:
# First look at the counts per cuisine type, before any categories are merged
cuisine_contingency_table = pd.crosstab(restaurant_df['is_open'], restaurant_df['cuisine_type'])
cuisine_contingency_table

Very few observations fall in `Other`, which will distort the normalized tables in later analyses. Combine `Other` with `Other Asian` into a single `Other Asian and Australian` category.

In [None]:
restaurant_df.cuisine_type.value_counts(dropna = False)

In [None]:
cuisines_to_combine = ['Other', 'Other Asian']
condition = restaurant_df['cuisine_type'].isin(cuisines_to_combine)

In [None]:
restaurant_df.loc[condition, 'cuisine_type'] = 'Other Asian and Australian'

In [None]:
restaurant_df.cuisine_type.value_counts(dropna = False)

In [None]:
cuisine_contingency_table = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['cuisine_type'])
cuisine_contingency_table

In [None]:
ax = sns.heatmap(cuisine_contingency_table, cmap="Blues")
ax.set_title("Restaurant survival status vs. Cuisine (counts)")

In [None]:
cuisine_contingency_norm_columns = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['cuisine_type'],
                                          normalize = 'columns') * 100
cuisine_contingency_norm_columns

In [None]:
ax = sns.heatmap(cuisine_contingency_norm_columns, cmap="Blues")
ax.set_title("Restaurant survival status vs. Cuisine (column normalized)")
#plt.show()
plt.savefig('heatmap_cuisine_col.png', bbox_inches='tight')

In [None]:
cuisine_contingency_norm_index = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['cuisine_type'],
                                          normalize = 'index') * 100
cuisine_contingency_norm_index

In [None]:
ax = sns.heatmap(cuisine_contingency_norm_index, cmap="Blues")
ax.set_title("Restaurant survival status vs. Cuisine (index normalized)")
#plt.show()
plt.savefig('heatmap_cuisine_index.png', bbox_inches='tight')

$H_0$ : `is_open` is independent of `cuisine_type`

$H_1$ : `is_open` is not independent of `cuisine_type`

In [None]:
# chi-square test of independence on the cuisine counts table;
# element [1] of the result is the p-value, shown to two decimals
cuisine_p_value = chi2_contingency(cuisine_contingency_table)[1]
round(cuisine_p_value, 2)

Reject the null hypothesis that restaurant survival status is independent of cuisine type. 

# Does restaurant survival depend on food type?

In [None]:
food_contingency_table = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['food_type'])
food_contingency_table

In [None]:
ax = sns.heatmap(food_contingency_table, cmap="Blues")
ax.set_title("Restaurant survival status vs. Food type (counts)")

In [None]:
food_contingency_norm_columns = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['food_type'],
                                          normalize = 'columns') * 100
food_contingency_norm_columns

In [None]:
ax = sns.heatmap(food_contingency_norm_columns, cmap="Blues")
ax.set_title("Restaurant survival status vs. Food type (normalize by Food type)")

In [None]:
food_contingency_norm_index = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['food_type'],
                                          normalize = 'index') * 100
food_contingency_norm_index

In [None]:
ax = sns.heatmap(food_contingency_norm_index, cmap="Blues")
ax.set_title("Restaurant survival status vs. Food type (normalize by survival status)")

$H_0$ : `is_open` is independent of `food_type`

$H_1$ : `is_open` is not independent of `food_type`

In [None]:
# chi-square test of independence on the food type counts table;
# element [1] of the result is the p-value, shown to two decimals
food_p_value = chi2_contingency(food_contingency_table)[1]
round(food_p_value, 2)

Reject the null hypothesis that restaurant survival status is independent of food type.

# Does restaurant survival depend on atmosphere type?

In [None]:
# Working subset: identifier, survival flag and the categorical descriptors
columns = ['business_id', 'is_open', 'cuisine_type', 'atmosphere_type', 'food_type', 'dietary_type', 'hi_transmission_risk']
# .copy() gives subset_df its own data. Later cells assign into subset_df with
# .loc (filling hi_transmission_risk, adding transmission_risk); on a plain
# column selection of restaurant_df those assignments raise
# SettingWithCopyWarning.
subset_df = restaurant_df[columns].copy()

In [None]:
#subset_df.loc[subset_df.atmosphere_type.isna(), 'atmosphere_type'] = 'unidentified'

In [None]:
#subset_df.atmosphere_type.value_counts()

In [None]:
atmosphere_contingency_table = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['atmosphere_type'])
atmosphere_contingency_table

In [None]:
ax = sns.heatmap(atmosphere_contingency_table, cmap="Blues")
ax.set_title("Restaurant survival status vs. Atmosphere type (counts)")

In [None]:
atmosphere_contingency_norm_columns = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['atmosphere_type'],
                                          normalize = 'columns') * 100
atmosphere_contingency_norm_columns

In [None]:
ax = sns.heatmap(atmosphere_contingency_norm_columns, cmap="Blues")
ax.set_title("Restaurant survival status vs. Atmosphere type (normalize by Atmosphere type)")

In [None]:
atmosphere_contingency_norm_index = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['atmosphere_type'],
                                          normalize = 'index') * 100
atmosphere_contingency_norm_index

In [None]:
ax = sns.heatmap(atmosphere_contingency_norm_index, cmap="Blues")
ax.set_title("Restaurant survival status vs. Atmosphere type (normalize by survival status)")

$H_0$ : `is_open` is independent of `atmosphere_type`

$H_1$ : `is_open` is not independent of `atmosphere_type`

In [None]:
# chi-square test of independence on the atmosphere type counts table;
# element [1] of the result is the p-value, shown to two decimals
atmosphere_p_value = chi2_contingency(atmosphere_contingency_table)[1]
round(atmosphere_p_value, 2)

If we analyze only the restaurants that have an atmosphere type identified, we get a p-value of 0.02. Thus, at $\alpha = 0.01$, we do not have sufficient evidence to reject the null hypothesis that survival status is independent of atmosphere type. If we are willing to increase the type-I error rate to 0.05, we reject the null hypothesis.

# Does restaurant survival depend on its risk of COVID transmission?

## Using transmission risk category

In [None]:
subset_df.hi_transmission_risk.value_counts(dropna = False)

In [None]:
subset_df.loc[subset_df.hi_transmission_risk.isna(), 'hi_transmission_risk'] = 'Category of Low Transmission Risk'

In [None]:
subset_df.hi_transmission_risk.value_counts()

In [None]:
transmission_contingency_table = pd.crosstab(index=subset_df['is_open'], columns=subset_df['hi_transmission_risk'])
transmission_contingency_table

In [None]:
ax = sns.heatmap(transmission_contingency_table, cmap="Blues")
ax.set_title("Restaurant survival status vs. Transmission risk (counts)")

In [None]:
transmission_contingency_norm_columns = pd.crosstab(index=subset_df['is_open'], columns=subset_df['hi_transmission_risk'],
                                          normalize = 'columns') * 100
transmission_contingency_norm_columns

In [None]:
ax = sns.heatmap(transmission_contingency_norm_columns, cmap="Blues")
ax.set_title("Restaurant survival status vs. Transmission risk (normalize by Transmission risk)")

In [None]:
transmission_contingency_norm_index = pd.crosstab(index=subset_df['is_open'], columns=subset_df['hi_transmission_risk'],
                                          normalize = 'index') * 100
transmission_contingency_norm_index

In [None]:
ax = sns.heatmap(transmission_contingency_norm_index, cmap="Blues")
ax.set_title("Restaurant survival status vs. Transmission risk (normalize by survival status)")

In [None]:
# chi-square test of independence on the transmission category counts table;
# element [1] of the result is the p-value, shown to two decimals
transmission_p_value = chi2_contingency(transmission_contingency_table)[1]
round(transmission_p_value, 2)

In [None]:
# create an indicator column
subset_df.loc[subset_df['hi_transmission_risk'] == 'Category of Low Transmission Risk', 'transmission_risk'] = 'Low risk'
subset_df.loc[subset_df['hi_transmission_risk'] != 'Category of Low Transmission Risk', 'transmission_risk'] = 'High risk'

In [None]:
subset_df['transmission_risk'].value_counts()

In [None]:
risk_contingency_table = pd.crosstab(index=subset_df['is_open'], columns=subset_df['transmission_risk'])
risk_contingency_table

In [None]:
ax = sns.heatmap(risk_contingency_table, cmap="Blues")
ax.set_title("Restaurant survival status vs. Transmission risk (counts)")

In [None]:
risk_contingency_norm_columns = pd.crosstab(index=subset_df['is_open'], columns=subset_df['transmission_risk'],
                                          normalize = 'columns') * 100
risk_contingency_norm_columns

In [None]:
ax = sns.heatmap(risk_contingency_norm_columns, cmap="Blues")
ax.set_title("Restaurant survival status vs. Transmission risk (normalize by Transmission risk)")

In [None]:
risk_contingency_norm_index = pd.crosstab(index=subset_df['is_open'], columns=subset_df['transmission_risk'],
                                          normalize = 'index') * 100
risk_contingency_norm_index

In [None]:
ax = sns.heatmap(risk_contingency_norm_index, cmap="Blues")
ax.set_title("Restaurant survival status vs. Transmission risk (normalize by survival status)")

$H_0$ : `is_open` is independent of `transmission_risk`

$H_1$ : `is_open` is not independent of `transmission_risk`

In [None]:
# chi-square test of independence on the low/high risk counts table;
# element [1] of the result is the p-value, shown to two decimals
risk_p_value = chi2_contingency(risk_contingency_table)[1]
round(risk_p_value, 2)

Reject the null hypothesis that restaurant survival status is independent of the transmission risk of its categories.

## Using covid incidence

In [None]:
# load the covid rate file
covid_df = pd.read_csv('covid_incidence_by_city.csv')

In [None]:
median_incidence = np.median(covid_df.incidence)
covid_df.loc[covid_df.incidence <= median_incidence, 'risk'] = 'low'
covid_df.loc[covid_df.incidence > median_incidence, 'risk'] = 'high'

In [None]:
covid_df

In [None]:
cities_hi_covid_incidence = ['ORLANDO', 'BOSTON']

In [None]:
restaurant_df.city.isin(cities_hi_covid_incidence).value_counts()

In [None]:
restaurant_df.city.value_counts() #sanity check

In [None]:
is_high_incidence = restaurant_df.city.isin(cities_hi_covid_incidence)
restaurant_df.loc[is_high_incidence, 'covid_incidence'] = 'High'
restaurant_df.loc[~is_high_incidence, 'covid_incidence'] = 'Low'

In [None]:
restaurant_df.covid_incidence.value_counts(dropna = False)

In [None]:
# create contingency table
covid_incidence_contingency_table = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['covid_incidence'])
covid_incidence_contingency_table

In [None]:
ax = sns.heatmap(covid_incidence_contingency_table, cmap="Blues")
ax.set_title("Restaurant survival status vs. COVID incidence (counts)")
plt.show()

In [None]:
covid_contingency_norm_columns = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['covid_incidence'],
                                          normalize = 'columns') * 100
covid_contingency_norm_columns

In [None]:
ax = sns.heatmap(covid_contingency_norm_columns, cmap="Blues")
ax.set_title("Restaurant survival status vs. COVID incidence (column normalized)")
plt.show()

In [None]:
covid_contingency_norm_index = pd.crosstab(index=restaurant_df['is_open'], columns=restaurant_df['covid_incidence'],
                                          normalize = 'index') * 100
covid_contingency_norm_index

In [None]:
ax = sns.heatmap(covid_contingency_norm_index, cmap="Blues")
ax.set_title("Restaurant survival status vs. COVID incidence (index normalized)")
plt.show()

In [None]:
# chi-square test of independence on the COVID incidence counts table;
# element [1] of the result is the p-value, shown to two decimals
covid_incidence_p_value = chi2_contingency(covid_incidence_contingency_table)[1]
round(covid_incidence_p_value, 2)