# Yelp Dataset Analysis: Philadelphia Restaurants

## Data Loading and Preprocessing

I use three parts of the Yelp Open Dataset: the business file `yelp_academic_dataset_business.json`, the check-in file `yelp_academic_dataset_checkin.json`, and the tip file `yelp_academic_dataset_tip.json`. The business file provides each restaurant’s basic information, ratings, review counts, and open status. The check-in file contains timestamped user check-ins, which I convert into a simple count. The tip file contains short user tips, which I summarize into a tip count and an average compliment count.


I merge these three datasets, keep only restaurants, fill missing engagement values with zeros, and standardize ZIP codes and city names. I then merge in ZIP-level income information from `income-zip.csv` and filter the data to restaurants located in Philadelphia. After removing rows without coordinates, I aggregate to the ZIP level to compute restaurant counts, the share still open, and average engagement metrics. I then merge this summary with the ZIP boundary GeoJSON file `zipcodes_poly.geojson` to prepare the data for mapping and save all cleaned tables for later analysis.

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
from pathlib import Path

DATA_DIR = Path("../data")

# Load Yelp business file and keep restaurants
# We keep the following columns from `yelp_academic_dataset_business.json`: 
# `business_id`, `name`, `address`, `city`, `state`, `postal_code`, `latitude`, 
# `longitude`, `stars`, `review_count`, `is_open`, `attributes`, `categories`, `hours`.
biz_path = DATA_DIR / "yelp_academic_dataset_business.json"
biz = pd.read_json(biz_path, lines=True)
biz = biz[biz["categories"].str.contains("Restaurant", na=False)].copy()


# Load Yelp check-in file and summarize
checkin_path = DATA_DIR / "yelp_academic_dataset_checkin.json"
checkin = pd.read_json(checkin_path, lines=True)

# Each row has business_id and a comma-separated list of timestamps in "date"
checkin["checkin_count"] = checkin["date"].apply(
    lambda x: len(x.split(", ")) if isinstance(x, str) else 0
)
checkin = checkin[["business_id", "checkin_count"]]

# Load Yelp tip file and summarize per business
tip_path = DATA_DIR / "yelp_academic_dataset_tip.json"
tip = pd.read_json(tip_path, lines=True)

tip_summary = (
    tip.groupby("business_id")
       .agg(
           tip_count=("text", "count"),
           avg_compliments=("compliment_count", "mean")
       )
       .reset_index()
)


# Merge Yelp business + checkin + tip
yelp = (
    biz.merge(checkin, on="business_id", how="left")
       .merge(tip_summary, on="business_id", how="left")
       .copy()
)

yelp["checkin_count"]   = yelp["checkin_count"].fillna(0).astype(int)
yelp["tip_count"]       = yelp["tip_count"].fillna(0).astype(int)
yelp["avg_compliments"] = yelp["avg_compliments"].fillna(0.0)
yelp["city"] = yelp["city"].astype(str).str.lower()
yelp["postal_code"] = yelp["postal_code"].astype(str).str[:5].str.zfill(5)

# Load Income by ZIP and keep many columns for future use
inc_path = DATA_DIR / "income-zip.csv"  # your CSV from RowZero
income = pd.read_csv(inc_path)


rename_map = {
    "ZIP": "zip",
    "Total population": "population",
    "Households - Median income (dollars)": "median_income",
    "Households - Mean income (dollars)": "mean_income",
}
income = income.rename(columns=rename_map)


income["zip"] = income["zip"].astype(str).str.zfill(5)

# Merge income onto Yelp by ZIP
merged = yelp.merge(income, left_on="postal_code", right_on="zip", how="left")


# Filter to Philadelphia city proper
philly = merged[merged["city"] == "philadelphia"].copy()


philly = philly.dropna(subset=["latitude", "longitude"])
fill_zeros = ["checkin_count", "tip_count", "avg_compliments"]
philly[fill_zeros] = philly[fill_zeros].fillna(0)

if philly["median_income"].isna().any():
    philly["median_income"] = philly["median_income"].fillna(philly["median_income"].median())

# Aggregate to ZIP for choropleths and city summaries
zip_summary = (
    philly.groupby("postal_code")
          .agg(
              n_businesses = ("business_id", "nunique"),
              pct_open     = ("is_open", "mean"),
              avg_stars    = ("stars", "mean"),
              avg_reviews  = ("review_count", "mean"),
              avg_checkins = ("checkin_count", "mean"),
              avg_tips     = ("tip_count", "mean"),
              avg_comps    = ("avg_compliments", "mean"),
              median_income= ("median_income", "mean")
          )
          .reset_index()
)

# Join ZIP polygons for mapping and save reusable files
geo_path = DATA_DIR / "zipcodes_poly.geojson"  # OpenDataPhilly "ZIP Codes – Polygon (GeoJSON)"
geo = gpd.read_file(geo_path)
geo["CODE"] = geo["CODE"].astype(str).str.zfill(5)

merged_geo = geo.merge(zip_summary, left_on="CODE", right_on="postal_code", how="left")

# Save quick artifacts so you can reload without recomputing
DATA_DIR.mkdir(parents=True, exist_ok=True)
philly_out      = DATA_DIR / "philly_yelp_income_clean.csv"
zip_summary_out = DATA_DIR / "philly_zip_summary.csv"
geo_out         = DATA_DIR / "merged_philly.geojson"

philly.to_csv(philly_out, index=False)
zip_summary.to_csv(zip_summary_out, index=False)
merged_geo.to_file(geo_out, driver="GeoJSON")

print("Rows in Yelp restaurants:", len(yelp))
print("Rows in Philadelphia subset:", len(philly))
print("ZIPs in summary:", len(zip_summary))
print("Geo polygons merged:", len(merged_geo))
print("Saved:", philly_out.name, zip_summary_out.name, geo_out.name)

Rows in Yelp restaurants: 52286
Rows in Philadelphia subset: 5856
ZIPs in summary: 57
Geo polygons merged: 48
Saved: philly_yelp_income_clean.csv philly_zip_summary.csv merged_philly.geojson


### Importing the Philadelphia GeoJson files

In [2]:
import geopandas as gpd

geo = gpd.read_file("../data/merged_philly.geojson")
print(geo.columns.tolist())
print(geo.head(3))


['OBJECTID', 'CODE', 'COD', 'Shape__Area', 'Shape__Length', 'postal_code', 'n_businesses', 'pct_open', 'avg_stars', 'avg_reviews', 'avg_checkins', 'avg_tips', 'avg_comps', 'median_income', 'geometry']
   OBJECTID   CODE  COD   Shape__Area  Shape__Length postal_code  \
0         1  19120   20  1.456207e+07   19887.714114       19120   
1         2  19121   21  1.102598e+07   15728.621590       19121   
2         3  19122   22  5.689181e+06    9599.539345       19122   

   n_businesses  pct_open  avg_stars  avg_reviews  avg_checkins  avg_tips  \
0          88.0  0.795455   3.357955    30.170455     50.761364  5.204545   
1          61.0  0.557377   3.204918    35.557377     47.540984  4.786885   
2         103.0  0.669903   3.815534    63.485437     87.009709  7.553398   

   avg_comps  median_income                                           geometry  
0   0.010085        51993.0  POLYGON ((-75.11107 40.04682, -75.10943 40.045...  
1   0.005941        36208.0  POLYGON ((-75.19227 39.994

### **Visualizaiton 1: Mapping Restaurant Survival by ZIP**

Here I load the merged_philly.geojson file and plot a map of Philadelphia ZIP codes. Each area is colored by the share of restaurants that are still open. The tooltip shows the ZIP code, how many restaurants are in that area, and the percent that remain open. This gives a quick spatial view of which parts of the city have higher or lower survival rates.

In [6]:
import altair as alt

geo = alt.Data(
    url='../data/merged_philly.geojson',
    format={'type': 'json', 'property': 'features'}  # <-- important
)

pct_open_map = (
    alt.Chart(geo)
      .mark_geoshape(stroke='white', strokeWidth=0.5)
      .encode(
          color=alt.Color('properties.pct_open:Q', title='Share open',
                          scale=alt.Scale(scheme='greens')),
          tooltip=[
              alt.Tooltip('properties.postal_code:N', title='ZIP'),
              alt.Tooltip('properties.n_businesses:Q', title='# restaurants', format=','),
              alt.Tooltip('properties.pct_open:Q', title='Share open', format='.2f')
          ]
      )
      .properties(title='Philadelphia: Share of Restaurants Still Open by ZIP',
                  width=650, height=450)
      .project('mercator')
)
pct_open_map


### **Visualizaiton 2: Mapping Median Income by ZIP**

I also map median household income using the same ZIP code shapes from `merged_philly.geojson`. Each ZIP is colored by its median income to show how wealth varies across Philadelphia. The tooltip lists the ZIP code and its income value. Having this map next to the restaurant survival map makes it easy to compare economic patterns with where restaurants tend to stay open.

In [7]:
income_map = (
    alt.Chart(geo)
      .mark_geoshape(stroke='white', strokeWidth=0.5)
      .encode(
          color=alt.Color('properties.median_income:Q', title='Median income ($)',
                          scale=alt.Scale(scheme='blues')),
          tooltip=[
              alt.Tooltip('properties.postal_code:N', title='ZIP'),
              alt.Tooltip('properties.median_income:Q', title='Median income', format=',')
          ]
      )
      .properties(title='Philadelphia: Median Household Income by ZIP',
                  width=650, height=450)
      .project('mercator')
)
income_map


### **Visualization 3: Income vs. Restaurant Survival in Philadelphia**

To check whether neighborhood wealth predicts restaurant survival, I plot median household income against the share of restaurants still open.


Each point shows a Philadelphia ZIP code, plotted by its median household income and the share of restaurants that remain open. Point size indicates how many restaurants are in that ZIP. The weak relationship suggests that higher-income areas are not necessarily the ones where restaurants survive at higher rates. I also included a tooltip for more details when you hover your mouse over each point.


 The fitted regression line is almost flat, and the points show a wide amount of scatter. The regression slope is very small and slightly negative:

```
pct_open = -0.000003 * income + 0.8559
```

The correlation is also weak (r = –0.311). Together, these results show that ZIP-level income does not provide a strong explanation for which neighborhoods kept more restaurants open.


In [20]:
income_scatter = (
    alt.Chart(zip_summary)
      .mark_circle()
      .encode(
          x=alt.X('median_income:Q', title='Median Household Income ($)'),
          y=alt.Y('pct_open:Q', title='Share of Restaurants Still Open'),
          size=alt.Size('n_businesses:Q', title='# of Restaurants'),
          tooltip=['postal_code:N','median_income:Q','pct_open:Q','n_businesses:Q']
      )
      .properties(
          title='Income vs Restaurant Survival (ZIP level)',
          width=500,
          height=350
      )
)

income_line = (
    alt.Chart(zip_summary)
      .transform_regression('median_income', 'pct_open')
      .mark_line(color='red', size=2)
      .encode(
          x='median_income:Q',
          y='pct_open:Q'
      )
)

income_scatter + income_line


In [22]:
x = zip_summary["median_income"]
y = zip_summary["pct_open"]

# regression line
slope, intercept = np.polyfit(x, y, 1)

# correlation
corr = x.corr(y)

print("Income vs Survival Regression:")
print(f"  pct_open = {slope:.6f} * income + {intercept:.4f}")
print(f"Correlation r = {corr:.3f}")


Income vs Survival Regression:
  pct_open = -0.000003 * income + 0.8559
Correlation r = -0.311


### **Visualization 4: Check-ins & Review Counts vs Restaurant Survival**

### Check-ins (Customer Activity) vs Restaurant Survival

Check-ins are a proxy for customer foot traffic. If foot traffic helped restaurants survive, we would expect a positive relationship. Instead, the scatterplot shows a slight downward trend as average check-ins increase. The fitted regression line confirms this:

```
pct_open = -0.0009 * check-ins + 0.7673
```


The correlation is modest and negative (r = –0.388). This suggests that ZIPs with higher check-in activity did not necessarily have higher restaurant survival. Some high-activity areas actually have lower survival rates.

### Review Counts (Customer Engagement) vs Restaurant Survival


Review counts reflect how many customers interact with a restaurant on Yelp. Like the check-in results, the scatterplot for reviews shows no upward pattern. The regression slope is again negative:

```
pct_open = -0.0018 * reviews + 0.7878
```


The correlation is similar in size (r = –0.393). This means that ZIPs where restaurants receive more reviews do not show higher survival rates. Higher engagement at the ZIP level does not reliably translate into better outcomes.

In [17]:
import altair as alt
import numpy as np

# ---------- Check-ins ----------
checkins_scatter = (
    alt.Chart(zip_summary)
      .mark_circle()
      .encode(
          x=alt.X('avg_checkins:Q', title='Average Check-ins per Restaurant'),
          y=alt.Y('pct_open:Q', title='Share of Restaurants Still Open'),
          size=alt.Size('n_businesses:Q', title='# of Restaurants'),
          tooltip=['postal_code:N','avg_checkins:Q','pct_open:Q','n_businesses:Q']
      )
)

checkins_line = (
    alt.Chart(zip_summary)
      .transform_regression('avg_checkins', 'pct_open')
      .mark_line(color='red', size=2)
      .encode(
          x='avg_checkins:Q',
          y='pct_open:Q'
      )
)

checkins_chart = (checkins_scatter + checkins_line).properties(
    title='Check-ins vs Restaurant Survival (ZIP level)',
    width=400, height=300
)

# ---------- Reviews ----------
reviews_scatter = (
    alt.Chart(zip_summary)
      .mark_circle()
      .encode(
          x=alt.X('avg_reviews:Q', title='Average Review Count'),
          y=alt.Y('pct_open:Q', title='Share of Restaurants Still Open'),
          size=alt.Size('n_businesses:Q', title='# of Restaurants'),
          tooltip=['postal_code:N','avg_reviews:Q','pct_open:Q','n_businesses:Q']
      )
)

reviews_line = (
    alt.Chart(zip_summary)
      .transform_regression('avg_reviews', 'pct_open')
      .mark_line(color='red', size=2)
      .encode(
          x='avg_reviews:Q',
          y='pct_open:Q'
      )
)

reviews_chart = (reviews_scatter + reviews_line).properties(
    title='Review Counts vs Restaurant Survival (ZIP level)',
    width=400, height=300
)

# Side by side
checkins_chart | reviews_chart


In [None]:
def summarize_linear(x, y, label):
    slope, intercept = np.polyfit(x, y, 1)
    corr = x.corr(y)
    print(f"{label}:")
    print(f"  pct_open = {slope:.4f} * x + {intercept:.4f}")
    print(f"  correlation r = {corr:.3f}")
    print()

summarize_linear(zip_summary["avg_checkins"], zip_summary["pct_open"],
                 "Average check-ins")
summarize_linear(zip_summary["avg_reviews"], zip_summary["pct_open"],
                 "Average reviews")


Average check-ins:
  pct_open = -0.0009 * x + 0.7673
  correlation r = -0.388

Average reviews:
  pct_open = -0.0018 * x + 0.7878
  correlation r = -0.393



### Summary of ZIP-level Relationships

Across income, check-ins, and review counts, all three ZIP-level regressions show weak negative trends and modest negative correlations. None of these variables strongly predicts which neighborhoods kept more restaurants open. This suggests that ZIP-level characteristics may not capture the main drivers of restaurant survival, and that restaurant-level factors or category-level differences may explain more of the variation.

In [25]:
import geopandas as gpd

# load merged ZIP GeoJSON
gdf = gpd.read_file("../data/merged_philly.geojson")

# project to a metric CRS so areas are meaningful
gdf_proj = gdf.to_crs(epsg=3857)   # Web Mercator; good enough for km²

# area in square kilometers
gdf_proj["area_km2"] = gdf_proj.geometry.area / 1e6

# restaurant density
gdf_proj["restaurants_per_km2"] = gdf_proj["n_businesses"] / gdf_proj["area_km2"]

# keep a plain DataFrame version for Altair scatter
zip_density = gdf_proj[["postal_code", "n_businesses", "pct_open",
                        "area_km2", "restaurants_per_km2"]].copy()


### **Visualization 5: Restaurant Density as a Possible Factor**

Restaurant density might matter for survival. There are two different ways density could affect the outcome. High density could increase competition and make it harder for weaker restaurants to stay open. High density could also signal a destination dining area where restaurants benefit from foot traffic and variety. Because these forces point in opposite directions, I look at both restaurant count and restaurant density by ZIP.


I plot these two measures against the share of restaurants still open and use a LOESS smoother to look for any clear pattern. The LOESS curve helps show the shape of the relationship without assuming it is linear.

In [28]:
geo_density = alt.Data(
    url="../data/merged_philly.geojson",
    format={"type": "json", "property": "features"}
)

density_map = (
    alt.Chart(geo_density)
      .mark_geoshape(stroke="white", strokeWidth=0.5)
      .encode(
          color=alt.Color("properties.n_businesses:Q",
                          title="# of restaurants",
                          scale=alt.Scale(scheme="blues")),
          tooltip=[
              alt.Tooltip("properties.postal_code:N", title="ZIP"),
              alt.Tooltip("properties.n_businesses:Q",
                          title="# restaurants", format=",")
          ]
      )
      .properties(
          title="Number of restaurants by ZIP in Philadelphia",
          width=650,
          height=450
      )
      .project("mercator")
)

density_map


In [35]:
# Restaurant count vs survival
scatter_n = (
    alt.Chart(zip_density)
      .mark_circle()
      .encode(
          x=alt.X("n_businesses:Q", title="# of restaurants in ZIP"),
          y=alt.Y("pct_open:Q", title="Share of restaurants still open"),
          tooltip=["postal_code:N", "n_businesses:Q", "pct_open:Q"]
      )
      .properties(
          width=400,
          height=300
      )
)

loess_counts = (
    alt.Chart(zip_density)
      .transform_loess('n_businesses', 'pct_open', bandwidth=0.5)
      .mark_line(color='red', size=2)
      .encode(x='n_businesses:Q', y='pct_open:Q')
)

(scatter_n + loess_counts)


chart_counts = (
    (scatter_n + loess_counts)
    .properties(title="Restaurant count vs survival (ZIP level)")
)


# Restaurant density vs survival
scatter_density = (
    alt.Chart(zip_density)
      .mark_circle()
      .encode(
          x=alt.X("restaurants_per_km2:Q",
                  title="Restaurants per km²"),
          y=alt.Y("pct_open:Q", title="Share of restaurants still open"),
          tooltip=["postal_code:N", "restaurants_per_km2:Q", "pct_open:Q"]
      )
      .properties(
          width=400,
          height=300
      )
)

loess_density = (
    alt.Chart(zip_density)
      .transform_loess('restaurants_per_km2', 'pct_open', bandwidth=0.5)
      .mark_line(color='red', size=2)
      .encode(x='restaurants_per_km2:Q', y='pct_open:Q')
)

(scatter_density + loess_density)


chart_density = (
    (scatter_density + loess_density)
    .properties(title="Restaurant density vs survival (ZIP level)")
)


# Side by side
chart_counts | chart_density

**What the Density Plots Show**

There is a small relationship between density and survival, but it only appears at the low end of the distribution. ZIPs with fewer restaurants or lower density tend to have higher survival rates. Once ZIPs become moderately or highly dense, the pattern largely disappears. The LOESS line flattens, and dense areas all cluster around similar mid-range survival values. This means density has a limited effect. It helps explain why some low-density neighborhoods have strong survival, but it does not differentiate outcomes among the denser ZIP codes.

### Why Categories and Chain Status Matter

We later examine restaurant categories and chain statuses. 


Restaurant type is one of the strongest predictors of whether a business survives. Different kinds of restaurants face very different cost structures, customer expectations, and levels of competition. For example, pizza and Mexican restaurants often have stable demand and lower marginal costs, while full-service American restaurants may rely more on discretionary spending and face higher labor requirements.

Chain affiliation also plays an important role. Chains can buffer short-term shocks through corporate resources, brand recognition, and standardized operations. Independent restaurants, in contrast, may depend heavily on neighborhood foot traffic and local customer loyalty. Looking at both category and chain status helps us understand what types of restaurants are structurally more resilient.

In [None]:
# Category: Simplify categories into broader groups

def simplify_category(cat_string):
    """
    Simplify a Yelp categories string into a broader category.
    """
    if pd.isna(cat_string):
        return "Other"
    
    cat_string = cat_string.lower()
    
    if "pizza" in cat_string:
        return "Pizza"
    if "chinese" in cat_string:
        return "Chinese"
    if "mexican" in cat_string:
        return "Mexican"
    if "coffee" in cat_string or "cafe" in cat_string:
        return "Coffee & Tea"
    if "bar" in cat_string or "pub" in cat_string:
        return "Bars & Pubs"
    if "fast food" in cat_string or "burgers" in cat_string:
        return "Fast Food"
    if "american" in cat_string:
        return "American"
    
    return "Other"

philly["category_simple"] = philly["categories"].apply(simplify_category)

# Category summary
cat_summary = (
    philly.groupby("category_simple")
          .agg(
              pct_open=("is_open", "mean"),
              n=("business_id", "count")
          )
          .reset_index()
)

In [52]:
# Chain: identify chains by duplicated names
philly["is_chain"] = philly["name"].duplicated(keep=False)
chain_summary = (
    philly.groupby("is_chain")
          .agg(
              pct_open=("is_open", "mean"),
              n=("business_id", "count")
          )
          .reset_index()
)

chain_summary["chain_label"] = chain_summary["is_chain"].map({True: "Chain", False: "Independent"})


Before plotting, I summarize survival rates by category and by chain status and keep
track of the number of restaurants in each group. The tables below report the count
`n` and the share of restaurants still open, so the plotted differences are based on
reasonably large groups rather than a few outliers.


In [None]:
# Category summary table (sorted by sample size)
category_summary_display = (
    cat_summary[["category_simple", "n", "pct_open"]]
    .sort_values("n", ascending=False)
)

category_summary_display


Unnamed: 0,category_simple,n,pct_open
6,Other,1679,0.574151
1,Bars & Pubs,1124,0.588968
7,Pizza,800,0.66125
3,Coffee & Tea,701,0.629101
2,Chinese,467,0.635974
0,American,450,0.506667
4,Fast Food,335,0.626866
5,Mexican,300,0.656667


In [58]:
# Chain vs independent summary table
chain_summary_display = chain_summary.copy()
chain_summary_display

Unnamed: 0,is_chain,pct_open,n,chain_label
0,False,0.586482,4764,Independent
1,True,0.672161,1092,Chain


In [40]:
cat_chart = (
    alt.Chart(cat_summary)
      .mark_bar()
      .encode(
          x=alt.X("pct_open:Q", title="Share of Restaurants Still Open"),
          y=alt.Y("category_simple:N", sort="-x", title="Category"),
          color=alt.Color("pct_open:Q", scale=alt.Scale(scheme="greens")),
          tooltip=["category_simple:N", "pct_open:Q", "n:Q"]
      )
      .properties(
          title="Restaurant Survival by Category (Philadelphia)",
          width=450,
          height=300
      )
)

cat_chart


In [42]:
chain_chart = (
    alt.Chart(chain_summary)
      .mark_bar()
      .encode(
          x=alt.X("pct_open:Q", title="Share Open"),
          y=alt.Y("chain_label:N", sort="-x", title="Restaurant Type"),
          color=alt.Color("pct_open:Q", scale=alt.Scale(scheme="blues")),
          tooltip=["chain_label:N", "pct_open:Q", "n:Q"]
      )
      .properties(
          title="Survival Rates: Chains vs Independents",
          width=350,
          height=200
      )
)

chain_chart
