## Preparation

In [1]:
# Using SQL + Pandas
import pandas as pd
import sqlite3

# Data Visualization
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt

# Ignore warnings
# NOTE(review): with no category or module argument this filter silences every
# warning for the rest of the session, including pandas deprecation and
# chained-assignment warnings that can point at real problems in the cleaning
# steps. Consider narrowing it to the specific categories that are known noise.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the per-city Yelp CSV exports, one DataFrame per city.
# The paths are relative to the notebook's working directory, and the order of
# the paths below must stay aligned with the order of the names on the left.
DC_Yelp, Chicago_Yelp, NY_Yelp, SF_Yelp, LA_Yelp = map(
    pd.read_csv,
    (
        "Data/Yelp/DC_Yelp.csv",
        "Data/Yelp/Chicago_Yelp.csv",
        "Data/Yelp/NY_Yelp.csv",
        "Data/Yelp/SF_Yelp.csv",
        "Data/Yelp/LA_Yelp.csv",
    ),
)

## Clean DC_Yelp

In [3]:
# Preview the first five rows of the DC data as loaded, before any cleaning
DC_Yelp.head(n=5)

Unnamed: 0,business_name,rating,review_count,price_category
0,Crystal City Restaurant Gentleman’s Club,3 star rating,83,"$$American (New), Bars, Strip Clubs"
1,201. Due South,3.5 star rating,548,"$$Southern, Barbeque"
2,202. Shawarma District,4.5 star rating,43,"$Mediterranean, Salad, Wraps"
3,203. Moh Moh Licious,4.5 star rating,78,"$Himalayan/Nepalese, Asian Fusion"
4,204. The Best Sandwich Place,4.5 star rating,56,"$Breakfast & Brunch, Sandwiches, Wraps"


In [4]:
# Show the shape of the data as (rows, columns); this is the row count before
# any cleaning, so it can be compared against the frame after the ad is removed
DC_Yelp.shape

(262, 4)

In [5]:
# Get the repeated ad business: it occupies the first row of the frame and
# recurs further down. `.iloc[0]` makes "first row" positional instead of
# relying on the index still carrying the label 0.
ad = DC_Yelp.business_name.iloc[0]
is_ad = DC_Yelp.business_name == ad

# Remove the repeated ad business, but only when the first name really does
# occur more than once. Without this guard, re-running the cell would take the
# first remaining listing as the "ad" and silently delete it.
# Caveat: a sponsored entry that appears exactly once is left in place.
if is_ad.sum() > 1:
    DC_Yelp = DC_Yelp[~is_ad].reset_index(drop=True)

DC_Yelp

Unnamed: 0,business_name,rating,review_count,price_category
0,201. Due South,3.5 star rating,548,"$$Southern, Barbeque"
1,202. Shawarma District,4.5 star rating,43,"$Mediterranean, Salad, Wraps"
2,203. Moh Moh Licious,4.5 star rating,78,"$Himalayan/Nepalese, Asian Fusion"
3,204. The Best Sandwich Place,4.5 star rating,56,"$Breakfast & Brunch, Sandwiches, Wraps"
4,205. Pho Viet,4 star rating,710,$$Vietnamese
...,...,...,...,...
235,46. Dumplings & Beyond,4 star rating,434,$$Chinese
236,47. Pisco Y Nazca,4.5 star rating,829,"$$Peruvian, Cocktail Bars"
237,48. Bad Saint,4 star rating,605,$$$Filipino
238,49. Busboys and Poets - 450K,4 star rating,2353,"$$American (Traditional), Breakfast & Brunch, ..."


In [6]:
# Split business_name into its two parts. Each listing is stored as
# "<rank>.\xa0<name>" (a period and a non-breaking space after the rank).
# - The dot is escaped because a multi-character pattern is interpreted as a
#   regular expression, where a bare "." would match any character.
# - n=1 splits only at the first separator, so a business name that itself
#   contains a non-breaking space cannot produce extra columns.
rank_and_name = (DC_Yelp
                 .business_name
                 .str
                 .split(r"\.\xa0", n=1, expand=True))

# Attach the rank as a temporary integer 'index' column, order the listings by
# it, then drop the helper column and renumber the rows from 0.
# Built with assign() so the frame displayed by earlier cells is not mutated
# in place.
DC_Yelp = (DC_Yelp
           .assign(index=rank_and_name[0].astype(int),
                   business_name=rank_and_name[1])
           .sort_values('index')
           .drop('index',axis=1)
           .reset_index(drop=True))

DC_Yelp

Unnamed: 0,business_name,rating,review_count,price_category
0,Le Diplomate,4.5 star rating,3596,"$$$Brasseries, French, Breakfast & Brunch"
1,Gypsy Kitchen,4.5 star rating,70,"Tapas/Small Plates, Mediterranean"
2,Butter Me Up,4.5 star rating,103,Breakfast & Brunch
3,The Block,5 star rating,12,"Food Court, Bars, Asian Fusion"
4,The Alibi,4.5 star rating,441,"$$Pubs, Sandwiches, Barbeque"
...,...,...,...,...
235,Logan Tavern,4 star rating,872,"$$American (New), Sports Bars, American (Tradi..."
236,Uzu,4.5 star rating,87,$$Ramen
237,Akira Ramen & Izakaya DC,4.5 star rating,49,$$Ramen
238,City Kabob and Curry House - Washington,4.5 star rating,138,"$$Pakistani, Indian, Halal"
