In [132]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
plt.style.use('seaborn-notebook')
%matplotlib inline

In [133]:
# import yelp ratings pickle file

In [134]:
df = pd.read_pickle('data/yelp_ratings.pkl')

In [135]:
df.head(15)

Unnamed: 0,0
0,<span>Nojo Ramen Tavern</span>
1,<span>Fog Harbor Fish House</span>
2,<span>Tropisueño</span>
3,<span>Rich Table</span>
4,<span>La Ciccia</span>
5,<span>Straw</span>
6,<span>Monsieur Benjamin</span>
7,<span>Mina’s Brazilian Restaurant</span>
8,<span>Absinthe Brasserie &amp; Bar</span>
9,<span>RM 212</span>


In [136]:
ratings_lst = list(df.iloc[:,0])

In [137]:
len(ratings_lst)

8810

In [138]:
# extract restaurant names first

In [139]:
rnames = []
for line in ratings_lst:
    #print(line[1:5])
    if line[1:5] == 'span':
        name = line.split('<span>')
        name2 = name[1].split('</span>')
        rnames.append(name2[0])

In [140]:
len(rnames)

4405

In [141]:
r_ratings = []
for line in ratings_lst:
    #print(line[1:5])
    if line[1:4] == 'img':
        name = line.split('<img alt="')
        name2 = name[1].split(' star')
        r_ratings.append(name2[0])

In [142]:
len(r_ratings)

4405

In [143]:
r_ratings_f = list(map(float, r_ratings))

In [144]:
# let's join these two lists together

In [145]:
yelp_ratings = dict(zip(rnames, r_ratings_f))

In [146]:
yelp_ratings

{'Nojo Ramen Tavern': 4.0,
 'Fog Harbor Fish House': 4.0,
 'Tropisueño': 4.0,
 'Rich Table': 4.0,
 'La Ciccia': 4.5,
 'Straw': 4.0,
 'Monsieur Benjamin': 4.0,
 'Mina’s Brazilian Restaurant': 4.0,
 'Absinthe Brasserie &amp; Bar': 4.0,
 'RM 212': 4.5,
 'Zuni Café': 4.0,
 'Espetus Brazilian Steak House': 4.0,
 'Lao Table': 4.0,
 'Suppenküche': 4.0,
 'Tsubasa Sushi': 4.5,
 'Kokkari Estiatorio': 4.5,
 'Wayfare Tavern': 4.0,
 'Hops &amp; Hominy': 4.0,
 'Kitchen Istanbul': 4.5,
 'Souvla': 4.0,
 'O-Toro Sushi': 4.0,
 'Barbacco': 4.0,
 'B Star Bar': 4.0,
 'Revelry Bistro': 4.5,
 'a Mano': 4.0,
 'Nightbird': 4.0,
 'Surisan': 4.5,
 'Dobbs Ferry Restaurant': 4.0,
 'Papito Hayes': 4.0,
 'Hakkasan': 4.0,
 'Bella Trattoria': 4.0,
 'Benu': 4.5,
 'Manzoni': 4.0,
 'Le P’tit Laurent': 4.0,
 'Gary Danko': 4.5,
 'Sauce - Belden': 4.0,
 'Cafe Europa': 4.5,
 'Scoma’s Restaurant': 4.0,
 'Chez Maman West': 4.0,
 'Burma Superstar': 4.0,
 'Trestle': 4.5,
 'Camp BBQ': 4.0,
 'My Pot': 4.0,
 'Braised + Bread': 5.0,

In [147]:
len(yelp_ratings)

2891

In [148]:
# match restaurant names and input to ratings column. This process will go after step remove_rows_zero_violation

In [149]:
df_last = pd.read_pickle('data/sf_inspection.pkl')

In [150]:
df_last.head()

Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,business_phone_number,...,94129,94130,94131,94132,94133,94134,94143,94158,95105,p10_36
0,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,San Francisco,CA,94109,37.789784,-122.420455,"(37.789784, -122.420455)",,...,0,0,0,0,0,0,0,0,0,11
1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110,37.759174,-122.419066,"(37.759174, -122.419066)",14155830000.0,...,0,0,0,0,0,0,0,0,0,13
2,79782,Deli 23,2449 23rd St,San Francisco,CA,94110,37.754343,-122.403989,,,...,0,0,0,0,0,0,0,0,0,8
3,73840,L'acajou Bakery and Cafe,498 09th St Ste. C,San Francisco,CA,94103,,,,,...,0,0,0,0,0,0,0,0,0,6
4,76437,Sweetheart Cafe,909 Grant Ave,San Francisco,CA,94108,37.795305,-122.406613,,,...,0,0,0,0,0,0,0,0,0,12


In [151]:
for bname in df_last['business_name']:
    print(bname)

Dar Bar Pakistani/Indian Cusine
DRAGON CITY BAKERY & CAFE
Deli 23
L'acajou Bakery and Cafe
Sweetheart Cafe
Cafe New Honolulu
Aux Delices Vietnam
CAFE PICARO
Jelly Donut
Limon Peruvian Rotisserie
Domino's #7764
Don Pablo
Hawaiian Drive Inn
RED A BAKERY
HAN IL KWAN
CAFE ROCCO
Michael Mina Restaurant
CELIAS RESTAURANT
AT&T - (CART 17) DELI [145105]
Chili House Inc.
Good MongKok Bakery
Sam's Chowder Mobile
Three Star Restaurant
Le Marais Bakery
Super Duper Burgers
Chowders
TONY BALONEY'S
Lers Ros Thai Noodle
Super Duper
Tap (415)
Pasta Moto of San Francisco Centre LLC
Casa Guadalupe #3
Nick's Lighthouse
AT&T -Room 5331 GILROY GARLIC FRIES/view [145161]
AT&T - FIELD CLUB CONCESSION [145185]
BURGERMEISTER
AT&T - DERBY GRILL Room 5311 [145154]
Bow Hon Restaurant
100% Dessert Cafe
Peet's Coffee & Tea
OTD
Carmelina's Taqueria
New Luen Sing Fish Market
Midway Grill (Snack Shop)
Frisco Fried
Chutney USA, Inc.
GreenSurge
New Heng Fung Inc
JADE CAFE
The Olive Garden Italian
Celtic Coffee Company
SH

In [152]:
for yname, rating in yelp_ratings.items():
    for bname in df_last['business_name']:
        yname_c = yname.strip().lower()
        bname_c = bname.strip().lower()
        if yname_c[:10] == bname_c[:10]:
            idx = df_last.loc[df_last['business_name'] == bname].index
            #print(idx)
            df_last.loc[idx, 'rating'] = rating

In [153]:
# let's give un-rated restaurants the average value.

In [154]:
mask = df_last['rating'].isnull()

In [155]:
avg_rating = df_last.loc[~mask, 'rating'].sum()/len(df_last.loc[~mask, 'rating'])

In [158]:
df_last.loc[mask,'rating'] = avg_rating

In [159]:
df_last.to_pickle('data/sf_inspection_w_ratings.pkl')