In [265]:
import pandas as pd
import pickle
import re
# ^^^ pyforest auto-imports - don't write above this line


# Imports

## Libraries
Now sure if pyforest would import them properly

In [1064]:
import time
import requests
import folium
import unidecode

## Demonym Dictionary

In [1065]:
with open("demonym_dictionary.pickle", "rb+") as f:
    demonym_dictionary = pickle.load(f)

# Adding City (Restaurant Inspection) Data
cities with most mexican immigrants in us: https://247wallst.com/economy/2017/01/27/us-cities-with-the-most-mexican-immigrants/

## NYC

In [1066]:
rest_insp = pd.read_csv("./food_inspections_data/DOHMH_New_York_City_Restaurant_Inspection_Results.csv")

In [1067]:
rest_insp.columns

Index(['CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE', 'PHONE',
       'CUISINE DESCRIPTION', 'INSPECTION DATE', 'ACTION', 'VIOLATION CODE',
       'VIOLATION DESCRIPTION', 'CRITICAL FLAG', 'SCORE', 'GRADE',
       'GRADE DATE', 'RECORD DATE', 'INSPECTION TYPE', 'Latitude', 'Longitude',
       'Community Board', 'Council District', 'Census Tract', 'BIN', 'BBL',
       'NTA'],
      dtype='object')

In [1068]:
rest_insp['CUISINE DESCRIPTION'].nunique()

84

In [1069]:
# rest_insp['CUISINE DESCRIPTION'].unique()

In [1070]:
# tex-mex, mexican, latin
mex_rest_insp = rest_insp[rest_insp['CUISINE DESCRIPTION'] == 'Mexican']

In [1071]:
mex_rest_insp.shape

(16656, 26)

In [1072]:
mex_rest_insp.head(2)

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,...,RECORD DATE,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
24,41571476,NATALIA BAR,Queens,62-10,39 AVENUE,11377.0,3477065942,Mexican,05/04/2018,Violations were cited in the following area(s).,...,06/14/2020,Cycle Inspection / Re-inspection,40.746844,-73.901406,402.0,26.0,26100.0,4028392.0,4012320000.0,QN63
27,40996642,RECUERDOS MEXICANOS,Staten Island,232,PORT RICHMOND AVENUE,10302.0,7188155533,Mexican,10/22/2019,Violations were cited in the following area(s).,...,06/14/2020,Cycle Inspection / Initial Inspection,40.635784,-74.134935,501.0,49.0,20700.0,5025513.0,5010820000.0,SI28


### Removing duplicate restaurants from nyc mexican restaurants df

In [1073]:
unique_nyc_mex = mex_rest_insp.drop_duplicates(subset=['DBA'], keep='first')

In [1074]:
unique_nyc_mex.reset_index(inplace=True, drop=True)

In [1075]:
unique_nyc_mex.shape

(849, 26)

In [1076]:
# a 95% reduction of the df!
unique_nyc_mex.shape[0] / mex_rest_insp.shape[0]

0.050972622478386166

In [1077]:
# list of restaurant names to compare to values in demonym_dictionary
nyc_rest_insp_names = [x.lower() for x in unique_nyc_mex['DBA']]

### Getting matches 

In [1078]:
rest_matches_nyc = []
# list of not-quite-real matches
bad_matches = ['mexican', 'tequila', 'margarita', 'margaritas', 'mexico', 'pedro', 'azteca', 'paraiso'
'mexico','rodeo','rio','maria','mexicanos','coyote','marcos','mama','bravo','viejo','perla','nuevo'
'verde', 'gonzalez', 'corona', 'armadillo', 'arriba', 'palmas', 'delicias', 'blanco', 'crespo','tortuga']
for item in nyc_rest_insp_names:
    split_item = item.split()
    for word in split_item:
        for key, value in demonym_dictionary.items():
            for city in value:
                if word == city:
                    if word not in bad_matches: # if not, the number of results is tripled
                        rest_matches_nyc.append(item)

In [1079]:
len(rest_matches_nyc)

259

In [1080]:
rest_matches_nyc

['puebla seafood',
 'tacos puebla restaurant',
 'guadalajara de dia no.  2',
 'guadalajara de dia no.  2',
 'el rincon grill & bar',
 'el rincon grill & bar',
 'el rincon grill & bar',
 'nuevo mexico mexican restaurant',
 'paraiso azteca restaurant',
 'paraiso azteca restaurant',
 'atlixco bakery & deli',
 'el tenampa restaurant',
 'el tenampa restaurant',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'guadalupe inn',
 'noches de palenque',
 'noches de palenque',
 'mi pequena cholula deli restaurante',
 'mi pequena cholula deli restaurante',
 'piaxtla es mexico deli',
 'sonora',
 'tulcingo restaurant',
 'mama  puebla',
 'estrellita poblana # 1',
 'atla',
 'el barril restaurant',
 'oaxaca taqueria',
 'el tapatio mexican restaurant',
 'cuautla morelos restaurant',
 'cuautla morelos re

In [1081]:
unique_rest_matches_nyc = set(rest_matches_nyc)

In [1082]:
# was 44 before additions to demonym dict ... was 169 before word corrections
len(unique_rest_matches_nyc)

111

### Getting Regions for Matches

In [1083]:
restaurant_matches_nyc = {key: None for key in set(unique_rest_matches_nyc)}

In [1084]:
for key, value in restaurant_matches_nyc.items():
    split_item = key.split()
    for word in split_item:
        for key1, value1 in demonym_dictionary.items():
            for city in value1:
                if word == city:
                    value_info = [key1, city]
                    restaurant_matches_nyc[key] = value_info

In [1085]:
len(restaurant_matches_nyc)

111

#### Removing all entries that passed filter incorrectly

In [1086]:
bad_keys = []
bad_matches = ['mexican','mexico','mexicanos','azteca','esperanza','estrada','esperanzas','salero','pinos',
                  'maria', 'bravo', 'nuevo', 'progreso', 'delicias', 'comales', 'palmas', 'palenque', 'concordia',
                 'china', 'gym', 'paloma', 'rio', 'mex', 'tequila', 'colorado', 'ventana', 'lom', 'garcia', 'paz',
                 'chavez', 'paraiso', 'senor', 'oriental', 'fronteras', 'tap', 'aca', 'purisima', 'rodriguez',
                 'hernandez', 'sanchez', 'victoria', 'oasis', 'cash', 'pinas', 'yaa', 'tap', 'meson','agustin',
                 'agustin', 'limon', 'alamo', 'slp', 'providencia', 'reyes', 'lom', 'verde', 'perla', 'madrid', 
                 'delta', 'mama', 'lopez', 'honey', 'laurel', 'california pizza kitchen', 'sauces', 'laguna', 
              'dolores', 'presidio', 'ver', 'bernal', 'rincon', 'marin', 'palma', 'potrero', 'mid', 'valencia',
              'aura', 'kava', 'pueblito', 'castillo', 'tam', 'marcos', 'montecristo', 'tinajas', 'alvarado',
              'porvenir', 'nieves', 'mina', 'marin', "alamos", 'reforma', 'jal', 'margaritas', 'california']
for key, value in restaurant_matches_nyc.items():
    if value[1] in bad_matches:
        bad_keys.append(key)

In [1087]:
len(bad_keys)

25

In [1088]:
for x in bad_keys:
    if restaurant_matches_nyc.get(x) != None:
        del restaurant_matches_nyc[x]

In [1089]:
len(restaurant_matches_nyc)

86

In [1090]:
list(restaurant_matches_nyc.items())[:2]

[('estrellita poblana', ['Puebla', 'poblana']),
 ('estrellita poblana taqueria express', ['Puebla', 'poblana'])]

### Getting coordinates for each restaurant

In [1091]:
region_in_name = []
for x in unique_nyc_mex['DBA']:
    if x.lower() in restaurant_matches_nyc.keys():
        region_in_name.append(1)
    else:
        region_in_name.append(0)

In [1092]:
unique_nyc_mex['region_in_name'] = region_in_name

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [1093]:
nyc_mex_rest_w_region = unique_nyc_mex[unique_nyc_mex['region_in_name'] == 1]

In [1094]:
nyc_mex_rest_w_region.shape

(86, 27)

In [1095]:
nyc_mex_rest_w_region.DBA.nunique()

86

In [1096]:
nyc_mex_rest_w_region.reset_index(inplace=True, drop=True)

In [1097]:
nyc_mex_rest_w_region.head(2)

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,...,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA,region_in_name
0,40909425,PUEBLA SEAFOOD,Queens,95-27,ROOSEVELT AVENUE,11372.0,7186723556,Mexican,04/19/2019,Violations were cited in the following area(s).,...,Cycle Inspection / Initial Inspection,40.749015,-73.870844,403.0,21.0,27300.0,4036619.0,4014830000.0,QN28,1
1,41292582,TACOS PUEBLA RESTAURANT,Bronx,2181,GRAND CONCOURSE,10453.0,7182205463,Mexican,08/13/2019,Violations were cited in the following area(s).,...,Cycle Inspection / Re-inspection,40.85533,-73.901288,205.0,14.0,23704.0,2013760.0,2031620000.0,BX40,1


### Count of regions represented in nyc mexican restaurant names

In [1098]:
regions_represented = []
for key, value in restaurant_matches_nyc.items():
    if value[0] not in regions_represented:
        regions_represented.append(value[0])

In [1099]:
regions_represented[0:3]

['Puebla', 'Yucatan', 'Guerrero']

In [1100]:
region_list = []
for x in nyc_mex_rest_w_region['DBA']:
    region_list.append(restaurant_matches_nyc[x.lower()][0])

In [1101]:
len(region_list)

86

In [1102]:
nyc_mex_rest_w_region['region'] = region_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [1103]:
# i get a lot of repeats with this method 
nyc_mex_rest_w_region['region'].value_counts()

Puebla                 32
Yucatan                11
Baja California         6
Chihuahua               6
Oaxaca                  4
Morelos                 4
Veracruz                3
Tamaulipas              3
Guerrero                2
Quintana Roo            2
Jalisco                 2
Hidalgo                 2
Mexico                  1
Tlaxcala                1
Sinaloa                 1
Aguascalientes          1
Durango                 1
Tabasco                 1
Sonora                  1
Colima                  1
Baja California Sur     1
Name: region, dtype: int64

### Getting region counts as a percentage

In [1104]:
nyc_region_counts_series = nyc_mex_rest_w_region['region'].value_counts()

In [1105]:
nyc_region_counts_dict = nyc_region_counts_series.to_dict()

In [1106]:
type(nyc_region_counts_dict)

dict

In [1107]:
nyc_region_counts_dict_pct = {key: round(value/len(region_list), 2) 
                              for key, value in nyc_region_counts_dict.items()}

In [1108]:
list(nyc_region_counts_dict_pct.items())[0:5]

[('Puebla', 0.37),
 ('Yucatan', 0.13),
 ('Baja California', 0.07),
 ('Chihuahua', 0.07),
 ('Oaxaca', 0.05)]

### Getting Values that matches

In [1109]:
nyc_values = {value[1]:0 for key, value in restaurant_matches_nyc.items()}

In [1110]:
for key, value in restaurant_matches_nyc.items():
    if value[1] in nyc_values.keys():
        nyc_values[value[1]] += 1

In [1111]:
# https://careerkarma.com/blog/python-sort-a-dictionary-by-value/
sort_values = sorted(nyc_values.items(), key=lambda x: x[1], reverse=True)
for i in sort_values[0:5]:
    print(i[0], i[1])

puebla 9
poblana 5
tulcingo 5
guadalupe 4
morelos 4


### Adding to base_map

In [1112]:
base_lat = 40.7128
base_long = -74.0060
base_map = folium.Map([base_lat, base_long], zoom_start=10, tiles="cartodbpositron")

In [1113]:
len(regions_represented)

21

In [1114]:
# a color for each region
color_list = ["red" 'blue', 'green', 'purple', 'orange', 'darkred',
'lightred', 'beige', 'darkblue', 'darkgreen', 'cadetblue', 'darkpurple']
#white’, ‘pink’, ‘lightblue’, ‘lightgreen’, ‘gray’, ‘black’, ‘lightgray’]
len(color_list)

11

In [1117]:
for x in range(nyc_mex_rest_w_region.shape[0]):
    rest_name = nyc_mex_rest_w_region['DBA'][x].lower()
    region_city_name = restaurant_matches_nyc[rest_name]
    color_list_index = regions_represented.index(region_city_name[0])
    if color_list_index >= 10: # to account for the most represented regions and use the colors available
        color_list_index = 10
    folium.Marker(
            location = (nyc_mex_rest_w_region['Latitude'][x], nyc_mex_rest_w_region['Longitude'][x]),
            popup = f"name '{rest_name}', region '{region_city_name[0]}'",
            icon=folium.Icon(color=color_list[color_list_index])
        ).add_to(base_map)

  # Remove the CWD from sys.path while we load stuff.


In [1118]:
base_map

### Saving the map

In [1119]:
# base_map.save("mexican_restaurants_in_new_york_city_with_region_in_name.html")

### Map Legend

In [1120]:
region_legend = []
for key, value in restaurant_matches_nyc.items():
    if value[0] not in region_legend:
        region_legend.append(value[0])

In [1121]:
full_legend = list(zip(region_legend, color_list[:-1]))

In [1122]:
# all other regions are "darkpurple"
full_legend

[('Puebla', 'redblue'),
 ('Yucatan', 'green'),
 ('Guerrero', 'purple'),
 ('Tabasco', 'orange'),
 ('Chihuahua', 'darkred'),
 ('Hidalgo', 'lightred'),
 ('Sonora', 'beige'),
 ('Morelos', 'darkblue'),
 ('Baja California', 'darkgreen'),
 ('Tlaxcala', 'cadetblue')]

### Turning dictionary to df

In [1123]:
rest_df = pd.DataFrame.from_dict(restaurant_matches_nyc, orient="index")

In [1124]:
rest_df.reset_index(inplace=True)

In [1125]:
rest_df.columns = ['Restaurant_name', 'Region', 'Matching_word_w_region']

In [1380]:
rest_df.head()

Unnamed: 0_level_0,Region,Match,City
Restaurant_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
estrellita poblana,Puebla,poblana,NYC
estrellita poblana taqueria express,Puebla,poblana,NYC
coszcal de allende restaurant,Yucatan,allende,NYC
chilpancingo restaurant,Guerrero,chilpancingo,NYC
mama puebla,Puebla,puebla,NYC


In [1379]:
rest_df.columns

Index(['Region', 'Match', 'City'], dtype='object')

#### Checking to see if "Tulcingo Deli Grocery" is in database

In [1128]:
'TULCINGO' in list(nyc_mex_rest_w_region['DBA'].unique())

True

In [1383]:
tulcingo_count = 0
for x in rest_df.Match:
    if x.lower() == 'tulcingo':
        tulcingo_count += 1

In [1384]:
tulcingo_count

5

## LA

In [1130]:
LA_data = pd.read_csv("./food_inspections_data/Map_of_Restaurants_LA.csv")

In [1131]:
LA_data.shape

(7017, 16)

In [1132]:
LA_data.head(2)

Unnamed: 0,LOCATION ACCOUNT #,BUSINESS NAME,DBA NAME,STREET ADDRESS,CITY,ZIP CODE,LOCATION DESCRIPTION,MAILING ADDRESS,MAILING CITY,MAILING ZIP CODE,NAICS,PRIMARY NAICS DESCRIPTION,COUNCIL DISTRICT,LOCATION START DATE,LOCATION END DATE,LOCATION
0,0003030535-0001-6,"NIGHT MARKET 3, INC.",,2533 LINCOLN BLVD,VENICE,90291-5042,2533 LINCOLN 90291,2533 LINCOLN BLVD,VENICE,90291-5042,722110,Full-service restaurants,11,02/01/2018,,"(33.9915, -118.449)"
1,0002893692-0002-0,NORMA L DIAZ SANCHEZ,PLAYAS EL SALVADOR RESRAURANTE,4052 1/2 S CENTRAL AVENUE,LOS ANGELES,90011-2866,4052 Central 90011-2866,,,,722110,Full-service restaurants,9,05/19/2020,,"(34.0101, -118.2563)"


In [1133]:
LA_data['LOCATION'].fillna("None", inplace=True)

In [1134]:
LA_data['LOCATION'].dtype

dtype('O')

### Making sure that there aren't any duplicate restaurants

In [1135]:
len(LA_data['BUSINESS NAME']) == len(set(LA_data['BUSINESS NAME']))

False

In [1136]:
len(LA_data['BUSINESS NAME'])

7017

In [1137]:
len(set(LA_data['BUSINESS NAME']))

6125

In [1138]:
unique_LA_data = LA_data.drop_duplicates(subset=['DBA NAME'], keep='first')

### Adding each name to a new column in LA_data

In [1139]:
possible_biz_names = list(zip(unique_LA_data['BUSINESS NAME'],unique_LA_data['DBA NAME']))

In [1140]:
list_possible_biz_names = []
for x in possible_biz_names:
    y = list(x)
    if np.nan in y:
        y.remove(np.nan)
    lower_item_list = [item.lower() for item in y]        
    list_possible_biz_names.append(lower_item_list)

In [1141]:
unique_LA_data['biz_names'] = list_possible_biz_names

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### Getting Matches

In [1142]:
unique_LA_data['biz_names'][0:10]

0                                [night market 3, inc.]
1     [norma l diaz sanchez, playas el salvador resr...
3         [bombay curry inc, bombay curry and pizzeria]
4     [t & m finatic enterprise inc, sharky's fresh ...
5          [la restaurant hope street inc, the counter]
7     [sartori thai cuisine, inc., sartori thai cuis...
9                [jm brothers inc, kalbis korean grill]
10           [concordia development inc, subway #14900]
13    [thistle and thorn hospitality inc, the whale ...
14             [bibi trading inc, biriyani kabob house]
Name: biz_names, dtype: object

In [1143]:
len(unique_LA_data['biz_names'][1])

2

In [1144]:
rest_matches_la = []
# list of not-quite-real matches
bad_matches = ['mexican', 'tequila', 'margarita', 'margaritas', 'mexico', 'pedro', 'azteca', 'paraiso'
'mexico','rodeo','rio','maria','mexicanos','coyote','marcos','mama','bravo','viejo','perla','nuevo'
'verde', 'gonzalez', 'corona', 'armadillo', 'arriba', 'palmas', 'delicias', 'blanco', 'crespo','tortuga']
for item in unique_LA_data['biz_names']:
    for word in item:
        for key, value in demonym_dictionary.items():
            for city in value:
                if word == city:
                    if word not in bad_matches: # if not, the number of results is tripled
                        rest_matches_la.append(item)

In [1145]:
unique_rest_matches_la = []
for x in rest_matches_la:
    if x not in unique_rest_matches_la:
        unique_rest_matches_la.append(x)

In [1146]:
len(unique_rest_matches_la)

12

In [1147]:
unique_rest_matches_la

[['luis e ponce', 'los pilares'],
 ['rio nuevo, llc', 'salazar'],
 ['toshihiko t hoshi', 'toshi'],
 ['genoveva/guillermo padilla', 'la barca'],
 ['jesus del rio', 'el ranchito'],
 ['boca del rio inc', 'boca del rio'],
 ['fernando pena duenas', 'el caracol'],
 ['angel r montes', 'san marcos'],
 ['manuel gonzalez', 'manny mobile detailing'],
 ['tkla group inc', 'tekila'],
 ['pico productions', 'el cid'],
 ['holbox restaurant group inc', 'holbox']]

In [1148]:
# most of this data looks like it's incorrectly matching so I will not use LA's data

## Chicago 

In [1294]:
chi_data = pd.read_csv("./food_inspections_data/Food_Inspections_chicago.csv")

In [1295]:
chi_data.shape

(206916, 17)

In [1296]:
chi_data.head(2)

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2373714,CHIYA CHAI CAFE,CHIYA CHAI CAFE,2432644.0,Restaurant,Risk 1 (High),2770 N MILWAUKEE AVE,CHICAGO,IL,60647.0,06/15/2020,Canvass Re-Inspection,Pass,,41.931449,-87.711547,"(-87.71154708820332, 41.93144884388019)"
1,2373680,TROPI CUBA,TROPI CUBA,1422721.0,Restaurant,Risk 1 (High),3000 W LYNDALE ST,CHICAGO,IL,60647.0,06/12/2020,Canvass Re-Inspection,Pass,47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE...,41.922493,-87.702359,"(-87.70235877406888, 41.92249266994513)"


In [1297]:
chi_rest = chi_data[chi_data['Facility Type'] == 'Restaurant']

In [1298]:
chi_rest.shape

(137853, 17)

In [1299]:
# reduce the size of df by 33% 
chi_rest.shape[0] / chi_data.shape[0] 

0.6662268746737806

In [1300]:
chi_rest.head(1)

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2373714,CHIYA CHAI CAFE,CHIYA CHAI CAFE,2432644.0,Restaurant,Risk 1 (High),2770 N MILWAUKEE AVE,CHICAGO,IL,60647.0,06/15/2020,Canvass Re-Inspection,Pass,,41.931449,-87.711547,"(-87.71154708820332, 41.93144884388019)"


### Getting Names of restaurants

In [1301]:
possible_chi_biz_names = list(zip(chi_rest['DBA Name'], chi_rest['AKA Name']))

In [1302]:
chi_biz_names = []
for x in possible_chi_biz_names:
    x_list = list(x)
    if x_list[0] == x_list[1]:
        del x_list[1]
    elif np.nan in x_list:
        x_list.remove(np.nan)
    lower_possible_chi_biz_names = [item.lower() for item in x_list]
    chi_biz_names.append(lower_possible_chi_biz_names)

In [1303]:
chi_biz_names[0:3]

[['chiya chai cafe'], ['tropi cuba'], ['china gourmet']]

In [1304]:
chi_rest['biz_names'] = chi_biz_names

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### Getting Matches with Dem Dict

In [1305]:
rest_matches_chi = []
# list of not-quite-real matches
bad_matches = ['mexican','mexico','mexicanos','azteca','esperanza','estrada','esperanzas','salero','pinos',
                  'maria', 'bravo', 'nuevo', 'progreso', 'delicias', 'comales', 'palmas', 'palenque', 'concordia',
                 'china', 'gym', 'paloma', 'rio', 'mex', 'tequila', 'colorado', 'ventana', 'lom', 'garcia', 'paz',
                 'chavez', 'paraiso', 'senor', 'oriental', 'fronteras', 'tap', 'aca', 'purisima', 'rodriguez',
                 'hernandez', 'sanchez', 'victoria', 'oasis', 'cash', 'pinas', 'yaa', 'tap', 'meson','agustin',
                 'agustin', 'limon', 'alamo', 'slp', 'providencia', 'reyes', 'lom', 'verde', 'perla', 'madrid', 
                 'delta', 'mama', 'lopez', 'honey', 'laurel', 'california pizza kitchen', 'sauces', 'laguna', 
              'dolores', 'presidio', 'ver', 'bernal', 'rincon', 'marin', 'palma', 'potrero', 'mid', 'valencia',
              'aura', 'kava', 'pueblito', 'castillo', 'tam', 'marcos', 'montecristo', 'tinajas', 'alvarado',
              'porvenir', 'nieves', 'mina', 'marin']
for item in chi_rest['biz_names']:
    for word in item:
        for key, value in demonym_dictionary.items():
            for city in value:
                if word == city:
                    if word not in bad_matches: # if not, the number of results is tripled
                        rest_matches_chi.append(item)

In [1306]:
len(rest_matches_chi)

765

In [1307]:
rest_matches_chi[0:10]

[['los comales'],
 ['la amistad inc.', 'la amistad'],
 ['las palmas'],
 ['las palmas'],
 ['la fuente'],
 ['clark el ranchito corp', 'el ranchito'],
 ['clark el ranchito corp', 'el ranchito'],
 ['el milagro'],
 ['los mangos'],
 ['los mangos']]

In [1308]:
unique_rest_matches_chi = []
for x in rest_matches_chi:
    if x not in unique_rest_matches_chi:
        unique_rest_matches_chi.append(x)

In [1309]:
len(unique_rest_matches_chi)

43

In [1310]:
unique_rest_matches_chi[0:10]

[['los comales'],
 ['la amistad inc.', 'la amistad'],
 ['las palmas'],
 ['la fuente'],
 ['clark el ranchito corp', 'el ranchito'],
 ['el milagro'],
 ['los mangos'],
 ['chilango', "lito's empanadas"],
 ['las esperanzas'],
 ['el milagro', 'el milagro taqueria']]

### Getting Regions for Matches

In [1311]:
# using the DBA Name as the key
restaurant_matches_chi = {key[0]: None for key in unique_rest_matches_chi}

In [1312]:
for key, value in restaurant_matches_chi.items():
    split_item = key.split()
    for word in split_item:
        for key1, value1 in demonym_dictionary.items():
            for city in value1:
                if word == city:
                    value_info = [key1, city]
                    restaurant_matches_chi[key] = value_info

In [1313]:
len(restaurant_matches_chi)

41

In [1314]:
for key, value in restaurant_matches_chi.items():
    if value == None:
        for x in unique_rest_matches_chi:
            if x[0] == key:
                if len(x) > 1:
                    split_item = [x.split() for x in x[1:]]
                    for word in split_item:
                        for key1, value1 in demonym_dictionary.items():
                            for city in value1:
                                if word == city:
                                    value_info = [key1, city]
                                    nyc_mex_rest_w_region[key] = value_info

In [1315]:
# not sure why some still equal None....
none_counter = 0
for key, value in restaurant_matches_chi.items():
    if value == None:
        none_counter += 1
none_counter

20

In [1316]:
# only 26 are genuine matches
len(restaurant_matches_chi) - none_counter

21

In [1317]:
# matching words
matching_words_list = []
for key, value in restaurant_matches_chi.items():
    if value != None:
        matching_words_list.append(value[1])

In [1318]:
# matching_words_list

#### Removing values with a value of None

In [1319]:
bad_keys_chi = []
for key, value in restaurant_matches_chi.items():
    if value == None:
        bad_keys_chi.append(key)

#### Removing values that passed the filter incorrectly

In [1320]:
# bad_values_chi = ['mexican','mexico','mexicanos','azteca','esperanza','estrada','esperanzas','salero','pinos',
#                   'maria', 'bravo', 'nuevo', 'progreso', 'delicias', 'comales', 'palmas', 'palenque', 'concordia']
for key, value in restaurant_matches_chi.items():
    if value != None:
        if value[1] in bad_matches:
            bad_keys_chi.append(key)

In [1321]:
# removing bad keys
for x in bad_keys_chi:
    if restaurant_matches_chi.get(x) != None:
        del restaurant_matches_chi[x]
    elif restaurant_matches_chi.get(x) == None:
        del restaurant_matches_chi[x]

In [1322]:
len(restaurant_matches_chi.keys())

6

In [1323]:
restaurant_matches_chi

{'chilango': ['Ciudad de Mexico', 'chilango'],
 'los alamos': ['Tabasco', 'alamos'],
 'tecalitlan': ['Baja California', 'tecalitlan'],
 'ocotlan': ['Puebla', 'ocotlan'],
 'uruapan': ['Guerrero', 'uruapan'],
 'teloloapan': ['Guerrero', 'teloloapan']}

#### Correcting some errors in regions

In [1324]:
restaurant_matches_chi['ocotlan'][0] = 'Jalisco'

In [1325]:
restaurant_matches_chi['tecalitlan'][0] = "Jalisco"

In [1326]:
restaurant_matches_chi['uruapan'][0] = "Michoacan"

In [1327]:
restaurant_matches_chi

{'chilango': ['Ciudad de Mexico', 'chilango'],
 'los alamos': ['Tabasco', 'alamos'],
 'tecalitlan': ['Jalisco', 'tecalitlan'],
 'ocotlan': ['Jalisco', 'ocotlan'],
 'uruapan': ['Michoacan', 'uruapan'],
 'teloloapan': ['Guerrero', 'teloloapan']}

### Getting regions count

In [1183]:
regions_represented_chi = []
for key, value in restaurant_matches_chi.items():
    if value[0] not in regions_represented_chi:
        regions_represented_chi.append(value[0])

In [1184]:
regions_represented_chi[0:3]

['Ciudad de Mexico', 'Tabasco', 'Jalisco']

In [1185]:
# getting the count of values
region_dict_chi = {key: 0 for key in regions_represented_chi}

In [1186]:
for key, value in restaurant_matches_chi.items():
    if value[0] in region_dict_chi.keys():
        region_dict_chi[value[0]] += 1

In [1187]:
region_dict_chi

{'Ciudad de Mexico': 1,
 'Tabasco': 1,
 'Jalisco': 2,
 'Michoacan': 1,
 'Guerrero': 1}

In [1188]:
chicago_regions_represented = []
for key, value in region_dict_chi.items():
    item = [key, round(value/7, 2)]
    chicago_regions_represented.append(item)

In [1189]:
chicago_regions_represented

[['Ciudad de Mexico', 0.14],
 ['Tabasco', 0.14],
 ['Jalisco', 0.29],
 ['Michoacan', 0.14],
 ['Guerrero', 0.14]]

#### Making pct as a dict

In [1190]:
chicago_regions_pct = {value[0]:value[1] for value in chicago_regions_represented}

In [1191]:
chicago_regions_pct

{'Ciudad de Mexico': 0.14,
 'Tabasco': 0.14,
 'Jalisco': 0.29,
 'Michoacan': 0.14,
 'Guerrero': 0.14}

### Getting Values that matches

In [1192]:
chi_values = {value[1]:0 for key, value in restaurant_matches_chi.items()}

In [1193]:
for key, value in restaurant_matches_chi.items():
    if value[1] in chi_values.keys():
        chi_values[value[1]] += 1

In [1194]:
# https://careerkarma.com/blog/python-sort-a-dictionary-by-value/
sort_values_chi = sorted(chi_values.items(), key=lambda x: x[1], reverse=True)
for i in sort_values_chi[0:5]:
    print(i[0], i[1])

chilango 1
alamos 1
tecalitlan 1
ocotlan 1
uruapan 1


## Dallas

In [1195]:
dallas_data = pd.read_csv("./food_inspections_data/Dallas_inspections_October_2016_to_Present_.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [1196]:
dallas_data.shape

(45844, 114)

In [1197]:
dallas_data.columns

Index(['Restaurant Name', 'Inspection Type', 'Inspection Date',
       'Inspection Score', 'Street Number', 'Street Name', 'Street Direction',
       'Street Type', 'Street Unit', 'Street Address',
       ...
       'Violation Points - 24', 'Violation Detail - 24', 'Violation Memo - 24',
       'Violation Description - 25', 'Violation Points - 25',
       'Violation Detail - 25', 'Violation Memo - 25', 'Inspection Month',
       'Inspection Year', 'Lat Long Location'],
      dtype='object', length=114)

### Only unique dallas restaurants

In [1198]:
unique_dallas_data = dallas_data.drop_duplicates(subset=['Restaurant Name'], keep='first')

In [1199]:
unique_dallas_data.shape

(7474, 114)

In [1200]:
# about an 84% reduction in size of df
unique_dallas_data.shape[0] / dallas_data.shape[0]

0.16303114911438793

### Removing columns from data

In [1201]:
# most of the columns relate to inspections
col_names = list(unique_dallas_data.columns)

In [1202]:
col_names[0:11]

['Restaurant Name',
 'Inspection Type',
 'Inspection Date',
 'Inspection Score',
 'Street Number',
 'Street Name',
 'Street Direction',
 'Street Type',
 'Street Unit',
 'Street Address',
 'Zip Code']

In [1203]:
col_names[-3:]

['Inspection Month', 'Inspection Year', 'Lat Long Location']

In [1204]:
unique_dallas_data.drop(columns = [x for x in col_names[11:-4]], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [1205]:
unique_dallas_data.head(2)

Unnamed: 0,Restaurant Name,Inspection Type,Inspection Date,Inspection Score,Street Number,Street Name,Street Direction,Street Type,Street Unit,Street Address,Zip Code,Violation Memo - 25,Inspection Month,Inspection Year,Lat Long Location
0,FRESHII,Routine,10/31/2018,96,2414,VICTORY PARK,,LN,,2414 VICTORY PARK LN,75219,,Oct 2018,FY2019,"2414 VICTORY PARK LN\n(32.787625, -96.809294)"
1,MICKLE CHICKEN,Routine,10/30/2019,100,3203,CAMP WISDOM,W,RD,,3203 W CAMP WISDOM RD,75237,,Oct 2019,FY2020,"3203 W CAMP WISDOM RD\n(32.662584, -96.873446)"


### Getting matches to dem dict

In [1206]:
# using the DBA Name as the key
restaurant_matches_dal = {key.lower(): None for key in unique_dallas_data['Restaurant Name'] if type(key) == str}

In [1207]:
for key, value in restaurant_matches_dal.items():
    if type(key) == str:
        split_item = key.split()
        for word in split_item:
            for key1, value1 in demonym_dictionary.items():
                for city in value1:
                    if word.lower() == city:
                        value_info = [key1, city]
                        restaurant_matches_dal[key] = value_info

In [1208]:
bad_keys_dal = []
for key, value in restaurant_matches_dal.items():
    if value == None:
        bad_keys_dal.append(key)

In [1209]:
len(bad_keys_dal)

7084

In [1210]:
len(restaurant_matches_dal)

7472

In [1211]:
list(restaurant_matches_dal.items())[0:3]

[('freshii', None),
 ('mickle chicken', None),
 ('world trade center market', None)]

### Removing restaurants with no matches

In [1212]:
for x in bad_keys_dal:
    if restaurant_matches_dal.get(x) != None:
        del restaurant_matches_dal[x]
    elif restaurant_matches_dal.get(x) == None:
        del restaurant_matches_dal[x]

In [1213]:
len(list(restaurant_matches_dal.items()))

388

### Removing Bad Matches

In [1214]:
bad_keys_dal2 = []
for key, value in restaurant_matches_dal.items():
    if value != None:
        if value[1] in bad_matches:
            bad_keys_dal2.append(key)

In [1215]:
# removing bad keys
for x in bad_keys_dal2:
    if restaurant_matches_dal.get(x) != None:
        del restaurant_matches_dal[x]
    elif restaurant_matches_dal.get(x) == None:
        del restaurant_matches_dal[x]

In [1216]:
len(restaurant_matches_dal)

140

In [1217]:
restaurant_matches_dal

{'la michoacana grocery/produce': ['Michoacan de Ocampo', 'michoacana'],
 'acapulco': ['Tamaulipas', 'acapulco'],
 'cathedral santuario de guadalupe': ['Chihuahua', 'guadalupe'],
 'la michoacana meat mkt': ['Michoacan de Ocampo', 'michoacana'],
 'supermercado teloloapan #17, inc (grocery)': ['Guerrero', 'teloloapan'],
 'la michoacana meat mkt/carniceria': ['Michoacan de Ocampo', 'michoacana'],
 'mariscos la riviera nayarit': ['Nayarit', 'nayarit'],
 'la michoacana meat market- bakery': ['Michoacan de Ocampo', 'michoacana'],
 'martinez elem school': ['Durango', 'martinez'],
 'la michoacana meat market - taqueria': ['Michoacan de Ocampo', 'michoacana'],
 'mi lindo oaxaca (fl 1)': ['Oaxaca', 'oaxaca'],
 'taqueria fondita celaya': ['Veracruz', 'celaya'],
 'el pollo regio': ['Nuevo Leon', 'regio'],
 'paleteria neveria la china poblana': ['Puebla', 'poblana'],
 'el molino tortilleria': ['Yucatan', 'molino'],
 'monterrey restaurant & bbq': ['Campeche', 'monterrey'],
 'charco broiler steak #2'

### Getting the number of restaurants for each region

In [1218]:
regions_represented_dal = []
for key, value in restaurant_matches_dal.items():
    if value[0] not in regions_represented_dal:
        regions_represented_dal.append(value[0])

In [1219]:
# getting the count of values
region_dict_dal = {key: 0 for key in regions_represented_dal}

In [1220]:
regions_represented_dal[0:10]

['Michoacan de Ocampo',
 'Tamaulipas',
 'Chihuahua',
 'Guerrero',
 'Nayarit',
 'Durango',
 'Oaxaca',
 'Veracruz',
 'Nuevo Leon',
 'Puebla']

In [1221]:
for key, value in restaurant_matches_dal.items():
    if value[0] in region_dict_dal.keys():
        region_dict_dal[value[0]] += 1

In [1222]:
region_dict_dal

{'Michoacan de Ocampo': 30,
 'Tamaulipas': 2,
 'Chihuahua': 2,
 'Guerrero': 10,
 'Nayarit': 1,
 'Durango': 2,
 'Oaxaca': 2,
 'Veracruz': 12,
 'Nuevo Leon': 5,
 'Puebla': 1,
 'Yucatan': 6,
 'Campeche': 14,
 'Jalisco': 6,
 'Guanajuato': 2,
 'Morelos': 4,
 'Baja California Sur': 3,
 'Mexico': 7,
 'Quintana Roo': 1,
 'Hidalgo': 8,
 'Baja California': 3,
 'San Luis Potosi': 3,
 'Tabasco': 5,
 'Zacatecas': 1,
 'Tlaxcala': 1,
 'Sinaloa': 3,
 'Ciudad de Mexico': 2,
 'Chiapas': 3,
 'Sonora': 1}

### Getting Counts as percentages

In [1223]:
region_dict_dal_pct = {key:round(value/len(restaurant_matches_dal), 2) 
                       for key, value in region_dict_dal.items()}

In [1224]:
region_dict_dal_pct

{'Michoacan de Ocampo': 0.21,
 'Tamaulipas': 0.01,
 'Chihuahua': 0.01,
 'Guerrero': 0.07,
 'Nayarit': 0.01,
 'Durango': 0.01,
 'Oaxaca': 0.01,
 'Veracruz': 0.09,
 'Nuevo Leon': 0.04,
 'Puebla': 0.01,
 'Yucatan': 0.04,
 'Campeche': 0.1,
 'Jalisco': 0.04,
 'Guanajuato': 0.01,
 'Morelos': 0.03,
 'Baja California Sur': 0.02,
 'Mexico': 0.05,
 'Quintana Roo': 0.01,
 'Hidalgo': 0.06,
 'Baja California': 0.02,
 'San Luis Potosi': 0.02,
 'Tabasco': 0.04,
 'Zacatecas': 0.01,
 'Tlaxcala': 0.01,
 'Sinaloa': 0.02,
 'Ciudad de Mexico': 0.01,
 'Chiapas': 0.02,
 'Sonora': 0.01}

### Getting Values that matches

In [1225]:
dal_values = {value[1]:0 for key, value in restaurant_matches_dal.items()}

In [1226]:
for key, value in restaurant_matches_dal.items():
    if value[1] in dal_values.keys():
        dal_values[value[1]] += 1

In [1227]:
# https://careerkarma.com/blog/python-sort-a-dictionary-by-value/
sort_values_dal = sorted(dal_values.items(), key=lambda x: x[1], reverse=True)
for i in sort_values_dal[0:5]:
    print(i[0], i[1])

michoacana 28
monterrey 11
regio 5
jalisco 5
rodeo 4


## San Francisco 

In [1228]:
sf_data = pd.read_csv("./food_inspections_data/san_fran_Restaurant_Scores_-_LIVES_Standard.csv")

In [1229]:
sf_data.head(2)

Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,business_phone_number,...,inspection_type,violation_id,violation_description,risk_category,Neighborhoods (old),Police Districts,Supervisor Districts,Fire Prevention Districts,Zip Codes,Analysis Neighborhoods
0,69618,Fancy Wheatfield Bakery,1362 Stockton St,San Francisco,CA,94133,,,,,...,Complaint,69618_20190304_103130,Inadequate sewage or wastewater disposal,Moderate Risk,,,,,,
1,97975,BREADBELLY,1408 Clement St,San Francisco,CA,94118,,,,14157240000.0,...,Routine - Unscheduled,97975_20190725_103124,Inadequately cleaned or sanitized food contact...,Moderate Risk,,,,,,


In [1230]:
sf_data.shape

(53973, 23)

In [1231]:
sf_data_unique = sf_data.drop_duplicates(subset=['business_name'], keep='first')

In [1232]:
sf_data_unique.shape

(5775, 23)

In [1233]:
lower_biz_name = [x.lower() for x in sf_data['business_name']]

In [1234]:
# this is slightly smaller than the set of unique business names without lowering all of the characters (by 1000)
len(set(lower_biz_name)) 

5672

### Getting Names of Restaurants

In [1235]:
rest_matches_sf = []
for item in set(lower_biz_name):
    split_item = item.split()
    for word in split_item:
        for key, value in demonym_dictionary.items():
            for city in value:
                if word == city:
                    if word not in bad_matches: # if not, the number of results is tripled
                        rest_matches_sf.append(item)

In [1236]:
unique_sf_matches = list(set(rest_matches_sf))

In [1237]:
len(unique_sf_matches)

46

In [1238]:
unique_sf_matches[0:10]

['lucky california #755',
 'la taqueria guadalajara',
 'california grocery',
 'california pizza kitchen, inc.',
 'fei tian academy of the arts california',
 'peralta',
 'bi-rite divisadero',
 'california garlic noodles',
 'el rincon yucateco',
 '23rd & guerrero liquor store']

### Getting Regions for Matches

In [1239]:
restaurant_matches_sf = {key: None for key in set(unique_sf_matches)}

In [1240]:
for key, value in restaurant_matches_sf.items():
    split_item = key.split()
    for word in split_item:
        for key1, value1 in demonym_dictionary.items():
            for city in value1:
                if word == city:
                    value_info = [key1, city]
                    restaurant_matches_sf[key] = value_info

In [1241]:
len(restaurant_matches_sf)

46

#### Removing all entries that passed filter incorrectly

In [1243]:
bad_keys = []
for key, value in restaurant_matches_nyc.items():
    if value[1] in bad_matches:
        bad_keys.append(key)

In [1244]:
len(bad_keys)

0

In [1245]:
for x in bad_keys:
    if restaurant_matches_sf.get(x) != None:
        del restaurant_matches_sf[x]

In [1246]:
if restaurant_matches_sf.get('guadalupe elementary school'):
    del restaurant_matches_sf['guadalupe elementary school'] # probably refers to a neighborhood, not something in mex.

In [1247]:
len(restaurant_matches_sf)

45

In [1248]:
restaurant_matches_sf

{'lucky california #755': ['Guanajuato', 'california'],
 'la taqueria guadalajara': ['Baja California', 'guadalajara'],
 'california grocery': ['Guanajuato', 'california'],
 'california pizza kitchen, inc.': ['Guanajuato', 'california'],
 'fei tian academy of the arts california': ['Guanajuato', 'california'],
 'peralta': ['Veracruz', 'peralta'],
 'bi-rite divisadero': ['Tabasco', 'divisadero'],
 'california garlic noodles': ['Guanajuato', 'california'],
 'el rincon yucateco': ['Yucatan', 'yucateco'],
 'california shell': ['Guanajuato', 'california'],
 '23rd & guerrero liquor store': ['Guerrero', 'guerrero'],
 'la loma produce nos. 3 & 9 inc': ['Yucatan', 'loma'],
 'taqueria gonzalez': ['Mexico', 'gonzalez'],
 'taqueria cazadores': ['Chihuahua', 'cazadores'],
 'joe & the juice 50 california': ['Guanajuato', 'california'],
 'the little chihuahua mexican': ['Mexico', 'mexican'],
 'wines of california wine bar': ['Guanajuato', 'california'],
 'taqueria cancun': ['Oaxaca', 'cancun'],
 'gra

### Getting Region Counts

In [1249]:
region_dict_sf = {value[0]: 0 for key, value in restaurant_matches_sf.items()}

In [1250]:
for key, value in restaurant_matches_sf.items():
    if value[0] in region_dict_sf.keys():
        region_dict_sf[value[0]] += 1

In [1251]:
region_dict_sf

{'Guanajuato': 18,
 'Baja California': 2,
 'Veracruz': 2,
 'Tabasco': 2,
 'Yucatan': 5,
 'Guerrero': 2,
 'Mexico': 3,
 'Chihuahua': 3,
 'Oaxaca': 1,
 'Tlaxcala': 1,
 'Durango': 2,
 'Campeche': 2,
 'Jalisco': 1,
 'Chiapas': 1}

### Getting Region Counts as Percentages

In [1252]:
region_dict_sf_pct = {value[0]: 0 for key, value in restaurant_matches_sf.items()}

In [1253]:
for key, value in region_dict_sf.items():
    region_dict_sf_pct[key] = round(value/len(restaurant_matches_sf), 2)

In [1254]:
region_dict_sf_pct

{'Guanajuato': 0.4,
 'Baja California': 0.04,
 'Veracruz': 0.04,
 'Tabasco': 0.04,
 'Yucatan': 0.11,
 'Guerrero': 0.04,
 'Mexico': 0.07,
 'Chihuahua': 0.07,
 'Oaxaca': 0.02,
 'Tlaxcala': 0.02,
 'Durango': 0.04,
 'Campeche': 0.04,
 'Jalisco': 0.02,
 'Chiapas': 0.02}

### Getting Values that matches

In [1255]:
sf_values = {value[1]:0 for key, value in restaurant_matches_sf.items()}

In [1256]:
for key, value in restaurant_matches_sf.items():
    if value[1] in sf_values.keys():
        sf_values[value[1]] += 1

In [1257]:
# https://careerkarma.com/blog/python-sort-a-dictionary-by-value/
sort_values_sf = sorted(sf_values.items(), key=lambda x: x[1], reverse=True)
for i in sort_values_sf[0:5]:
    print(i[0], i[1])

california 18
loma 3
guadalajara 2
divisadero 2
guerrero 2


# Combining All Restaurant Matches into One df

In [None]:
# rest_df has all of the NYC results

## dict data

In [1350]:
chi_reg_w_pct = pd.DataFrame.from_dict(restaurant_matches_chi, orient='index', columns = ['Region', 'Match'])
chi_reg_w_pct.sort_index(axis=0, inplace=True)
chi_reg_w_pct['City'] = 'Chicago'

dal_reg_w_pct= pd.DataFrame.from_dict(restaurant_matches_dal, orient='index', columns = ['Region', 'Match'])
dal_reg_w_pct.sort_index(axis=0, inplace=True)
dal_reg_w_pct['City'] = 'Dallas'

sf_reg_w_pct= pd.DataFrame.from_dict(restaurant_matches_sf, orient='index', columns = ['Region', 'Match'])
sf_reg_w_pct.sort_index(axis=0, inplace=True)
sf_reg_w_pct['City'] = 'SF'

all_rest_df = pd.concat([chi_reg_w_pct, dal_reg_w_pct, sf_reg_w_pct], sort=True)

In [1362]:
all_rest_df.head(2)

Unnamed: 0,City,Match,Region
chilango,Chicago,chilango,Ciudad de Mexico
los alamos,Chicago,alamos,Tabasco


## NYC data

In [1354]:
rest_df['City'] = 'NYC'

In [1359]:
rest_df.head()

Unnamed: 0_level_0,Region,Matching_word_w_region,City
Restaurant_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
estrellita poblana,Puebla,poblana,NYC
estrellita poblana taqueria express,Puebla,poblana,NYC
coszcal de allende restaurant,Yucatan,allende,NYC
chilpancingo restaurant,Guerrero,chilpancingo,NYC
mama puebla,Puebla,puebla,NYC


In [1358]:
rest_df.set_index(keys=['Restaurant_name'], inplace=True)

In [1360]:
rest_df.columns = ['Region', 'Match', 'City']

In [1363]:
rest_df.columns

Index(['Region', 'Match', 'City'], dtype='object')

## Merging the two dfs

In [1373]:
all_restaurants = pd.concat([all_rest_df,rest_df])

In [1374]:
all_restaurants

Unnamed: 0,City,Match,Region
chilango,Chicago,chilango,Ciudad de Mexico
los alamos,Chicago,alamos,Tabasco
ocotlan,Chicago,ocotlan,Jalisco
tecalitlan,Chicago,tecalitlan,Jalisco
teloloapan,Chicago,teloloapan,Guerrero
...,...,...,...
tacos cuautla morales,NYC,morales,Veracruz
los hermanos salazar restaurant,NYC,salazar,Quintana Roo
acapulco deli & restaurant,NYC,acapulco,Tamaulipas
regalo de juquila -2,NYC,juquila,Baja California Sur


## Saving df

In [1376]:
# all_restaurants.to_csv("./my_saved_data/Final_restaurant_list.csv")

# combining all pct results to a df 

- nyc_region_counts_dict_pct
- chicago_regions_pct
- region_dict_dal_pct
- region_dict_sf_pct

### Making them all have the same length

In [1258]:
regions_used = []

In [1259]:
def track_regions_found(dictionary, regions_used):
    for key, value in dictionary.items():
        if key not in regions_used:
            regions_used.append(key)
    return regions_used

In [1260]:
regions_used1 = track_regions_found(nyc_region_counts_dict_pct, regions_used)

In [1261]:
regions_used2 = track_regions_found(chicago_regions_pct, regions_used1)

In [1262]:
regions_used3 = track_regions_found(region_dict_dal_pct, regions_used2)

In [1263]:
regions_used4 = track_regions_found(region_dict_sf_pct, regions_used3)

In [1264]:
# all but one region was found in these datasets (well 2 since michoacan was counted twice)
len(regions_used4)

31

In [1265]:
def add_keys_to_dict(dict_w_pct, regions_used = regions_used4):
    for region in regions_used:
        if region not in dict_w_pct.keys():
            dict_w_pct[region] = 0
    return dict_w_pct

In [1266]:
nyc_reg_w_pct = add_keys_to_dict(nyc_region_counts_dict_pct, regions_used4)

In [1267]:
chi_reg_w_pct = add_keys_to_dict(chicago_regions_pct, regions_used4)

In [1268]:
dal_reg_w_pct = add_keys_to_dict(region_dict_dal_pct)

In [1269]:
sf_reg_w_pct = add_keys_to_dict(region_dict_sf_pct)

### Making into df

In [1270]:
nyc_reg_w_pct_df = pd.DataFrame.from_dict(nyc_reg_w_pct, orient='index', columns = ['NYC'])
nyc_reg_w_pct_df.sort_index(axis=0, inplace=True)

In [1271]:
chi_reg_w_pct_df = pd.DataFrame.from_dict(chi_reg_w_pct, orient='index', columns = ['Chi'])
chi_reg_w_pct_df.sort_index(axis=0, inplace=True)

In [1272]:
dal_reg_w_pct_df = pd.DataFrame.from_dict(dal_reg_w_pct, orient='index', columns = ['Dal'])
dal_reg_w_pct_df.sort_index(axis=0, inplace=True)

In [1273]:
sf_reg_w_pct_df = pd.DataFrame.from_dict(sf_reg_w_pct, orient='index', columns = ['SF'])
sf_reg_w_pct_df.sort_index(axis=0, inplace=True)

In [1274]:
reg_rep_df = pd.concat([nyc_reg_w_pct_df, chi_reg_w_pct_df, dal_reg_w_pct_df, sf_reg_w_pct_df], sort=True, axis=1)

#### Fixing 2 Misnamed columns

In [1275]:
reg_rep_df.reset_index(inplace=True)

In [1276]:
reg_rep_df['Chi'][15] = reg_rep_df['Chi'][14]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [1277]:
reg_rep_df.drop(index=14, inplace=True)

In [1278]:
reg_rep_df['Chi'][20] = reg_rep_df['Chi'][21]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [1279]:
reg_rep_df.drop(index=21, inplace=True)

### Getting the count of the number of restaurants for each column

In [1339]:
rest_count_dict = {}
for col in reg_rep_df.columns[1:]:
    counter = 0
    for item in reg_rep_df[col]:
        if item > 0:
            counter += 1
    rest_count_dict[col] = counter   

In [1340]:
# despite all cities having more than 1.3 million people, 
# there is a vastly different number of mex. restaurants in each that have names referring to a city or region in mex.
rest_count_dict

{'NYC': 20, 'Chi': 5, 'Dal': 27, 'SF': 14}

#### Saving restaurant count dict

In [1342]:
# with open ('restaurant_counts.pickle', 'wb+') as f:
#     pickle.dump(rest_count_dict, f)

# Conclusions

All 4 cities have large Mexican immigrant populations (250k and above according to: https://247wallst.com/economy/2017/01/27/us-cities-with-the-most-mexican-immigrants/). Despite this surface similarity, it turns out that these immigrants do not all come from similar regions in Mexico. No region was in the top 5 most popular region for more than 2 of these cities. This suggests that immigration from Mexico was not dominated by one region but came from all regions of Mexico. Similarly, we see that there are no single region that makes up the majority of any US city's Mexican population: in all cases, no region accounted for more than 36% of the restaurants with regions associated with their names. This did have some variation, however: in San Francisco Guanajuato accounts for 40% of the restaurants while in Dallas the most dominant group accounts for 21%.

Some shortcomings of this project are the quality of this data: both for the demonym dictionary and the inspection data. The demonym dictionary had many entries, but there were some clear errors, e.g.: the restaurant 'jalisco' in Dallas was associated with "Ciudad de Mexico" instead of the region Jalisco. Additionally, it is not clear if **all** restaurants in each city are included in this data, or only restaurants that failed the inspection. In a similar vein, some cities had data that had too low of a quality to be useful (LA). Another problem with this project is deciding when a "match" is legitimate. For example, "Pueblito" ("little town") appeared often and is a town in Queretáro, but do these restaurants refer to the former or the latter? This was decided on an ad hoc basis and was founded solely on my (non-native) discretion. Lastly, the obvious issue of immediate representation being a marker of the true number of people from a certain region is certainly flawed. Just because there are say, a lot of Italian restaurants in a certain area, does not necessarily mean that there are many italian (recent) immigrants to that area. The popularity of food from certain regions may skew the data. 

Anecdotally, it appears that chain migration happens at a micro level. "Tulcingo" appears 5 times in the NYC data despite the fact that the town that it refers to in Mexico, Tulcingo del Valle, has only 9,000 people living in it (source: https://es.wikipedia.org/wiki/Municipio_de_Tulcingo)! 