import numpy as np
# ^^^ pyforest auto-imports - don't write above this line


Goal: **Can we determine the regions from which Mexican immigrants came to NYC through the traces left in restaurant/tacqueria/deli names?**

In this notebook, I'll build the database that will connect words to regions and be used to determine the region that a restaurant is connected to. 

# Imports 

In [1]:
import unidecode
import warnings
warnings.filterwarnings('ignore')
from functions import *

# Importing Data

## Simple Maps DataBase

In [2]:
# https://simplemaps.com/data/mx-cities
simple_maps = pd.read_csv("../demonym_city_data/mx_simple_maps.csv")

simple_maps_clean_df = simple_maps[['city', 'admin']]
simple_maps_clean_df2 = change_df_names(simple_maps_clean_df, 'admin')

## Geonames Data 
https://public.opendatasoft.com/explore/dataset/geonames-all-cities-with-a-population-1000/export/?disjunctive.country&refine.timezone=America%2FMazatlan

In [3]:
all_geonames = pd.read_csv("../demonym_city_data/geonames_data/all_geonames_cities.csv", sep=';')

mex_geonames = all_geonames[all_geonames['Country'] == 'Mexico']

### We need to convert the number for Admin1_Code to a region

In [4]:
# using simple maps will give us the key
regions = list(set(simple_maps.admin.sort_values()))

mex_geonames['Admin1 Code'] = mex_geonames['Admin1 Code'].astype(int)
mex_geonames['region'] = [regions[x-1] for x in mex_geonames['Admin1 Code']]

# removing name since ascii name covers it without accents
mex_geonames_clean_df = mex_geonames[['ASCII Name', 'Alternate Names','region']].reset_index(drop=True)

### Combining values into a list

In [5]:
mex_geonames_clean_df['Alt_names_split'] = [x.split(",") if type(x) == str else [] 
                                            for x in mex_geonames_clean_df['Alternate Names']]

mex_geonames_clean_df['ascii_names_list'] = [x.split(",") 
                                            for x in mex_geonames_clean_df['ASCII Name']]

mex_geonames_clean_df['all_names_split'] = mex_geonames_clean_df['ascii_names_list'] + mex_geonames_clean_df['Alt_names_split']

mex_geonames_clean_df['all_names_split_final'] = [list(set(x)) for x in mex_geonames_clean_df['all_names_split']]
mex_geonames_clean_df['all_names_split_unique'] = [list(set(x)) for x in mex_geonames_clean_df['all_names_split_final']]

### Final mex geonames df

In [6]:
# mex_geonames_clean_df = 
mex_geonames_clean_df.drop(columns = ['ASCII Name', 'Alternate Names', 'Alt_names_split',
                                     'ascii_names_list', 'all_names_split', 'all_names_split_final'], inplace=True)

In [7]:
# mex_geonames_clean_df['region'] = [unidecode.unidecode(x) for x in mex_geonames_clean_df['region']]
mex_geonames_clean_df2 = change_df_names(mex_geonames_clean_df, 'region')

## Mexican Cities (world_cities)

In [8]:
cities = pd.read_csv("../demonym_city_data/world-cities-master/data/world-cities.csv")

mexican_cities = cities[cities['country'] == 'Mexico']
mexican_cities_clean_df = mexican_cities[['name', 'subcountry']]
mexican_cities_clean_df2 = change_df_names(mexican_cities_clean_df, 'subcountry')

## Mexican Demonyms

This was found using the table found in this wikipedia page: https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_of_place_names#States_of_Mexico. Then I entered that link into https://wikitable2csv.ggor.de and downloaded the table as a csv

In [9]:
mex_demonyms = pd.read_csv("../demonym_city_data/mexican_demonyms.csv", skip_blank_lines=True, skiprows = [1])
mex_demonyms.reset_index(inplace=True)
mex_demonyms.drop(index=32, inplace=True)
mex_demonyms.columns = ['region', "region_demonym", "adjective", "demonym", 'Demonym.1']

### Getting all the names into a single list

#### Fixing items in "region_in_spanish"

In [10]:
mex_demonyms['region'][14] = "Ciudad de México"

#### Making list that will host all of the demonyms

In [11]:
# once combined with the previous dataframe, we will have a dictionary with each region having a list of names that could refer to it
demonyms = [[x] for x in mex_demonyms['region_demonym'] ]

#### Making each cell only have one term

In [12]:
demonyms[0].append("aguascalentense")

mex_demonyms['adjective'][0] = "Hidrocálido/-a"

demonyms[12].append('Jalisquillo')

mex_demonyms['demonym'][12] = "Tapatio/ Tapatia"

# all of the regions_in_english are in the correct format
demonyms[20] = ['Pueblan', 'Poblano']

# mex_demonyms['adjective'][0] = "Hidrocálido/-a"
mex_demonyms['demonym'].replace(np.nan, '', regex=True, inplace=True)

<IPython.core.display.Javascript object>

In [13]:
## Making all of the -a names to be a seperate entry
for x in range(0, len(mex_demonyms)):
    multi_adjective = re.match(r"(\w+)[o]\/(?=(\-?|\s|\s\-)a)(.+)", mex_demonyms['adjective'][x])
    if multi_adjective:
        mex_demonyms['adjective'][x] = multi_adjective.group(1) + "o" + '/' \
        + multi_adjective.group(1) + "a"
    multi_dem = re.match(r"\"?(\w+)[o]\/(?=(\-?|\s|\s\-)a)", mex_demonyms['demonym'][x])
    if multi_dem:
        mex_demonyms['demonym'][x] = multi_dem.group(1) + "o" + '/' \
        + multi_dem.group(1) + "a"

In [14]:
# getting all of items to either be a single name or single name/-a
for x in range(0, len(mex_demonyms)):
    split_adj = mex_demonyms['adjective'][x].split("/")
    for y in split_adj:
        demonyms[x].append(y)
    split_dem = mex_demonyms['demonym'][x].split("/")
    for z in split_dem:
        a = z.strip('""')
        demonyms[x].append(a)

In [15]:
# making the demonyms data only have unique entries
unique_demonyms = [list(set(item)) for item in demonyms]

#### Making the demonyms connect to a region

In [16]:
demonyms_w_region = list(zip(mex_demonyms['region'], demonyms))
demonyms_w_regions_df = pd.DataFrame(data = demonyms_w_region, columns=['State', 'Demonyms'])
demonyms_w_regions_df2 = change_df_names(demonyms_w_regions_df, 'State')

# Initializing Mexico_names dictionary 

In [17]:
english_region_names = list(mexican_cities_clean_df.region.unique())
mexico_regional_names_dict = {unidecode.unidecode(x):[] for x in english_region_names} # removing any accents

## Checking that region names match

In [18]:
bad_matches_simple_maps = match_region_names(mexico_regional_names_dict, simple_maps_clean_df2)
bad_matches_simple_maps

['Ciudad de Mexico', 'Coahuila de Zaragoza', 'Michoacan de Ocampo']

In [19]:
bad_matches_mex_geonames_clean_df = match_region_names(mexico_regional_names_dict, 
                                                       mex_geonames_clean_df2)
bad_matches_mex_geonames_clean_df

['Coahuila de Zaragoza', 'Michoacan de Ocampo', 'Ciudad de Mexico']

In [20]:
bad_matches_mexican_cities_clean_df = match_region_names(mexico_regional_names_dict, 
                                                       mexican_cities_clean_df2)
bad_matches_mexican_cities_clean_df

[]

In [21]:
bad_matches_demonyms_w_regions_df = match_region_names(mexico_regional_names_dict, 
                                                       demonyms_w_regions_df2)
bad_matches_demonyms_w_regions_df

['Coahuila de Zaragoza', 'State of Mexico', 'Ciudad de Mexico']

## Replacing incorrect col_values

In [22]:
simple_maps_clean_df3 = replace_col_values_in_df(simple_maps_clean_df2)
mex_geonames_clean_df3 = replace_col_values_in_df(mex_geonames_clean_df2)
mexican_cities_clean_df3 = replace_col_values_in_df(mexican_cities_clean_df2)
demonyms_w_regions_df3 = replace_col_values_in_df(demonyms_w_regions_df2)

## Adding Values to the Larger Dictionary

In [23]:
mexico_regional_names_dict = add_data_to_dictionary(simple_maps_clean_df3 ,mexico_regional_names_dict)
mexico_regional_names_dict = add_data_to_dictionary(mex_geonames_clean_df3 ,mexico_regional_names_dict)
mexico_regional_names_dict = add_data_to_dictionary(mexican_cities_clean_df3 ,mexico_regional_names_dict)
mexico_regional_names_dict = add_data_to_dictionary(demonyms_w_regions_df3 ,mexico_regional_names_dict)

In [24]:
counter = 0
for key, value in mexico_regional_names_dict.items():
    counter += len(value)

In [25]:
counter

22217

# Fixing the Larger Dictionary

In [26]:
mexico_regional_names_dict = clean_dictionary_values(mexico_regional_names_dict)

In [27]:
# getting count of items
counter1 = 0
for key, value in mexico_regional_names_dict.items():
    for thing in value:
        counter1 += 1
counter1

print(f"Cleaning the dictionary removed {counter - counter1} values")

Cleaning the dictionary removed 5739 values


## Removing entries that are ambiguous/mistakenly connected to a region

### Identifying bad matches

In [28]:
bad_matches1 = ['mexican', 'tequila', 'margarita', 'margaritas', 'mexico', 'pedro', 'azteca', 'paraiso'
'mexico','rodeo','rio','maria','mexicanos','coyote','marcos','mama','bravo','viejo','perla','nuevo'
'verde', 'gonzalez', 'corona', 'armadillo', 'arriba', 'palmas', 'delicias', 'blanco', 'crespo','tortuga']

In [29]:
bad_matches2 = ['mexican','mexico','mexicanos','azteca','esperanza','estrada','esperanzas','salero','pinos',
                  'maria', 'bravo', 'nuevo', 'progreso', 'delicias', 'comales', 'palmas', 'palenque', 'concordia',
                 'china', 'gym', 'paloma', 'rio', 'mex', 'tequila', 'colorado', 'ventana', 'lom', 'garcia', 'paz',
                 'chavez', 'paraiso', 'senor', 'oriental', 'fronteras', 'tap', 'aca', 'purisima', 'rodriguez',
                 'hernandez', 'sanchez', 'victoria', 'oasis', 'cash', 'pinas', 'yaa',  'meson','agustin',
                 'agustin', 'limon', 'alamo', 'slp', 'providencia', 'reyes', 'lom', 'verde', 'perla', 'madrid', 
                 'delta', 'mama', 'lopez', 'honey', 'laurel', 'california pizza kitchen', 'sauces', 'laguna', 
              'dolores', 'presidio', 'ver', 'bernal', 'rincon', 'marin', 'palma', 'potrero', 'mid', 'valencia',
              'aura', 'kava', 'pueblito', 'castillo', 'tam', 'marcos', 'montecristo', 'tinajas', 'alvarado',
              'porvenir', 'nieves', 'mina', 'marin', "alamos", 'reforma', 'jal', 'margaritas', 'california']

In [30]:
bad_matches3 = ['mexican', 'tequila', 'margarita', 'margaritas', 'mexico', 'pedro', 'azteca', 'paraiso'
'mexico','rodeo','rio','maria','mexicanos','coyote','marcos','mama','bravo','viejo','perla','nuevo'
'verde', 'gonzalez', 'corona', 'armadillo', 'arriba', 'palmas', 'delicias', 'blanco', 'crespo','tortuga']

In [31]:
bad_matches4 = ['mexican','mexico','mexicanos','azteca','esperanza','estrada','esperanzas','salero','pinos',
                  'maria', 'bravo', 'nuevo', 'progreso', 'delicias', 'comales', 'palmas', 'palenque', 'concordia',
                 'china', 'gym', 'paloma', 'rio', 'mex', 'tequila', 'colorado', 'ventana', 'lom', 'garcia', 'paz',
                 'chavez', 'paraiso', 'senor', 'oriental', 'fronteras', 'tap', 'aca', 'purisima', 'rodriguez',
                 'hernandez', 'sanchez', 'victoria', 'oasis', 'cash', 'pinas', 'yaa', 'meson','agustin',
                 'agustin', 'limon', 'alamo', 'slp', 'providencia', 'reyes', 'lom', 'verde', 'perla', 'madrid', 
                 'delta', 'mama', 'lopez', 'honey', 'laurel', 'california pizza kitchen', 'sauces', 'laguna', 
              'dolores', 'presidio', 'ver', 'bernal', 'rincon', 'marin', 'palma', 'potrero', 'mid', 'valencia',
              'aura', 'kava', 'pueblito', 'castillo', 'tam', 'marcos', 'montecristo', 'tinajas', 'alvarado',
              'porvenir', 'nieves', 'mina', 'marin']

In [32]:
# getting union of these lists
final_bad_matches_list = list(set(bad_matches1) | set(bad_matches2) | set(bad_matches3) | set(bad_matches4)) 

In [33]:
len(final_bad_matches_list)

104

### Actually removing the bad entries


In [34]:
for key, value in mexico_regional_names_dict.items():
    for entry in value:
        if entry in final_bad_matches_list:
            mexico_regional_names_dict[key].remove(entry)

In [35]:
# getting count of items
counter2 = 0
for key, value in mexico_regional_names_dict.items():
    counter2 += len(value)
counter2
print(f"Cleaning the dictionary removed {counter1 - counter2} values")

Cleaning the dictionary removed 159 values


## Making sure no entry is used twice

In [36]:
all_values_separate = list(mexico_regional_names_dict.values())

In [37]:
duplicates = []
for idx, sublist in enumerate(all_values_separate):
    big_list = []
    for x in range(32):
        if x != idx:
            big_list.extend(all_values_separate[x])
    sublist_duplicates = list(set(sublist) & set(big_list))
    duplicates.append(sublist_duplicates)

In [38]:
len(duplicates[25])

217

In [39]:
len(all_values_separate[25])

821

In [40]:
# only the first entry has no repeats
for idx, item in enumerate([duplicates, all_values_separate]):
    if len(item[0][idx]) == len(item[1][idx]):
        print(idx)

In [41]:
# now I'm confused
for idx, item in enumerate([duplicates, all_values_separate]):
    if len(item[0][idx]) != len(item[1][idx]):
        print(len(item[1][idx]) - len(item[0][idx]))

-16
4


## Saving Dictionary

In [46]:
with open ('../my_saved_data/demonym_dictionary.pickle', 'wb+') as f:
    pickle.dump(mexico_regional_names_dict, f)