import re
import pandas as pd
import pickle
# ^^^ pyforest auto-imports - don't write above this line


**Can we determine the regions from which Mexican immigrants came to NYC through the traces left in restaurant/tacqueria/deli names?**

# Imports 

In [443]:
# not sure if I need this since I have pyforest installed
import time
import requests
import folium
import unidecode

# Making Dictionary to Compare Restaurant Names to

## Simple Maps DataBase

In [444]:
simple_maps = pd.read_csv("./demonym_city_data/mx_simple_maps.csv")

<IPython.core.display.Javascript object>

In [445]:
simple_maps.shape

(1189, 9)

In [446]:
simple_maps.head(2)

Unnamed: 0,city,lat,lng,country,iso2,admin,capital,population,population_proper
0,Mexico City,19.434167,-99.138611,Mexico,MX,Ciudad de México,primary,19028000.0,10811002.0
1,Guadalajara,20.666823,-103.391824,Mexico,MX,Jalisco,admin,4198000.0,1640589.0


In [447]:
regions = list(set(simple_maps.admin.sort_values()))

## Initializing Mexico_names dictionary 
https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string

In [448]:
regions[0:2]

['Michoacán de Ocampo', 'Querétaro']

In [449]:
mexico_regional_names_dict = {unidecode.unidecode(x):[] for x in regions} # removing any accents

### Adding simple maps data to dict

In [450]:
for x in range(simple_maps.shape[0]):
    region = unidecode.unidecode(simple_maps['admin'][x])
    if simple_maps['city'][x] not in mexico_regional_names_dict[region]:
        mexico_regional_names_dict[region].append(simple_maps['city'][x])

## Geonames Data 
https://public.opendatasoft.com/explore/dataset/geonames-all-cities-with-a-population-1000/export/?disjunctive.country&refine.timezone=America%2FMazatlan

In [451]:
all_geonames = pd.read_csv("./demonym_city_data/geonames_data/all_geonames_cities.csv", sep=';')

<IPython.core.display.Javascript object>

In [452]:
all_geonames.shape

(136849, 21)

In [453]:
mex_geonames = all_geonames[all_geonames['Country'] == 'Mexico']

In [454]:
mex_geonames.reset_index(inplace=True, drop=True)

In [455]:
mex_geonames.shape

(8984, 21)

In [456]:
mex_geonames['Admin1 Code'].nunique()

32

In [457]:
mex_geonames['Admin1 Code'].unique()

array(['21', '04', '12', '30', '15', '17', '09', '32', '25', '07', '02',
       '16', '08', '18', '14', '26', '24', '01', '19', '11', '06', '10',
       '22', '03', '13', '29', '27', '05', '28', '31', '20', '23'],
      dtype=object)

### Making region names strings
https://mainfacts.com/world-countries-capitals-cities-codes/MX-MEX-Mexico

In [458]:
mex_geonames['Admin1 Code'].dtypes

dtype('O')

In [459]:
mex_geonames['Admin1 Code'] = mex_geonames['Admin1 Code'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [460]:
# mex_geonames.dtypes

In [461]:
mex_geonames['region'] = [regions[x-1] for x in mex_geonames['Admin1 Code']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [462]:
mex_geonames.head(2)

Unnamed: 0,Geoname ID,Name,ASCII Name,Alternate Names,Latitude,Longitude,Feature Class,Feature Code,Country Code,Country Code 2,...,Admin3 Code,Admin4 Code,Population,Elevation,DIgital Elevation Model,Timezone,Modification date,Country,Coordinates,region
0,3818920,Tlapanalá,Tlapanala,"Tlapanala,Tlapanalá,Tlapenala,Tlapenalá",18.69585,-98.53561,P,PPLA2,MX,,...,,,2727,,1414,America/Mexico_City,2018-11-03,Mexico,"18.69585,-98.53561",Puebla
1,3820847,Carrillo Puerto,Carrillo Puerto,,19.094,-90.52279,P,PPL,MX,,...,,,2829,,64,America/Merida,2018-11-03,Mexico,"19.094,-90.52279",Coahuila de Zaragoza


## Adding geonames to dictionary

### Making sure region names match

In [463]:
# checking to make sure that all region names from mex_geonames.region is in dict
for x in list(mex_geonames.region.unique()):
    if unidecode.unidecode(x) not in list(mexico_regional_names_dict.keys()):
        print(x)
# mexico_regional_names_dict

### Adding names to dict

In [464]:
mex_geonames['Alternate Names'].dtype

dtype('O')

In [465]:
for x in range(mex_geonames.shape[0]):
    region = unidecode.unidecode(mex_geonames['region'][x])
    if mex_geonames['Name'][x] not in mexico_regional_names_dict[region]:
        mexico_regional_names_dict[region].append(mex_geonames['Name'][x])
    if mex_geonames['ASCII Name'][x] not in mexico_regional_names_dict[region]:
        mexico_regional_names_dict[region].append(mex_geonames['ASCII Name'][x])
    if type(mex_geonames['Alternate Names'][x]) == str:
        alt_names_list = mex_geonames['Alternate Names'][x].split(',')
        for word in alt_names_list:
            if word not in mexico_regional_names_dict[region]:
                mexico_regional_names_dict[region].append(word)

In [466]:
# count of how many values are in the dict
count_of_values = 0
for key, value in mexico_regional_names_dict.items():
    for item in value:
        count_of_values += 1

In [467]:
count_of_values

20370

## Mexican Cities (world_cities)

In [468]:
cities = pd.read_csv("./demonym_city_data/world-cities-master/data/world-cities.csv")

<IPython.core.display.Javascript object>

In [469]:
mex_city = cities[cities['country'] == 'Mexico']

In [470]:
mex_city.drop(columns = ['geonameid'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [471]:
mex_city.columns = ['city', 'country', 'region']

In [472]:
mex_city.reset_index(inplace= True, drop =True)

In [473]:
mex_city.head()

Unnamed: 0,city,country,region
0,Gustavo A. Madero,Mexico,Tamaulipas
1,San Fernando,Mexico,Tamaulipas
2,Zumpango,Mexico,México
3,Zumpango del Río,Mexico,Guerrero
4,Zacualtipán,Mexico,Hidalgo


In [474]:
mex_city.shape

(561, 3)

### Adding mexican cities to dict

In [475]:
mex_city['city'].dtype

dtype('O')

In [476]:
mexico_regional_names_dict.keys()

dict_keys(['Michoacan de Ocampo', 'Queretaro', 'San Luis Potosi', 'Coahuila de Zaragoza', 'Jalisco', 'Nuevo Leon', 'Morelos', 'Zacatecas', 'Aguascalientes', 'Guanajuato', 'Veracruz', 'Tamaulipas', 'Colima', 'Baja California', 'Sinaloa', 'Guerrero', 'Quintana Roo', 'Ciudad de Mexico', 'Campeche', 'Baja California Sur', 'Puebla', 'Tlaxcala', 'Oaxaca', 'Durango', 'Chiapas', 'Tabasco', 'Hidalgo', 'Mexico', 'Nayarit', 'Yucatan', 'Sonora', 'Chihuahua'])

In [477]:
# special cases are for the differences in region names
unused_cities = [] # used to check if this data is actually useful
for x in range(mex_city.shape[0]):
    region = unidecode.unidecode(mex_city['region'][x])
    word = mex_city['city'][x]
    if region == 'Mexico City':
        if word not in mexico_regional_names_dict['Ciudad de Mexico']:
            mexico_regional_names_dict['Ciudad de Mexico'].append(word)
    elif region == 'Michoacan':
        if word not in mexico_regional_names_dict['Michoacan de Ocampo']:
            mexico_regional_names_dict['Michoacan de Ocampo'].append(word)
    elif region == 'Coahuila':
        if word not in mexico_regional_names_dict['Michoacan de Ocampo']:
            mexico_regional_names_dict['Michoacan de Ocampo'].append(word)
    elif word not in mexico_regional_names_dict[region]:
        mexico_regional_names_dict[region].append(word)
        unused_cities.append(word)

In [478]:
# about half of the entries were not found already in the dict 
len(unused_cities)

255

## Mexican Demonyms

This was found using the table found in this wikipedia page: https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_of_place_names#States_of_Mexico. Then I entered that link into https://wikitable2csv.ggor.de and downloaded the table as a csv

In [479]:
mex_demonyms = pd.read_csv("./demonym_city_data/mexican_demonyms.csv", skip_blank_lines=True, skiprows = [1])

<IPython.core.display.Javascript object>

In [480]:
mex_demonyms.reset_index(inplace=True)

In [481]:
mex_demonyms.drop(index=32, inplace=True)

In [482]:
# no useful info from demonym.1
mex_demonyms.drop(columns=['Demonym.1'], inplace=True)

In [483]:
mex_demonyms.columns = ['region_in_spanish', "region_in_english", "adjective", "demonym"]

### Getting all the names into a single list

#### Fixing items in "region_in_spanish"

In [484]:
mex_demonyms['region_in_spanish'][14] = "Ciudad de México"

In [485]:
mex_demonyms['region_in_spanish'][15] = 'Michoacan de Ocampo'

In [486]:
mex_demonyms.head(2)

Unnamed: 0,region_in_spanish,region_in_english,adjective,demonym
0,Aguascalientes,Hydrocalid,Hidrocálido/-a / aguascalentense,
1,Baja California,Lower Californian,Bajacaliforniano/-a,


#### Making list that will host all of the demonyms

In [487]:
# once combined with the previous dataframe, we will have a dictionary with each region having a list of names that could refer to it
demonyms = [[x] for x in mex_demonyms['region_in_english'] ]

#### Making each cell only have one term

In [488]:
demonyms[0].append("aguascalentense")

In [489]:
mex_demonyms['adjective'][0] = "Hidrocálido/-a"

In [490]:
demonyms[12].append('Jalisquillo')

In [491]:
mex_demonyms['demonym'][12] = "Tapatio/ Tapatia"

In [492]:
# all of the regions_in_english are in the correct format
demonyms[20] = ['Pueblan', 'Poblano']

In [493]:
# mex_demonyms['adjective'][0] = "Hidrocálido/-a"
mex_demonyms['demonym'].replace(np.nan, '', regex=True, inplace=True)

In [494]:
## Making all of the -a names to be a seperate entry
for x in range(0, len(mex_demonyms)):
    multi_adjective = re.match(r"(\w+)[o]\/(?=(\-?|\s|\s\-)a)(.+)", mex_demonyms['adjective'][x])
    if multi_adjective:
        mex_demonyms['adjective'][x] = multi_adjective.group(1) + "o" + '/' \
        + multi_adjective.group(1) + "a"
    multi_dem = re.match(r"\"?(\w+)[o]\/(?=(\-?|\s|\s\-)a)", mex_demonyms['demonym'][x])
    if multi_dem:
        mex_demonyms['demonym'][x] = multi_dem.group(1) + "o" + '/' \
        + multi_dem.group(1) + "a"

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [495]:
# getting all of items to either be a single name or single name/-a
for x in range(0, len(mex_demonyms)):
    split_adj = mex_demonyms['adjective'][x].split("/")
    for y in split_adj:
        demonyms[x].append(y)
    split_dem = mex_demonyms['demonym'][x].split("/")
    for z in split_dem:
        a = z.strip('""')
        demonyms[x].append(a)

In [496]:
# getting rid of any empty entries
for x in demonyms:
    for y in x:
        if y == "":
            x.remove(y)

In [497]:
demonyms[0:3]

[['Hydrocalid', 'aguascalentense', 'Hidrocálido', 'Hidrocálida'],
 ['Lower Californian', 'Bajacaliforniano', 'Bajacaliforniana'],
 ['South Lower Californian', 'Sudcaliforniano', 'Sudcaliforniana']]

## Combining the demonym list to the dictionary

### Adding the regions to each demonym in demonym list

In [498]:
demonyms_w_region = list(zip(mex_demonyms['region_in_spanish'], demonyms))

In [499]:
demonyms_w_region[0]

('Aguascalientes',
 ['Hydrocalid', 'aguascalentense', 'Hidrocálido', 'Hidrocálida'])

In [500]:
mexico_regional_names_dict.keys()

dict_keys(['Michoacan de Ocampo', 'Queretaro', 'San Luis Potosi', 'Coahuila de Zaragoza', 'Jalisco', 'Nuevo Leon', 'Morelos', 'Zacatecas', 'Aguascalientes', 'Guanajuato', 'Veracruz', 'Tamaulipas', 'Colima', 'Baja California', 'Sinaloa', 'Guerrero', 'Quintana Roo', 'Ciudad de Mexico', 'Campeche', 'Baja California Sur', 'Puebla', 'Tlaxcala', 'Oaxaca', 'Durango', 'Chiapas', 'Tabasco', 'Hidalgo', 'Mexico', 'Nayarit', 'Yucatan', 'Sonora', 'Chihuahua'])

In [501]:
# adding demonyms
for x in demonyms_w_region:
    region = unidecode.unidecode(x[0]) # losing any accents
    if region in mexico_regional_names_dict:
        for y in x[1]:
            if y not in mexico_regional_names_dict[region]:
                mexico_regional_names_dict[region].append(y)
    elif region == 'State of Mexico':
        for y in x[1]:
            if y not in mexico_regional_names_dict["Mexico"]:
                mexico_regional_names_dict["Mexico"].append(y)

### Adding the state itself to the values 

In [502]:
for key, value in mexico_regional_names_dict.items():
    if key not in mexico_regional_names_dict[key]:
        mexico_regional_names_dict[key].append(key)
        print(key) # mostly works because the names aren't spelled as they would be in spanish (with accents)

Michoacan de Ocampo
Queretaro
San Luis Potosi
Coahuila de Zaragoza
Nuevo Leon
Tamaulipas
Baja California
Sinaloa
Guerrero
Quintana Roo
Ciudad de Mexico
Baja California Sur
Chiapas
Tabasco
Hidalgo
Mexico
Nayarit
Yucatan
Sonora


## Making all values in dem_dict to be lowercase 

In [503]:
mexico_regional_names_dict_lower = {}
for key, value in mexico_regional_names_dict.items():
    mexico_regional_names_dict_lower[key] = []
    for y in value:
        # I had an issue that the last item of each key was a sequence of lists
        if type(y) == str:
            lower_y = y.lower()
            mexico_regional_names_dict_lower[key].append(lower_y)

### Getting count of values in mexico_regional_names_dict_lower

In [504]:
counter = 0
for key, value in mexico_regional_names_dict_lower.items():
    for thing in value:
        counter += 1

In [505]:
counter

20786

## Removing values that are names of other regions

### Removing the values

In [523]:
for key in mexico_regional_names_dict_lower.keys():
    lower_key = key.lower()
    temp_keys = list(mexico_regional_names_dict_lower.keys())
    temp_keys.remove(key)
    for item in temp_keys:
        if key in mexico_regional_names_dict_lower[item]:
            mexico_regional_names_dict_lower[item].remove(key)
        if lower_key in mexico_regional_names_dict_lower[item]:
            mexico_regional_names_dict_lower[item].remove(lower_key)

In [521]:
# losing 66 problematic values
counter = 0
for key, value in mexico_regional_names_dict_lower.items():
    for thing in value:
        counter += 1
counter

20720

In [516]:
'jalisco' in mexico_regional_names_dict_lower['Ciudad de Mexico']

True

In [517]:
mexico_regional_names_dict_lower['Ciudad de Mexico'].remove('jalisco')

In [518]:
'jalisco' in mexico_regional_names_dict_lower['Ciudad de Mexico']

False

## Saving Dictionary

In [525]:
# with open ('demonym_dictionary.pickle', 'wb+') as f:
#     pickle.dump(mexico_regional_names_dict_lower, f)