import pickle
import pandas as pd
# ^^^ pyforest auto-imports - don't write above this line
# Imports

In [1]:
import unidecode
import wikipedia
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load every restaurant/region match saved earlier. The first CSV column is
# read as the index and then discarded in favour of a fresh 0..n-1 index.
original_rest_matches_df = pd.read_csv(
    "../my_saved_data/All_restaurant_matches_df.csv", index_col=0
).reset_index(drop=True)

<IPython.core.display.Javascript object>

In [3]:
# Load the saved demonym dictionary (its keys are used below as the list of
# valid state names). Opened read-only: nothing is written back, so "rb" is
# enough and also works when the data file is not writable.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open("../my_saved_data/demonym_dictionary.pickle", "rb") as f:
    demonym_dictionary = pickle.load(f)

<IPython.core.display.Javascript object>

## Removing any mistaken entries

In [4]:
# Load the collection of matched words that were previously judged to be wrong.
# Opened read-only: nothing is written back, so "rb" is enough and also works
# when the data file is not writable.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open("../my_saved_data/incorrect_matches.pickle", "rb") as f:
    incorrect_matches = pickle.load(f)

<IPython.core.display.Javascript object>

In [5]:
# Keep only the rows whose matched word is NOT in the known-incorrect list,
# then renumber the rows so later positional loops can rely on a 0..n-1 index.
is_incorrect_match = original_rest_matches_df['Matched_word'].isin(incorrect_matches)
rest_matches_df = original_rest_matches_df[~is_incorrect_match].reset_index(drop=True)

# Checking the matched word individually

In [6]:
# Rows whose Multiple_regions_flag is True, and the distinct matched words in them.
# NOTE(review): the rows displayed further down show the flag as True for rows
# with a single Region and False for a row listing several - confirm what the
# flag actually encodes before trusting the variable names here.
flag_is_true = rest_matches_df['Multiple_regions_flag'].eq(True)
mult_regions_df = rest_matches_df[flag_is_true]
problematic_matched_words = list(mult_regions_df['Matched_word'].unique())

In [7]:
# How many distinct matched words occur in the flagged (flag == True) rows?
len(problematic_matched_words)

155

In [8]:
# mult_regions_df[mult_regions_df['Matched_word'] == 'leon']

In [9]:
# problematic_matched_words

In [10]:
# Rows whose Multiple_regions_flag is False, and the distinct matched words in
# them (unique() already de-duplicates; the set only changes the ordering).
flag_is_false = rest_matches_df['Multiple_regions_flag'].eq(False)
single_regions_df = rest_matches_df[flag_is_false]
possible_problematic_matched_words = list(set(single_regions_df['Matched_word'].unique()))

In [11]:
# How many distinct values does the Region column take among the flag == False rows?
single_regions_df['Region'].nunique()

18

In [12]:
# single_regions_df[single_regions_df['Matched_word'] == 'bajio']

In [13]:
# possible_problematic_matched_words

# Using Wikipedia to correct any errors in data

In [14]:
# Collect every distinct matched word from the rows flagged True into one list
# (order is arbitrary because it goes through a set), then show how many there are.
flagged_words = rest_matches_df.loc[rest_matches_df['Multiple_regions_flag'], 'Matched_word']
multi_region_matches = list(set(flagged_words))
len(multi_region_matches)

155

In [15]:
# The keys of the demonym dictionary serve as the list of valid state names.
state_names = [state for state in demonym_dictionary.keys()]

In [16]:
# Replacement dictionary: one key per matched word, every value starting as None
# ("no region decided yet"). Later cells fill in the region for each word.
replacement_dict = dict.fromkeys(rest_matches_df['Matched_word'])

In [17]:
# Spot check: count the commas in the Region string of row 6. A Region with
# commas lists several candidate regions; zero commas means a single region.
rest_matches_df['Region'][6].count(",")

5

In [18]:
# Words that are clearly tied to one region: whenever a row's Region string
# contains no comma (i.e. names a single region), record it for that word.
for matched_word, region in zip(rest_matches_df['Matched_word'], rest_matches_df['Region']):
    if "," not in region:
        replacement_dict[matched_word] = region

## Getting summaries from wikipedia and sorting results

In [19]:
# Look up every flagged matched word on Wikipedia and sort the outcome into buckets.
#
# `summaries` holds exactly one slot per entry of multi_region_matches, at the
# same position: the summary text when a usable page was found, otherwise the
# integer position as a placeholder. Keeping the two lists aligned is what lets
# later inspection pair summaries[i] with multi_region_matches[i].
summaries = []
somewhat_ambiguous_entries = []  # positions whose summary never contains " word "
ambiguous_entries = []           # (position, options) for unresolved disambiguation pages
not_in_wikipedia = []            # (position, word) for words with no page at all
for idx, match in enumerate(multi_region_matches):
    try:
        summary = wikipedia.summary(match)
        # NOTE(review): this test is case-sensitive and needs a space on both
        # sides; the matched words shown in this notebook are lowercase, so a
        # capitalised mention in the summary will not count - confirm intended.
        str_match = " " + match + " "
        if str_match in summary:
            summaries.append(summary)
        else:
            summaries.append(idx)
            somewhat_ambiguous_entries.append(idx)
    except wikipedia.exceptions.DisambiguationError as e:
        # Prefer the "<word> (state)" page when the disambiguation offers one.
        if match + " (state)" in e.options:
            summary = wikipedia.summary(match + " (state)")
            summaries.append(summary)
        else:
            summaries.append(idx)
            ambiguous_entries.append((idx, e.options))
    except wikipedia.exceptions.PageError:
        # Fix: also leave a placeholder here. Previously nothing was appended,
        # so every later slot in `summaries` was shifted by one relative to
        # multi_region_matches.
        summaries.append(idx)
        not_in_wikipedia.append((idx, match))

In [20]:
# not_in_wikipedia

In [21]:
# Manually assign a region to the words that had no Wikipedia page.
replacement_dict.update({
    'lerma': 'Mexico',
    'chalmita': 'Mexico',
})

In [22]:
# Count the matched words that still have no region (value None) assigned.
none_counter = sum(1 for region in replacement_dict.values() if region is None)
none_counter

41

## Adding entries that were found in wikipedia to dictionary

In [23]:
# How many lookups produced a real summary? Integer entries in `summaries` are
# placeholders for failed lookups, so everything that is not an int is counted.
summaries_count = [entry for entry in summaries if type(entry) is not int]
found_total = len(summaries_count)
print(f"wikipedia found {found_total} entries")

wikipedia found 7 entries


In [24]:
# Hand-made decisions for the words whose Wikipedia summary was read manually.
# None means "not tied to a region": those words are removed from the data later.
summaries_dict = {
    'corralito': None,
    'copal': None,
    'poblano': 'Puebla',
    'guerrerense': 'Guerrero',
    'candela': None,
    'leon': 'Nuevo Leon',
    # Fix: Morelia is the capital of Michoacan, not a place in Puebla.
    'morelia': 'Michoacan',
    'coatzingo': None,
}

In [25]:
# for idx, x in enumerate(summaries):
#     if type(x) == str:
#         print(idx)
#         print(f"The entry is {multi_region_matches[idx]}")
#         print(x)

In [26]:
# Apply the manual decisions. This overwrites any region already stored for
# these words, including resetting a word to None when summaries_dict says None.
replacement_dict.update(summaries_dict)

In [27]:
# Re-count the matched words without a region now that summaries_dict was applied.
none_counter = len([word for word, region in replacement_dict.items() if region is None])
none_counter

44

## Googling Remaining Entries Without a Region

In [28]:
# Words that still need a manual (Google) lookup: no region assigned yet and
# not already decided by hand in summaries_dict. Every value starts as None.
google_search_entries = {
    word: None
    for word, region in replacement_dict.items()
    if not region and word not in summaries_dict
}

In [29]:
# Regions found by searching the remaining words manually.
# 'carmen' could refer to Playa del Carmen, but it clearly does not here, so it
# is deliberately not given a region in this mapping.
# Note: this rebinds google_search_entries, replacing whatever it held before.
google_search_entries = {
    "juarez": "Chihuahua",
    "chinantla": "Puebla",
    "xochimilco": "Mexico City",
    "coyoacan": "Mexico City",
    "cholula": "Puebla",
    "matamoros": "Tamaulipas",
    "acatlan": "Hidalgo",
    "ocotlan": "Jalisco",
    "atotonilco": "Guanajuato",
    "bustamante": "Nuevo Leon",
    "ayutla": "Jalisco",
    "arandas": "Jalisco",
    "maravatio": "Michoacan",
    "uruapan": "Michoacan",
    "cocula": "Michoacan",
    "camargo": "Chihuahua",
    "cuernavaca": "Morelos",
    "polanco": "Mexico City",
    "taxco": "Guerrero",
    "zaragoza": "Coahuila",
    "monterrey": "Nuevo Leon",
    "tenochtitlan": "Mexico City",
    "aquila": "Michoacan",
    "tuxpan": "Veracruz",
    "toluca": "Mexico",
    "ixtapa": "Guerrero",
    "apatzingan": "Michoacan",
    "quiroga": "Michoacan",
    "tuzantla": "Michoacan",
    "ahualulco": "San Luis Potosi",
    "picacho": "Guanajuato",
    "masatlan": "Sinaloa",
}
# Merge the manual findings into the main word -> region mapping.
replacement_dict.update(google_search_entries)

## Replacing/Deleting Entries from DataFrame

In [30]:
# Sanity check: every assigned region must be one of the known state names.
# Words without a region (None) are skipped; anything else gets reported.
for matched_word, region in replacement_dict.items():
    if region is None or region in state_names:
        continue
    print(f"{matched_word} has {region} spelt incorrectly")

In [31]:
# Display the match table as it stands before any rows are removed.
rest_matches_df

Unnamed: 0,Restaurant,City,Matched_word,Region,Multiple_regions_flag,Final_region
0,aqui en bella puebla mexican restaurant,NYC,puebla,Puebla,True,
1,san jose tulcingo deli,NYC,tulcingo,Puebla,True,
2,cuautla morelos restaurant,NYC,morelos,Morelos,True,
3,tulcingo,NYC,tulcingo,Puebla,True,
4,estrellita poblana ii,NYC,poblana,Puebla,True,
...,...,...,...,...,...,...
492,loving cup divisadero,SF,divisadero,Sonora,True,
493,el rincon yucateco,SF,yucateco,Yucatan,True,
494,la taqueria guadalajara,SF,guadalajara,Jalisco,True,
495,barranco catering,SF,barranco,San Luis Potosi,True,


In [32]:
# Drop every row whose matched word ended up without a region (value None in
# replacement_dict), renumber the rows, and show the resulting shape.
# (Original author's note: removed 31 entries.)
entries_to_remove = [word for word, region in replacement_dict.items() if region is None]
has_region = ~rest_matches_df['Matched_word'].isin(entries_to_remove)
final_rest_matches_df = rest_matches_df[has_region].reset_index(drop=True)
final_rest_matches_df.shape

(466, 6)

In [33]:
# Display the table after the region-less matched words were removed
# (Final_region has not been filled in yet at this point).
final_rest_matches_df

Unnamed: 0,Restaurant,City,Matched_word,Region,Multiple_regions_flag,Final_region
0,aqui en bella puebla mexican restaurant,NYC,puebla,Puebla,True,
1,san jose tulcingo deli,NYC,tulcingo,Puebla,True,
2,cuautla morelos restaurant,NYC,morelos,Morelos,True,
3,tulcingo,NYC,tulcingo,Puebla,True,
4,estrellita poblana ii,NYC,poblana,Puebla,True,
...,...,...,...,...,...,...
461,loving cup divisadero,SF,divisadero,Sonora,True,
462,el rincon yucateco,SF,yucateco,Yucatan,True,
463,la taqueria guadalajara,SF,guadalajara,Jalisco,True,
464,barranco catering,SF,barranco,San Luis Potosi,True,


In [34]:
# Fill Final_region from replacement_dict with one whole-column assignment.
#
# The previous row-by-row `df['Final_region'][idx] = ...` was chained-indexing
# assignment: pandas flags it with SettingWithCopyWarning (hidden here by the
# blanket warnings filter), and with Copy-on-Write enabled it no longer writes
# into the DataFrame at all.
# Words missing from replacement_dict keep their current Final_region value,
# exactly as the old `if word in replacement_dict` guard did.
has_replacement = final_rest_matches_df['Matched_word'].isin(list(replacement_dict))
final_rest_matches_df['Final_region'] = (
    final_rest_matches_df['Matched_word']
    .map(replacement_dict)
    .where(has_replacement, final_rest_matches_df['Final_region'])
)

In [35]:
# Preview the first 15 rows to check that Final_region is now populated.
final_rest_matches_df.head(15)

Unnamed: 0,Restaurant,City,Matched_word,Region,Multiple_regions_flag,Final_region
0,aqui en bella puebla mexican restaurant,NYC,puebla,Puebla,True,Puebla
1,san jose tulcingo deli,NYC,tulcingo,Puebla,True,Puebla
2,cuautla morelos restaurant,NYC,morelos,Morelos,True,Morelos
3,tulcingo,NYC,tulcingo,Puebla,True,Puebla
4,estrellita poblana ii,NYC,poblana,Puebla,True,Puebla
5,cienega las tlayudas de oaxaca,NYC,oaxaca,Oaxaca,True,Oaxaca
6,viva juarez restaurant,NYC,juarez,"Tamaulipas,Mexico City,Chiapas,Nuevo Leon,Mich...",False,Chihuahua
7,atlixco bakery & deli,NYC,atlixco,Puebla,True,Puebla
8,tacos el nopal,NYC,nopal,Chiapas,True,Chiapas
9,el paisa tepeaca,NYC,tepeaca,Puebla,True,Puebla


## Dropping columns

In [36]:
# Drop the helper columns now that Final_region carries the decided region.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# re-runnable: running it a second time no longer raises KeyError because the
# columns are already gone.
final_rest_matches_df = final_rest_matches_df.drop(
    columns=['Region', 'Multiple_regions_flag'], errors='ignore'
)

## Saving all_correct_restaurants 

In [37]:
# Persist the cleaned restaurant -> region table. The row index is written as
# the first column because `index` is left at its default.
final_rest_matches_df.to_csv(
    path_or_buf="../my_saved_data/all_correct_restaurants_and_regions.csv"
)