# Pull location names from headlines

In [28]:
import os
import re
import unidecode

import pandas as pd
import geonamescache

In [29]:
def clean_text(txt):
    """Return ``txt`` passed through ``unidecode.unidecode`` and lower-cased."""
    transliterated = unidecode.unidecode(txt)
    return transliterated.lower()

In [30]:
# Read in data
headlines_file = os.path.join("data", "headlines.txt")

# Decode explicitly as UTF-8: the headlines contain accented place names, and
# the platform default encoding (e.g. cp1252 on Windows) would garble them
# before clean_text can transliterate them.
with open(headlines_file, encoding="utf-8") as file:
    # One cleaned headline per input line, surrounding whitespace removed.
    lines = [clean_text(line.strip()) for line in file]

In [31]:
# Number of headlines read from the file
len(lines)

650

In [32]:
# Preview the first 20 cleaned headlines, one per line.
# A plain loop is used because printing is a side effect; a list comprehension
# would build a throwaway list of None values.
for line in lines[:20]:
    print(line)

zika outbreak hits miami
could zika reach new york city?
first case of zika in miami beach
mystery virus spreads in recife, brazil
dallas man comes down with case of zika
trinidad confirms first zika case
zika concerns are spreading in houston
geneve scientists battle to find cure
the cdc in atlanta is growing worried
zika infested monkeys in sao paulo
brownsville teen contracts zika virus
mosquito control efforts in st. louis take new tactics with zika threat
san juan reports 1st u.s. zika-related death amid outbreak
flu outbreak in galveston, texas
zika alert - manila now threatened
zika afflicts 7 in iloilo city
new los angeles hairstyle goes viral
louisiana zika cases up to 26
orlando volunteers aid zika research
zika infects pregnant woman in cebu


**Potential problems**  
 - Locations at different levels in the taxonomy - Orlando/Florida/US
 - Multiple locations in headline
 - Split names - New York
 - Case (re.IGNORECASE)
 - Punctuation
 - Some headlines aren't about viruses, e.g. "hairstyle goes viral"
 - Misspellings and non-standard names
 

In [33]:
# Get location name lists
gc = geonamescache.GeonamesCache()

# Names are normalised with clean_text (ASCII transliteration + lower case),
# the same helper applied to the headlines, so both sides compare like for like.
# A set removes duplicates; sorting gives a stable order across kernel restarts.
city_data = gc.get_cities()
cities = sorted({clean_text(city_data[city_id]['name']) for city_id in city_data})

# Iterating the result of get_countries_by_names() yields its keys
# (presumably the country names -- confirm against geonamescache).
countries = sorted({clean_text(country) for country in gc.get_countries_by_names()})

In [34]:
# Count of unique cleaned country names and city names
len(countries), len(cities)

(252, 23022)

In [35]:
# Get regex
def _location_alternation(names):
    """Join location names into a regex alternation that is safe to compile.

    Each name is passed through ``re.escape`` so characters such as ``.`` or
    ``(`` match literally. Empty names are dropped, because an empty
    alternative would match at every word boundary. Names are ordered longest
    first (ties alphabetical): regex alternation takes the first alternative
    that matches at a position, so "new york city" must come before "new york".
    """
    ordered = sorted({name for name in names if name}, key=lambda name: (-len(name), name))
    return '|'.join(re.escape(name) for name in ordered)


city_list = _location_alternation(cities)
city_regex = re.compile(r'\b(' + city_list + r')\b', flags=re.IGNORECASE)

country_list = _location_alternation(countries)
country_regex = re.compile(r'\b(' + country_list + r')\b', flags=re.IGNORECASE)

In [36]:
def find_location(line, location_regex):
    """Return the longest string matched by ``location_regex`` in ``line``.

    Parameters
    ----------
    line : str
        Text to search. A value the pattern cannot search (e.g. ``None`` or
        NaN) is treated as having no match.
    location_regex : compiled regular expression
        Pattern whose ``findall`` results are the candidate names. Patterns
        with several groups are supported: the tuples returned by ``findall``
        are flattened before choosing.

    Returns
    -------
    str or None
        The longest non-empty match (the first one found on ties), or ``None``
        when nothing matches.
    """
    try:
        match_list = location_regex.findall(line)
    except TypeError:
        # findall rejects non-text input; report "no location" instead of failing.
        return None

    candidates = []
    for match in match_list:
        if isinstance(match, tuple):
            # Multi-group pattern: each match is a tuple of group strings.
            candidates.extend(match)
        else:
            candidates.append(match)

    # Groups that did not take part in a match come back as empty strings.
    candidates = [candidate for candidate in candidates if candidate]
    if not candidates:
        return None
    return max(candidates, key=len)

In [37]:
# Build a one-column frame with one cleaned headline per row
df = pd.DataFrame({'headline': lines})
# Show the first five rows
df.head(5)

Unnamed: 0,headline
0,zika outbreak hits miami
1,could zika reach new york city?
2,first case of zika in miami beach
3,"mystery virus spreads in recife, brazil"
4,dallas man comes down with case of zika


In [38]:
# Add in the city and country columns.
# Only the headline text is needed, so apply find_location over that column
# directly instead of going row by row through DataFrame.apply(axis=1), which
# builds a Series for every row.
df['city'] = df['headline'].apply(find_location, args=(city_regex,))
df['country'] = df['headline'].apply(find_location, args=(country_regex,))

In [39]:
# Preview the first 20 rows, including the extracted city and country columns
df.head(20)

Unnamed: 0,headline,city,country
0,zika outbreak hits miami,miami,
1,could zika reach new york city?,new york city,
2,first case of zika in miami beach,miami beach,
3,"mystery virus spreads in recife, brazil",recife,brazil
4,dallas man comes down with case of zika,dallas,
5,trinidad confirms first zika case,trinidad,
6,zika concerns are spreading in houston,houston,
7,geneve scientists battle to find cure,geneve,
8,the cdc in atlanta is growing worried,atlanta,
9,zika infested monkeys in sao paulo,sao paulo,
