Loading headline data:

In [1]:
headline_file = open('headlines.txt', 'r')
headlines = [line.strip() for line in headline_file.readlines()]
num_headlines = len(headlines)
print(f"{num_headlines} headlines have been loaded")

650 headlines have been loaded


Extract city and country names:
    1. transform each location name into a case and accent-independent regular expression

In [2]:
import re
from unidecode import unidecode

def name_to_regex(name):
    decoded_name = unidecode(name)
    if name != decoded_name:
        regex = fr'\b({name}|{decoded_name})\b'
    else:
        regex = fr'\b{name}\b'
    return re.compile(regex, flags=re.IGNORECASE)

    2. Create mapping between regex and original names in GeoNamesCache

In [3]:
from geonamescache import GeonamesCache

gc = GeonamesCache()
countries = [country['name'] for country in gc.get_countries().values()]
country_to_name = {name_to_regex(name): name for name in countries}

cities = [city['name'] for city in gc.get_cities().values()]
city_to_name = {name_to_regex(name): name for name in cities}

    3. Use mappings to look for location names in text and summarize data

In [6]:
def get_name_in_text(text, dictionary):
    for regex, name in sorted(dictionary.items(), key=lambda x: x[1]):
        if regex.search(text):
            return name
    return None

In [8]:
import pandas as pd

matched_countries = [get_name_in_text(headline, country_to_name) for headline in headlines]
matched_cities = [get_name_in_text(headline, city_to_name) for headline in headlines]

data = {'Headline': headlines, 'City': matched_cities, 'Country': matched_countries}
df = pd.DataFrame(data)

In [9]:
summary = df[['City', 'Country']].describe()
print(summary)
# doesn't seem right!
# The 45 instances of Of are more likely to match the preposition than the rarely referenced Turkish location.

       City Country
count   619      15
unique  510      10
top      Of  Brazil
freq     45       3


In [10]:
of_cities = df[df.City == 'Of'][['City', 'Headline']]
ten_of_cities = of_cities.head(10)
print(ten_of_cities.to_string(index=False))

City                                           Headline
 Of              Case of Measles Reported in Vancouver
 Of  Authorities are Worried about the Spread of Br...
 Of  Authorities are Worried about the Spread of Ma...
 Of  Rochester authorities confirmed the spread of ...
 Of     Tokyo Encounters Severe Symptoms of Meningitis
 Of  Authorities are Worried about the Spread of In...
 Of            Spike of Pneumonia Cases in Springfield
 Of  The Spread of Measles in Spokane has been Conf...
 Of                    Outbreak of Zika in Panama City
 Of    Urbana Encounters Severe Symptoms of Meningitis


    4. Find multi-city headlines

In [14]:
def get_cities_in_headline(headline):
    cities_in_headline = set()
    for regex, name in city_to_name.items():
        match = regex.search(headline)
        if match:
            if headline[match.start()].isupper():
                cities_in_headline.add(name)
    return list(cities_in_headline)

df['Cities'] = df['Headline'].apply(get_cities_in_headline)
df['Num_cities'] = df['Cities'].apply(len)
df_multiple_cities = df[df.Num_cities > 1]
num_rows, _ = df_multiple_cities.shape
print(f"{num_rows} headlines match multiple cities")

69 headlines match multiple cities


In [17]:
ten_cities = df_multiple_cities[['Cities', 'Headline']].head(10)
print(ten_cities.to_string(index=False))

Cities                                           Headline
        [New York City, York]                    Could Zika Reach New York City?
         [Miami Beach, Miami]                  First Case of Zika in Miami Beach
              [San, San Juan]  San Juan reports 1st U.S. Zika-related death a...
   [Los Ángeles, Los Angeles]               New Los Angeles Hairstyle goes Viral
                 [Bay, Tampa]              Tampa Bay Area Zika Case Count Climbs
       [Ho, Ho Chi Minh City]     Zika cases in Vietnam's Ho Chi Minh City surge
             [San Diego, San]           Key Zika Findings in San Diego Institute
          [Kuala Lumpur, Hīt]                 Kuala Lumpur is Hit By Zika Threat
         [San Francisco, San]                   Zika Virus Reaches San Francisco
[San Salvador, Salvador, San]                       Zika worries in San Salvador


In [18]:
def get_longest_city(cities):
    if cities:
        return max(cities, key=len)
    return None

df['City'] = df['Cities'].apply(get_longest_city)

In [19]:
short_cities = df[df.City.str.len() <= 4][['City', 'Headline']]
print(short_cities.to_string(index=False))

City                                           Headline
Lima                Lima tries to address Zika Concerns
Pune                     Pune woman diagnosed with Zika
Rome  Authorities are Worried about the Spread of Ma...
Molo                Molo Cholera Spread Causing Concern
Miri                               Zika arrives in Miri
Nadi  More people in Nadi are infected with HIV ever...
Baud  Rumors about Tuberculosis Spreading in Baud ha...
Kobe                     Chikungunya re-emerges in Kobe
Waco                More Zika patients reported in Waco
Erie                        Erie County sets Zika traps
Kent                       Kent is infested with Rabies
Reno  The Spread of Gonorrhea in Reno has been Confi...
Sibu                      Zika symptoms spotted in Sibu
Baku    The Spread of Herpes in Baku has been Confirmed
Bonn  Contaminated Meat Brings Trouble for Bonn Farmers
Jaén                         Zika Troubles come to Jaen
Yuma                       Zika seminars in Yuma