In [1]:
# Python imports
import re
from unidecode import unidecode
from geonamescache import GeonamesCache
import pandas as pd

In [2]:
data_file = open('headlines.txt','r')
news_headlines = [headline.strip() for headline in data_file.readlines()]

In [3]:
# Check number of headlines
print(f"The data file contains {len(news_headlines)} headlines")

The data file contains 650 headlines


In [4]:
#   Helper class for parsing a new headline string in order to identify whether
#   a well-known country and/or city is mentioned in the news headline string
#   The constructor requires a valid instance of the GeonamesCache module.
#
class HeadlineParser:
    def __init__(self, geoCache):
        self.geoCache = geoCache
        self.city_list = None
        self.country_list = None
     
    def get_country_name(self, headline):
        return self.__get_geoname_from(headline, self.__country_search_list())
    
    def get_city_name(self, headline):
        return self.__get_geoname_from(headline, self.__city_search_list())

    # returns a tuple with country and city name mentioned in the given headline.
    def get_geoinfo(self, headline):
        return (self.get_country_name(headline), self.get_city_name(headline))
    
    def __city_search_list(self):
        if self.city_list is None:
            self.city_list = self.__get_geoname_list(self.geoCache.get_cities().values())
        return self.city_list
    
    def __country_search_list(self):
        if self.country_list is None:
            self.country_list = self.__get_geoname_list(self.geoCache.get_countries().values())
        return self.country_list
    
    def __get_geoname_list(self, geoitems_list):
        geo_item_dict = { self.__create_location_regex(geo_item['name']) : geo_item['name'] for geo_item in geoitems_list }
        sorted_list = sorted(geo_item_dict.items(), key=lambda x: x[1])
        return sorted_list
    
    def __get_geoname_from(self, headline, geoname_search_list):
        for geoname_regex, geoname in geoname_search_list:
            if geoname_regex.search(headline):
                return geoname
        return None
    
    def __create_location_regex(self, location_name):
        name_in_ascii = unidecode(location_name)
        if location_name != name_in_ascii:
            regex = fr'\b({location_name}|{name_in_ascii})\b'
        else:
            regex = fr'\b({location_name})\b'
        return re.compile(regex, flags=re.IGNORECASE)

In [5]:
# Check HeadlineParser functionality
hl_parser = HeadlineParser(GeonamesCache())

hline1 = 'A headline containing a reference to New york city, United States'
(country1, city1) = hl_parser.get_geoinfo(hline1)
print(f"Test headline1: identified the following city and country name: {city1}/{country1}")


hline2 = 'A headline without a known city mentioned in Australia'
(country2, city2) = hl_parser.get_geoinfo(hline2)
print(f"Test headline1: identified the following city and country name: {city2}/{country2}")


hline3 = 'Some breaking news happened in Munich'
(country3, city3) = hl_parser.get_geoinfo(hline3)
print(f"Test headline1: identified the following city and country name: {city3}/{country3}")

Test headline1: identified the following city and country name: New York City/United States
Test headline1: identified the following city and country name: None/Australia
Test headline1: identified the following city and country name: Munich/None


In [6]:
# Extract country and city per headline (leaving country undefined if not mentioned so far)
country_per_headline = [hl_parser.get_country_name(headline) for headline in news_headlines]
city_per_headline = [hl_parser.get_city_name(headline) for headline in news_headlines]

In [7]:
# Create Pandas dataframe from headlines and enriched data
enriched_data = { 'headline' : news_headlines, 'city' : city_per_headline, 'country' : country_per_headline }
headlines_df = pd.DataFrame(enriched_data)

In [8]:
summary = headlines_df.describe()
print(summary)

                                    headline city   country
count                                    650  619        15
unique                                   647  510        10
top     Spanish Flu Spreading through Madrid   Of  Malaysia
freq                                       2   45         3


In [9]:
print(headlines_df)

                                              headline           city country
0                             Zika Outbreak Hits Miami          Miami    None
1                      Could Zika Reach New York City?  New York City    None
2                    First Case of Zika in Miami Beach          Miami    None
3              Mystery Virus Spreads in Recife, Brazil         Recife  Brazil
4              Dallas man comes down with case of Zika         Dallas    None
..                                                 ...            ...     ...
645  Rumors about Rabies spreading in Jerusalem hav...      Jerusalem    None
646              More Zika patients reported in Indang         Indang    None
647  Suva authorities confirmed the spread of Rotav...             Of    None
648         More Zika patients reported in Bella Vista    Bella Vista    None
649                     Zika Outbreak in Wichita Falls        Wichita    None

[650 rows x 3 columns]
