In [2]:
import geonamescache
import pandas as pd
import numpy as np
import re
from unidecode import unidecode

gc = geonamescache.GeonamesCache()

In [3]:
###
# Countries
###
countries = [country["name"] for country in gc.get_countries().values()]
country_accent_mapping = {
    unidecode(country): country for country in countries
}
unaccented_countries = set(country_accent_mapping.keys())
unaccented_countries = sorted(unaccented_countries, key=lambda x: len(x), reverse=True)
countries_re = r"\b" + r"\b|\b".join(unaccented_countries) + r"\b"
comp_country_re = re.compile(countries_re)

###
# Cities
###

cities = [city["name"] for city in gc.get_cities().values()]
city_accent_mapping = {
    unidecode(city): city for city in cities
}
unaccented_cities = list(city_accent_mapping.keys())
unaccented_cities = sorted(unaccented_cities, key=lambda x: len(x), reverse=True)
city_re = r"\b" + r"\b|\b".join(unaccented_cities) + r"\b"
comp_city_re = re.compile(city_re)



In [4]:
###
# Get file data
###
data = []
file = open("data/headlines.txt", 'r')
for line in file.readlines():
    data.append(unidecode(line.strip()))

file.close()

In [5]:
# Dataframe columns
df_headline = []
df_country = []
df_city = []

for line in data:
    df_headline.append(line)
    country_result = comp_country_re.search(line)
    if country_result is not None:
        df_country.append(country_result.group(0))
    else:
        df_country.append(np.NaN)
    
    city_result = comp_city_re.search(line)
    if city_result is not None:
        df_city.append(city_result.group(0))
    else:
        df_city.append(np.NaN)
    
df = pd.DataFrame({'headline': df_headline, 'country': df_country, 'city': df_city})

df.to_csv("data/processedheadlines.csv")

print(df)



                                              headline country           city
0                             Zika Outbreak Hits Miami     NaN          Miami
1                      Could Zika Reach New York City?     NaN  New York City
2                    First Case of Zika in Miami Beach     NaN    Miami Beach
3              Mystery Virus Spreads in Recife, Brazil  Brazil         Recife
4              Dallas man comes down with case of Zika     NaN         Dallas
..                                                 ...     ...            ...
645  Rumors about Rabies spreading in Jerusalem hav...     NaN      Jerusalem
646              More Zika patients reported in Indang     NaN         Indang
647  Suva authorities confirmed the spread of Rotav...     NaN           Suva
648         More Zika patients reported in Bella Vista     NaN    Bella Vista
649                     Zika Outbreak in Wichita Falls     NaN  Wichita Falls

[650 rows x 3 columns]


In [7]:
new_df = pd.read_csv("data/processedheadlines.csv")

print(new_df)

     Unnamed: 0                                           headline country  \
0             0                           Zika Outbreak Hits Miami     NaN   
1             1                    Could Zika Reach New York City?     NaN   
2             2                  First Case of Zika in Miami Beach     NaN   
3             3            Mystery Virus Spreads in Recife, Brazil  Brazil   
4             4            Dallas man comes down with case of Zika     NaN   
..          ...                                                ...     ...   
645         645  Rumors about Rabies spreading in Jerusalem hav...     NaN   
646         646              More Zika patients reported in Indang     NaN   
647         647  Suva authorities confirmed the spread of Rotav...     NaN   
648         648         More Zika patients reported in Bella Vista     NaN   
649         649                     Zika Outbreak in Wichita Falls     NaN   

              city  
0            Miami  
1    New York City  
