# ADA Project - Journal location extraction

Given the enhanced journal attributes dataset, this notebook finds the latitude and longitude of as many journals as possible.

## Setup

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Root folders: everything lives under the local data directory, and the
# shared newspaper data sits in a sub-tree of it.
BASE_PATH = "../data/"
MOUNT_PATH = f"{BASE_PATH}mnt/ada/newspapers/"

# Inputs read by this notebook.
CLUSTER_PATH = f"{BASE_PATH}clusters.csv.bz2"
JOURNAL_SEMANTIC_PATH = f"{MOUNT_PATH}journals-semantic.csv.bz2"
JOURNAL_ATTRS_PATH = f"{MOUNT_PATH}journal_attributes.json"

# Outputs written by this notebook.
JOURNAL_LOCATIONS = f"{MOUNT_PATH}journal_locations.csv"
COUNTRY_LOCATIONS = f"{MOUNT_PATH}country_locations.csv"
CLUSTER_LOCATIONS = f"{MOUNT_PATH}cluster_locations.csv.bz2"

We read the cluster dataset, which assigns a cluster ID to each journal, with $-1$ being unclustered.

In [3]:
# Load only the journal -> cluster assignment, indexed by journal.
# The column deliberately keeps its on-disk name `cluster_id`: a bare
# `clusters.rename(...)` whose result is not assigned changes nothing, and the
# tables built from this frame are displayed and exported with `cluster_id`.
clusters = pd.read_csv(CLUSTER_PATH, index_col='journal', usecols=['journal', 'cluster_id'])
clusters.head()

Unnamed: 0_level_0,cluster_id
journal,Unnamed: 1_level_1
1011now.com,-1.0
1070thefan.com,-1.0
107jamz.com,6.0
10news.com,14.0
1130thetiger.com,-1.0


We also read the first value of each entry in the `is located in` column of the journal attributes. This is a slight limitation of our method, since the first listed value might not be the _main_ location of the journal.

In [4]:
# Journal attributes, with the index labelled so later joins line up on 'journal'.
journal_attrs = pd.read_json(JOURNAL_ATTRS_PATH).rename_axis('journal')

# Spread each journal's list of locations over columns (column 0 = first entry).
locations = pd.DataFrame(journal_attrs['is located in'].to_list(), index=journal_attrs.index)

# Keep the first listed location of every journal that has at least one.
first_loc = locations[0].dropna()

first_loc.head()

journal
1070thefan.com     Q1415
107jamz.com        Q1588
3aw.com.au        Q36687
4bc.com.au        Q36074
570news.com        Q1904
Name: 0, dtype: object

In [17]:
from wiki_helpers import Property
from IPython.display import display, clear_output

# Number of `find_lat_lon` calls made so far; reset to 0 before each batch run.
_itercount = 0

# Coordinates already resolved during this kernel session, keyed by QID.
# Several journals can share the same location, so this avoids repeating the
# same online request.
_coords_cache = {}

def find_lat_lon(qid, dataset=first_loc):
    """Given a qid, return its latitude and longitude.

    Parameters
    ----------
    qid : hashable
        Identifier of the entity to look up (e.g. ``"Q30"``).
    dataset : sized collection, optional
        Collection being iterated over; only its length is used, for the
        "current/total" progress display.

    Returns
    -------
    list
        ``[latitude, longitude]`` taken from the first result of the "P625"
        lookup, or ``[nan, nan]`` when the lookup returns nothing.
    """
    # Print progress: one tick per call, whether or not the answer is cached,
    # so the counter still ends at len(dataset).
    global _itercount
    _itercount += 1

    clear_output(wait=True)
    display(f"Iteration: {_itercount}/{len(dataset)}")

    # Reuse an earlier answer instead of querying again; hand back a copy so
    # callers cannot alter the cached value.
    if qid in _coords_cache:
        return list(_coords_cache[qid])

    # Find coordinates
    coords_prop = Property("P625", "coordinates",
                           extractor=lambda x: [x['latitude'], x['longitude']])

    data = coords_prop.find_online(qid)
    if not data:
        # Empty results are not cached, so a later call can retry the lookup.
        return [np.nan, np.nan]

    _coords_cache[qid] = data[0]
    return list(data[0])

We look up the coordinates (property `P625`) of each journal's first location on Wikidata.

In [16]:
# Restart the progress counter so that re-running this cell still reports
# "k/len(first_loc)" (the country lookup further down resets it the same way).
_itercount = 0

# One [lat, lon] pair per journal, in the order of `first_loc`.
loc_tuples = first_loc.apply(find_lat_lon)
journal_lat_long = pd.DataFrame(loc_tuples.to_list(), index=first_loc.index, columns=['lat', 'lon'])
journal_lat_long.head()

'Iteration: 858/858'

Unnamed: 0_level_0,lat,lon
journal,Unnamed: 1_level_1,Unnamed: 2_level_1
1070thefan.com,39.933333,-86.216667
107jamz.com,31.0,-92.0
3aw.com.au,-37.0,144.0
4bc.com.au,-20.0,143.0
570news.com,50.0,-85.0


We discard the journals whose latitude or longitude could not be found, and save the resulting dataset.

In [18]:
# Keep only the journals for which both coordinates are present, then persist them.
journal_lat_long = journal_lat_long.dropna(axis="index")
journal_lat_long.to_csv(JOURNAL_LOCATIONS)
journal_lat_long

Unnamed: 0_level_0,lat,lon
journal,Unnamed: 1_level_1,Unnamed: 2_level_1
1070thefan.com,39.933333,-86.216667
107jamz.com,31.000000,-92.000000
3aw.com.au,-37.000000,144.000000
4bc.com.au,-20.000000,143.000000
570news.com,50.000000,-85.000000
...,...,...
xinhuanet.com,39.904030,116.407526
yakimaherald.com,46.601944,-120.507778
yale.edu,41.600000,-72.700000
yallpolitics.com,33.000000,-90.000000


We combine the location data with the clusters to form a URL -> (cluster_id, lat, lon) dataset. This will be used in our interactive map.

In [13]:
# Left join on the shared 'journal' index: every clustered journal is kept,
# and those without known coordinates get NaN for lat/lon.
clusters_lat_long = clusters.merge(
    journal_lat_long,
    how='left',
    left_index=True,
    right_index=True,
)
clusters_lat_long.to_csv(CLUSTER_LOCATIONS)
clusters_lat_long

Unnamed: 0_level_0,cluster_id,lat,lon
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1011now.com,-1.0,,
1070thefan.com,-1.0,39.933333,-86.216667
107jamz.com,6.0,31.000000,-92.000000
10news.com,14.0,,
1130thetiger.com,-1.0,,
...,...,...,...
zalebs.com,-1.0,,
zawya.com,14.0,,
zdnet.com,-1.0,,
zeibiz.com,6.0,,


## Add country information

For journals that do not have a precise location but do mention a country (or country of origin), we fall back on that country's coordinates from Wikidata.

In [14]:
def _first_listed(attribute):
    """Return, per journal, the first value listed under `attribute`."""
    expanded = pd.DataFrame(journal_attrs[attribute].to_list(), index=journal_attrs.index)
    return expanded[0]


countries1 = _first_listed('country')
countries2 = _first_listed('country of origin')

# Prefer 'country'; only fall back on 'country of origin' when 'country' is
# missing and an origin is actually available.
country_or_no_origin = countries1.notna() | countries2.isna()
countries = countries1.where(country_or_no_origin, other=countries2)
countries.head()

journal
1011now.com          Q30
1070thefan.com       Q30
107jamz.com          Q30
10news.com           Q30
1130thetiger.com    None
Name: 0, dtype: object

In [15]:
# New frame (the cluster/location table is left untouched) with each journal's
# country attached, aligned on the 'journal' index.
cluster_country = clusters_lat_long.assign(country=countries)
cluster_country

Unnamed: 0_level_0,cluster_id,lat,lon,country
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1011now.com,-1.0,,,Q30
1070thefan.com,-1.0,39.933333,-86.216667,Q30
107jamz.com,6.0,31.000000,-92.000000,Q30
10news.com,14.0,,,Q30
1130thetiger.com,-1.0,,,
...,...,...,...,...
zalebs.com,-1.0,,,
zawya.com,14.0,,,
zdnet.com,-1.0,,,Q30
zeibiz.com,6.0,,,


We extract the coordinates of each country from Wikidata.

In [18]:
# Distinct country identifiers, indexed by themselves so that the lookup
# result below is keyed by country.
country_ids = cluster_country['country'].dropna().unique()
unique_countries = pd.Series(country_ids, index=country_ids)

# Restart the progress counter for this second batch of lookups.
_itercount = 0

country_loc_tuples = unique_countries.apply(
    lambda country_id: find_lat_lon(country_id, dataset=unique_countries)
)

'Iteration: 84/84'

In [19]:
# Expand the per-country [lat, lon] pairs into two named columns.
country_lat_long = pd.DataFrame(
    country_loc_tuples.to_list(),
    columns=['lat', 'lon'],
    index=country_loc_tuples.index,
)
country_lat_long

Unnamed: 0,lat,lon
Q30,39.828175,-98.579500
Q408,-28.000000,137.000000
Q16,56.000000,-109.000000
Q801,31.000000,35.000000
Q843,30.000000,71.000000
...,...,...
Q55,52.316667,5.550000
Q233,35.883333,14.500000
Q754,10.666667,-61.516667
Q193619,-29.000000,24.000000


We merge the journal-level locations with the country-level locations, matching on the `country` column.

In [20]:
# Attach each journal's country coordinates next to its own; the country
# columns get a "_country" suffix, the journal's own columns keep their names.
cluster_lat_country = pd.merge(
    cluster_country,
    country_lat_long,
    how='left',
    left_on='country',
    right_index=True,
    suffixes=["", "_country"],
)
cluster_lat_country.head()

Unnamed: 0_level_0,cluster_id,lat,lon,country,lat_country,lon_country
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1011now.com,-1.0,,,Q30,39.828175,-98.5795
1070thefan.com,-1.0,39.933333,-86.216667,Q30,39.828175,-98.5795
107jamz.com,6.0,31.0,-92.0,Q30,39.828175,-98.5795
10news.com,14.0,,,Q30,39.828175,-98.5795
1130thetiger.com,-1.0,,,,,


Finally, we combine the columns such that a journal's precise location is used when it is available, and the country's location is used otherwise.

In [21]:
# A row keeps its own coordinates when it already has a precise location, or
# when its country offers no complete coordinates to fall back on.
precise_known = cluster_lat_country["lat"].notna() & cluster_lat_country["lon"].notna()
country_known = cluster_lat_country["lat_country"].notna() & cluster_lat_country["lon_country"].notna()
has_lat_lon = precise_known | ~country_known

# Everywhere else, substitute the country-level coordinate.
for coord in ("lat", "lon"):
    cluster_lat_country[coord] = cluster_lat_country[coord].where(
        has_lat_lon, other=cluster_lat_country[f"{coord}_country"]
    )

# The helper columns are no longer needed in the exported table.
cluster_lat_country = cluster_lat_country.drop(columns=["country", "lat_country", "lon_country"])

cluster_lat_country.to_csv(CLUSTER_LOCATIONS)
cluster_lat_country

Unnamed: 0_level_0,cluster_id,lat,lon
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1011now.com,-1.0,39.828175,-98.579500
1070thefan.com,-1.0,39.933333,-86.216667
107jamz.com,6.0,31.000000,-92.000000
10news.com,14.0,39.828175,-98.579500
1130thetiger.com,-1.0,,
...,...,...,...
zalebs.com,-1.0,,
zawya.com,14.0,,
zdnet.com,-1.0,39.828175,-98.579500
zeibiz.com,6.0,,
