# ADA Project - Journal location extraction

Given the enhanced journal attributes dataset, the aim of this notebook is to find the latitude and longitude of as many journals as possible.

## Setup

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Root of the local data directory and of the mounted newspapers dataset.
BASE_PATH = "../../data/"
MOUNT_PATH = f"{BASE_PATH}mnt/ada/newspapers/"

# Inputs.
CLUSTER_PATH = f"{BASE_PATH}clusters.csv.bz2"
JOURNAL_SEMANTIC_PATH = f"{MOUNT_PATH}journals-semantic.csv.bz2"
JOURNAL_ATTRS_PATH = f"{MOUNT_PATH}journal_attributes.json"

# Outputs written by this notebook.
JOURNAL_LOCATIONS = f"{MOUNT_PATH}journal_locations.csv"
COUNTRY_LOCATIONS = f"{MOUNT_PATH}country_locations.csv"
CLUSTER_LOCATIONS = f"{MOUNT_PATH}cluster_locations.csv.bz2"

We read the cluster dataset, which assigns a cluster ID to each journal, with $-1$ being unclustered.

In [5]:
# Load the journal -> cluster assignment, indexed by journal name.
# A cluster_id of -1 marks an unclustered journal.
#
# The column keeps its original name `cluster_id`. An earlier
# `clusters.rename(columns={'cluster_id': 'cluster'})` call here discarded its
# return value, so it never renamed anything; the tables built further down
# carry `cluster_id`, so the dead call is removed rather than made effective.
clusters = pd.read_csv(CLUSTER_PATH, index_col='journal', usecols=['journal', 'cluster_id'])
clusters.head()

Unnamed: 0_level_0,cluster_id
journal,Unnamed: 1_level_1
1011now.com,-1.0
1070thefan.com,-1.0
107jamz.com,6.0
10news.com,14.0
1130thetiger.com,-1.0


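We also read the `is located in` column of the journal attributes, keeping every location listed for each journal that has at least one. A journal can therefore appear on several rows; a single location per journal is selected further below. This is a slight limitation of our method, since the location that ends up selected might not be the _main_ location of the journal.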

In [19]:
# Load the per-journal attributes and label the index as the journal name.
journal_attrs = pd.read_json(JOURNAL_ATTRS_PATH)
journal_attrs.index.name = 'journal'

# Spread each 'is located in' entry over positional columns (0, 1, ...).
located_in = journal_attrs['is located in'].to_list()
locations = pd.DataFrame(located_in, index=journal_attrs.index)

# Keep only the journals whose first location slot is filled.
has_first_location = locations[0].notna()
first_loc = locations.loc[has_first_location]

# Long format: one (journal, location) row per listed location, empty slots removed.
locations = first_loc.melt(ignore_index=False).loc[:, ["value"]].dropna()
locations

Unnamed: 0_level_0,value
journal,Unnamed: 1_level_1
1070thefan.com,Q1415
107jamz.com,Q1588
3aw.com.au,Q36687
4bc.com.au,Q36074
570news.com,Q1904
...,...
wordpress.com,Q456556
wordpress.com,Q10313
wordpress.com,Q12411
wordpress.com,Q779


In [93]:
from wiki_helpers import Property
from IPython.display import display, clear_output

# Number of lookups performed since the counter was last reset; it is only
# used for the progress display and must be reset before each batch of queries.
_itercount = 0

def find_lat_lon(qid, dataset=first_loc):
    """Look up the coordinates (property P625) of the entity `qid`.

    Parameters
    ----------
    qid
        Identifier handed to ``Property.find_online`` (the notebook passes
        QIDs such as ``"Q1415"``).
    dataset
        Sized collection currently being processed; only ``len(dataset)`` is
        used, as the denominator of the progress display.

    Returns
    -------
    The first entry returned by the lookup (presumably a
    ``[latitude, longitude]`` list produced by the extractor), or
    ``[np.nan, np.nan]`` when the lookup returns nothing.
    """
    global _itercount

    def _lat_lon_pair(x):
        # Keep only the two coordinate fields of a P625 value.
        return [x['latitude'], x['longitude']]

    # Refresh the single-line progress indicator in the cell output.
    _itercount += 1
    clear_output(wait=True)
    display(f"Iteration: {_itercount}/{len(dataset)}")

    coords_prop = Property("P625", "coordinates", extractor=_lat_lon_pair)
    found = coords_prop.find_online(qid)

    if not found:
        return [np.nan, np.nan]
    return found[0]

We find a list of all unique locations in the data, and find their coordinates

In [94]:
# Each distinct location QID once, used both as the value and as the index
# label so that the lookup results stay keyed by QID.
location_qids = locations.value.unique()
unique_locs = pd.Series(location_qids, index=location_qids, dtype=object)
unique_locs

Q1415        Q1415
Q1588        Q1588
Q36687      Q36687
Q36074      Q36074
Q1904        Q1904
            ...   
Q1250        Q1250
Q14904      Q14904
Q456556    Q456556
Q10313      Q10313
Q12411      Q12411
Length: 1805, dtype: object

We query wikidata for the location

In [95]:
# One online lookup per unique QID; `dataset` only feeds the progress display.
loc_tuples = unique_locs.apply(find_lat_lon, dataset=unique_locs)

# Split the per-QID coordinate pairs into two numeric columns.
journal_lat_long = pd.DataFrame(
    loc_tuples.to_list(),
    index=unique_locs.index,
    columns=['lat', 'lon'],
)
journal_lat_long.head()

'Iteration: 1805/1805'

Unnamed: 0,lat,lon
Q1415,39.933333,-86.216667
Q1588,31.0,-92.0
Q36687,-37.0,144.0
Q36074,-20.0,143.0
Q1904,50.0,-85.0


In [96]:
# Discard the QIDs for which no coordinates were found (NaN lat or lon),
# then label the index as the QID.
journal_lat_long = journal_lat_long.dropna(axis=0)
journal_lat_long.index.name = "qid"
journal_lat_long.head()

Unnamed: 0_level_0,lat,lon
qid,Unnamed: 1_level_1,Unnamed: 2_level_1
Q1415,39.933333,-86.216667
Q1588,31.0,-92.0
Q36687,-37.0,144.0
Q36074,-20.0,143.0
Q1904,50.0,-85.0


We then merge the journal locations and the coordinates dataset. We then group by journal to find the best location among the ones assigned to that journal.

In [108]:
from distances import get_closest

# Attach coordinates to every (journal, location QID) row; the default inner
# merge drops locations whose QID has no coordinates.
located = locations.merge(journal_lat_long, how="inner", left_on="value", right_index=True)

# Turn the journal index into a column and keep only what the grouping needs.
all_locations = located.reset_index().loc[:, ['journal', 'lat', 'lon']]

def group_to_mean_coords(group):
    """Reduce the candidate coordinates of one group to a single result.

    `group` must expose `lat` and `lon` columns. Despite the name, no mean is
    computed here: the (lat, lon) pairs are handed to `get_closest` and its
    result is returned unchanged.
    """
    coord_pairs = list(zip(group.lat, group.lon))

    # An empty group is not expected; show it to make the problem visible.
    if not coord_pairs:
        display(group)

    return get_closest(coord_pairs)
    
# One selected coordinate pair per journal.
selected_per_journal = all_locations.groupby('journal').apply(group_to_mean_coords)

# Expand the pairs into 'lat' / 'lon' columns, still indexed by journal.
mean_locations = pd.DataFrame(
    selected_per_journal.to_list(),
    index=selected_per_journal.index,
    columns=['lat', 'lon'],
)

We save the resulting dataset. Locations without valid latitude and longitude were already discarded above.

In [109]:
# Persist the per-journal coordinates (journal index, lat and lon columns) as CSV.
mean_locations.to_csv(JOURNAL_LOCATIONS)
# Display the saved table.
mean_locations

Unnamed: 0_level_0,lat,lon
journal,Unnamed: 1_level_1,Unnamed: 2_level_1
1070thefan.com,39.933333,-86.216667
107jamz.com,31.000000,-92.000000
3aw.com.au,-37.000000,144.000000
4bc.com.au,-20.000000,143.000000
570news.com,50.000000,-85.000000
...,...,...
xinhuanet.com,31.200000,112.300000
yakimaherald.com,46.601944,-120.507778
yale.edu,41.600000,-72.700000
yallpolitics.com,33.000000,-90.000000


We combine the location data with the clusters to form a URL -> (cluster_id, lat, lon) dataset. This will be used in our interactive map.

In [111]:
# Left join on the journal index: every clustered journal is kept, and those
# without a known location get NaN coordinates.
clusters_lat_long = clusters.merge(
    mean_locations,
    how='left',
    left_index=True,
    right_index=True,
)
clusters_lat_long.to_csv(CLUSTER_LOCATIONS)
clusters_lat_long

Unnamed: 0_level_0,cluster_id,lat,lon
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1011now.com,-1.0,,
1070thefan.com,-1.0,39.933333,-86.216667
107jamz.com,6.0,31.000000,-92.000000
10news.com,14.0,,
1130thetiger.com,-1.0,,
...,...,...,...
zalebs.com,-1.0,,
zawya.com,14.0,,
zdnet.com,-1.0,,
zeibiz.com,6.0,,


## Add country information

For journals that do not have a precise location but do mention a country or a country of origin, we fall back to that country's coordinates.

In [121]:
# Expand both country attributes into positional columns, indexed by journal.
countries1 = pd.DataFrame(journal_attrs['country'].to_list(), index=journal_attrs.index)
countries2 = pd.DataFrame(journal_attrs['country of origin'].to_list(), index=journal_attrs.index)

# True where 'country' should be kept: either it has at least one value, or
# 'country of origin' has nothing to offer in its place.
country_or_no_origin = countries1.notna().any(axis=1) | countries2.isna().all(axis=1)

# Fall back to 'country of origin' on the remaining rows, then go to long
# format with one (journal, country QID) row per slot.
countries = (
    countries1
    .where(country_or_no_origin, other=countries2)
    .melt(ignore_index=False)
    .loc[:, ["value"]]
)
countries

Unnamed: 0_level_0,value
journal,Unnamed: 1_level_1
1011now.com,Q30
1070thefan.com,Q30
107jamz.com,Q30
10news.com,Q30
1130thetiger.com,
...,...
zalebs.com,
zawya.com,
zdnet.com,
zeibiz.com,


We query Wikidata for the coordinates of every unique country.

In [134]:
# Each distinct country QID once, used both as the value and as the index label.
country_qids = countries.value.dropna().unique()
unique_countries = pd.Series(country_qids, index=country_qids)

# Restart the progress counter shared with the earlier location lookup.
_itercount = 0

# One online lookup per country; `dataset` only feeds the progress display.
country_loc_tuples = unique_countries.apply(find_lat_lon, dataset=unique_countries)

'Iteration: 207/207'

In [138]:
# Split the per-country coordinate pairs into 'lat' / 'lon' columns, keyed by
# country QID.
# Countries whose lookup returned nothing come back as [nan, nan]; they are
# dropped here, as is done for the journal locations, so that NaN pairs are
# never handed to the per-journal coordinate selection further down.
unique_country_loc_df = pd.DataFrame(
    country_loc_tuples.to_list(),
    index=unique_countries.index,
    columns=['lat', 'lon'],
).dropna(axis=0)
unique_country_loc_df.head()

Unnamed: 0,lat,lon
Q30,39.828175,-98.5795
Q408,-28.0,137.0
Q16,56.0,-109.0
Q801,31.0,35.0
Q843,30.0,71.0


We join the journals' countries with the country coordinates on the country QID (an inner merge, so rows without a matching country are dropped).
We then group by journal and find the best location for that journal.

In [157]:
# Attach coordinates to every (journal, country QID) row; the default inner
# merge drops rows whose country is missing or has no entry in the lookup table.
country_lat_lon = countries.merge(
    unique_country_loc_df,
    how='inner',
    left_on='value',
    right_index=True,
)

# One selected coordinate pair per journal ('journal' is the index level name).
selected_country_coords = country_lat_lon.groupby('journal').apply(group_to_mean_coords)

# Expand the pairs into 'lat' / 'lon' columns, still indexed by journal.
mean_country_coords = pd.DataFrame(
    selected_country_coords.to_list(),
    index=selected_country_coords.index,
    columns=['lat', 'lon'],
)
mean_country_coords

Unnamed: 0_level_0,lat,lon
journal,Unnamed: 1_level_1,Unnamed: 2_level_1
1011now.com,39.828175,-98.5795
1070thefan.com,39.828175,-98.5795
107jamz.com,39.828175,-98.5795
10news.com,39.828175,-98.5795
13abc.com,39.828175,-98.5795
...,...,...
yourdailyglobe.com,39.828175,-98.5795
yourhoustonnews.com,39.828175,-98.5795
yukon-news.com,56.000000,-109.0000
yumasun.com,39.828175,-98.5795


We merge the precise locations and the country-level locations on the journal.

In [161]:
# Left join on the journal index. The precise coordinates keep the names
# 'lat' / 'lon'; the country-level ones arrive as 'lat_country' / 'lon_country'.
cluster_lat_country = clusters_lat_long.join(
    mean_country_coords,
    how='left',
    lsuffix="",
    rsuffix="_country",
)
cluster_lat_country.head()

Unnamed: 0_level_0,cluster_id,lat,lon,lat_country,lon_country
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1011now.com,-1.0,,,39.828175,-98.5795
1070thefan.com,-1.0,39.933333,-86.216667,39.828175,-98.5795
107jamz.com,6.0,31.0,-92.0,39.828175,-98.5795
10news.com,14.0,,,39.828175,-98.5795
1130thetiger.com,-1.0,,,,


Finally, we combine the columns so that a journal keeps its precise location when it has one and otherwise falls back to the country-level location.

In [162]:
# Keep the precise coordinates when both are present, or when the country
# fallback is itself incomplete (in which case there is nothing to substitute).
precise_complete = cluster_lat_country[['lat', 'lon']].notna().all(axis=1)
country_incomplete = cluster_lat_country[['lat_country', 'lon_country']].isna().any(axis=1)
has_lat_lon = precise_complete | country_incomplete

# Substitute the country coordinates on the remaining rows, then drop the
# helper columns.
resolved = cluster_lat_country.assign(
    lat=cluster_lat_country.lat.where(has_lat_lon, other=cluster_lat_country.lat_country),
    lon=cluster_lat_country.lon.where(has_lat_lon, other=cluster_lat_country.lon_country),
)
cluster_lat_country = resolved.drop(columns=["lat_country", "lon_country"])

# Overwrites the cluster locations file written earlier with the completed table.
cluster_lat_country.to_csv(CLUSTER_LOCATIONS)
cluster_lat_country

Unnamed: 0_level_0,cluster_id,lat,lon
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1011now.com,-1.0,39.828175,-98.579500
1070thefan.com,-1.0,39.933333,-86.216667
107jamz.com,6.0,31.000000,-92.000000
10news.com,14.0,39.828175,-98.579500
1130thetiger.com,-1.0,,
...,...,...,...
zalebs.com,-1.0,,
zawya.com,14.0,,
zdnet.com,-1.0,39.828175,-98.579500
zeibiz.com,6.0,,
