In [3]:
import numpy as np
import pandas as pd

In [25]:
BASE_PATH = "../data/"
MOUNT_PATH = BASE_PATH + "mnt/ada/newspapers/"

CLUSTER_PATH = BASE_PATH + "clusters.csv.bz2"
JOURNAL_SEMANTIC_PATH = MOUNT_PATH + "journals-semantic.csv.bz2"
JOURNAL_ATTRS_PATH = MOUNT_PATH + "journal_attributes.json"
CLUSTER_LOCATIONS = MOUNT_PATH + "cluster_locations.csv.bz2"

In [70]:
clusters = pd.read_csv(CLUSTER_PATH, index_col='journal', names=['journal', 'cluster'], header=1)
clusters.head()

Unnamed: 0_level_0,cluster
journal,Unnamed: 1_level_1
1070thefan.com,1
107jamz.com,0
10news.com,0
1130thetiger.com,0
13abc.com,0


In [7]:
journal_semantics = pd.read_csv(JOURNAL_SEMANTIC_PATH)
journal_semantics.head()

Unnamed: 0.1,Unnamed: 0,money,health,government,leisure,technology,cinema_television,tourism,journalism,banking,...,science,work,fun,traveling,fashion,economics,politics,philosophy,law,feminine
0,1011now.com,0.005275,0.004946,0.005605,0.001649,0.0,0.000495,0.000989,0.000659,0.003132,...,0.001484,0.008572,0.000659,0.005605,0.000659,0.004451,0.004121,0.000495,0.004616,0.00033
1,1070thefan.com,0.001892,0.001576,0.000315,0.000946,0.0,0.0,0.000631,0.000631,0.0,...,0.000946,0.004414,0.002522,0.002837,0.0,0.001576,0.000631,0.000315,0.0,0.000631
2,107jamz.com,0.00217,0.001184,0.001184,0.002762,0.0,0.000987,0.000592,0.000789,0.002368,...,0.000197,0.002762,0.00296,0.00217,0.001184,0.00217,0.001776,0.000197,0.001579,0.001381
3,10news.com,0.004169,0.004796,0.004279,0.001881,0.000406,0.000443,0.000996,0.000738,0.002619,...,0.001402,0.004427,0.001291,0.003136,0.000848,0.004316,0.004796,0.000406,0.003726,0.000516
4,1130thetiger.com,0.000592,0.00296,0.001184,0.003552,0.0,0.002368,0.0,0.000592,0.001184,...,0.002368,0.004144,0.004737,0.003552,0.000592,0.000592,0.004144,0.0,0.001776,0.0


In [8]:
journal_attrs = pd.read_json(JOURNAL_ATTRS_PATH)
journal_attrs.index.name = 'journal'
locations = pd.DataFrame(journal_attrs['is located in'].to_list(), index=journal_attrs.index)
first_loc = locations[locations[0].notna()][0]

first_loc.head()

journal
1070thefan.com     Q1415
107jamz.com        Q1588
3aw.com.au        Q36687
4bc.com.au        Q36074
570news.com        Q1904
Name: 0, dtype: object

In [115]:
from wiki_helpers import Property
from IPython.display import display, clear_output

_itercount = 0

def find_lat_lon(qid, dataset=first_loc):
    global _itercount
    _itercount += 1
    
    clear_output(wait=True)
    display(f"Iteration: {_itercount}/{len(dataset)}")

    coords_prop = Property("P625", "coordinates", 
                           extractor=lambda x: [x['latitude'], x['longitude']])

    data = coords_prop.find_online(qid)
    return data[0] if data else [np.nan, np.nan] 

In [None]:
loc_tuples = first_loc.apply(lambda qid: find_lat_lon(qid))
journal_lat_long = pd.DataFrame(loc_tuples.to_list(), index=first_loc.index, columns=['lat', 'lon'])
journal_lat_long.head()

In [91]:
journal_lat_long.dropna(axis=0, inplace=True)
journal_lat_long

Unnamed: 0_level_0,lat,lon
journal,Unnamed: 1_level_1,Unnamed: 2_level_1
1070thefan.com,39.933333,-86.216667
107jamz.com,31.000000,-92.000000
3aw.com.au,-37.000000,144.000000
4bc.com.au,-20.000000,143.000000
570news.com,50.000000,-85.000000
...,...,...
xinhuanet.com,39.904030,116.407526
yakimaherald.com,46.601944,-120.507778
yale.edu,41.600000,-72.700000
yallpolitics.com,33.000000,-90.000000


In [92]:
clusters_lat_long = clusters.join(journal_lat_long, how='left')
clusters_lat_long.to_csv(CLUSTER_LOCATIONS)
clusters_lat_long

Unnamed: 0_level_0,cluster,lat,lon
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1070thefan.com,1,39.933333,-86.216667
107jamz.com,0,31.000000,-92.000000
10news.com,0,,
1130thetiger.com,0,,
13abc.com,0,,
...,...,...,...
zalebs.com,-1,,
zawya.com,0,,
zdnet.com,-1,,
zeibiz.com,2,,


## Add country information

In [98]:
countries1 = pd.DataFrame(journal_attrs['country'].to_list(), index=journal_attrs.index)[0]
countries2 = pd.DataFrame(journal_attrs['country of origin'].to_list(), index=journal_attrs.index)[0]

country_or_no_origin = ~(countries1.isna() & countries2.notna())
countries = countries1.where(country_or_no_origin, other=countries2)
countries.head()

journal
1011now.com          Q30
1070thefan.com       Q30
107jamz.com          Q30
10news.com           Q30
1130thetiger.com    None
Name: 0, dtype: object

In [99]:
cluster_country = clusters_lat_long.copy()
cluster_country['country'] = countries
cluster_country

Unnamed: 0_level_0,cluster,lat,lon,country
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1070thefan.com,1,39.933333,-86.216667,Q30
107jamz.com,0,31.000000,-92.000000,Q30
10news.com,0,,,Q30
1130thetiger.com,0,,,
13abc.com,0,,,Q30
...,...,...,...,...
zalebs.com,-1,,,
zawya.com,0,,,
zdnet.com,-1,,,Q30
zeibiz.com,2,,,


In [116]:
unique_countries = pd.Series(cluster_country.country.dropna().unique())
unique_countries.index = unique_countries

_itercount = 0

country_loc_tuples = unique_countries.apply(lambda country: find_lat_lon(country, dataset=unique_countries))

'Iteration: 83/83'

In [122]:
country_lat_long = pd.DataFrame(country_loc_tuples.to_list(), index=country_loc_tuples.index, columns=['lat', 'lon'])
country_lat_long

Unnamed: 0,lat,lon
Q30,39.828175,-98.579500
Q408,-28.000000,137.000000
Q16,56.000000,-109.000000
Q801,31.000000,35.000000
Q843,30.000000,71.000000
...,...,...
Q55,52.316667,5.550000
Q233,35.883333,14.500000
Q754,10.666667,-61.516667
Q193619,-29.000000,24.000000


In [150]:
cluster_lat_country = cluster_country.merge(right=country_lat_long, left_on='country', right_index=True, how='left', suffixes=["", "_country"])
cluster_lat_country.head()

Unnamed: 0_level_0,cluster,lat,lon,country,lat_country,lon_country
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1070thefan.com,1,39.933333,-86.216667,Q30,39.828175,-98.5795
107jamz.com,0,31.0,-92.0,Q30,39.828175,-98.5795
10news.com,0,,,Q30,39.828175,-98.5795
1130thetiger.com,0,,,,,
13abc.com,0,,,Q30,39.828175,-98.5795


In [151]:
has_lat_lon = ((cluster_lat_country.lat.notna() & cluster_lat_country.lon.notna()) \
               | (cluster_lat_country.lat_country.isna() | cluster_lat_country.lon_country.isna()))

cluster_lat_country['lat'] = cluster_lat_country.lat.where(has_lat_lon, other=cluster_lat_country.lat_country)
cluster_lat_country['lon'] = cluster_lat_country.lon.where(has_lat_lon, other=cluster_lat_country.lon_country)
cluster_lat_country.drop(columns=["country", "lat_country", "lon_country"], inplace=True)

cluster_lat_country.to_csv(CLUSTER_LOCATIONS)
cluster_lat_country

Unnamed: 0_level_0,cluster,lat,lon
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1070thefan.com,1,39.933333,-86.216667
107jamz.com,0,31.000000,-92.000000
10news.com,0,39.828175,-98.579500
1130thetiger.com,0,,
13abc.com,0,39.828175,-98.579500
...,...,...,...
zalebs.com,-1,,
zawya.com,0,,
zdnet.com,-1,39.828175,-98.579500
zeibiz.com,2,,
