# Initialization

## Import libraries

In [1]:
import requests
import os
import json
import re
import torch
import pandas as pd
import numpy as np
import http.client
import urllib.parse

## Set Variables

In [2]:
use_colab = False
positionstack_api_key = 'a185ca6660614f712266ea5e80e00e06'
positionstack_api = 'http://api.positionstack.com'
countriesnow_api = 'https://countriesnow.space'
wikipedia_api = 'https://en.wikipedia.org/w/api.php?'

if (use_colab):
    from google.colab import drive
    drive.mount('/content/drive')
    dataset_directory = '/content/drive/MyDrive/Colab/datasets/wikipedia_to_sdgs'
    workspace = '/content/drive/MyDrive/Colab/wikipedia_to_sdgs'
else:
    workspace = '.'

sdgs_corpus_titles_path = os.path.join(workspace, 'sdgs_titles.json')
countries_file_path = os.path.join(workspace, 'countries.txt')
country_articles_file_path = os.path.join(workspace, 'country_articles.json')
corpus_directory = os.path.join(workspace, 'articles')

# Construct the corpus

## Get country cities using https://countriesnow.space

In [3]:
def retrieve_citites(country):
    cities = []

    headers = {"Content-Type": "application/json; charset=utf-8"}
    body = {
        "country": country,
    }
    response = requests.post(countriesnow_api+'/api/v0.1/countries/cities', headers=headers, json=body)
    data = response.json()['data']
    for index in range(len(data)):
        city = data[index]
        cities.append(city)

    return cities

## Get country geo-location
Use https://positionstack.com/ to retrieve the geolocation of a country

In [4]:
def retrieve_country_geolocation(country):
    params = urllib.parse.urlencode({
    'access_key': positionstack_api_key,
    'query':country,
    })
    response = requests.get(positionstack_api+'/v1/forward?', params=params)
    data = response.json()['data']
    for index in range(len(data)):
        if data[index]['type']=='country':
            latitude = data[index]['latitude']
            longitude = data[index]['longitude']
            break
    return latitude,longitude

In [5]:
def retrieve_city_geolocation(country, city):

    params = urllib.parse.urlencode({
    'access_key': positionstack_api_key,
    'query':city+', '+country,
    })
    response = requests.get(positionstack_api+'/v1/forward?', params=params)
    data = response.json()['data']
    if len(data)>0:
        latitude = data[0]['latitude']
        longitude = data[0]['longitude']
    else:
        latitude=None
        longitude=None
        
    return latitude,longitude

## Get articles near geolocation

In [6]:
def retrieve_nearby_articles(latitude, longitude):
    page_titles = []

    request= wikipedia_api+'action=query&list=geosearch&gscoord='+str(latitude)+'|'+str(longitude)+'&gsradius=10000&gslimit=max&format=json'
    response = requests.get(request).json()
    pages = list(response['query']['geosearch'])
    for page_index in range(len(pages)):
        page = pages[page_index]
        title = page['title']
        page_titles.append(title)

    return page_titles

## Read the countries

In [7]:

# opening the file in read mode
countries_file = open(countries_file_path, "r")
# reading the file
countries = countries_file.readlines()

for country_index in range(len(countries)):
    countries[country_index]=countries[country_index].replace('\n', "")
print('Number of found countries: ', len(countries), countries)

Number of found countries:  7 ['Algeria', 'Egypt', 'Morocco', 'Tunisia', 'Libya', 'Sudan', 'Mauritania']


## Contruct the countries dictionariy

In [8]:
country_articles_dict = dict()
for country in countries:
    country_articles = set()
    cities = retrieve_citites(country)
    for city in cities:
        try:
            print('Retrieve geolocation for ',country,', ', city)
            latitude,longitude = retrieve_city_geolocation(country, city)
            print('Retrieve articles for ',country,', ', city)
            city_articles = retrieve_nearby_articles(latitude, longitude)
            country_articles.update(city_articles)
        except:
          print("Erro while retrieving location of data for ", country,", ", city)
    country_articles_dict[country]=list(country_articles)

Retrieve geolocation for  Algeria ,  Algiers
Retrieve articles for  Algeria ,  Algiers
Retrieve geolocation for  Algeria ,  Annaba
Retrieve articles for  Algeria ,  Annaba
Retrieve geolocation for  Algeria ,  Azazga
Retrieve articles for  Algeria ,  Azazga
Retrieve geolocation for  Algeria ,  Batna City
Retrieve articles for  Algeria ,  Batna City
Retrieve geolocation for  Algeria ,  Blida
Retrieve articles for  Algeria ,  Blida
Retrieve geolocation for  Algeria ,  Bordj
Retrieve articles for  Algeria ,  Bordj
Retrieve geolocation for  Algeria ,  Bordj Bou Arreridj
Retrieve articles for  Algeria ,  Bordj Bou Arreridj
Retrieve geolocation for  Algeria ,  Bougara
Retrieve articles for  Algeria ,  Bougara
Retrieve geolocation for  Algeria ,  Cheraga
Retrieve articles for  Algeria ,  Cheraga
Retrieve geolocation for  Algeria ,  Chlef
Retrieve articles for  Algeria ,  Chlef
Retrieve geolocation for  Algeria ,  Constantine
Retrieve articles for  Algeria ,  Constantine
Retrieve geolocation fo

## Dump articles dictionary to file

In [9]:
with open(country_articles_file_path, 'w') as file:
    json.dump(country_articles_dict, file)

# Download the corpus articles

## Get Wikipedia article content

In [10]:
def retrieve_wikipedia_article(article_uri):
    if article_uri.startswith('http'):
        article_title = article_uri.rsplit('/', 1)[-1]
        print('Title = ', article_title)
    else:
        article_title = article_uri
    encoded_article_title = urllib.parse.quote(article_title.encode('utf8'))
    request= 'https://en.wikipedia.org/w/api.php?format=json&action=query&titles='+encoded_article_title+'&prop=extracts&exlimit=max&explaintext&exlimit=max'
    response = requests.get(request).json()
    pageid = list(response['query']['pages'])[0]
    content = response['query']['pages'][pageid]['extract']

    return pageid, content

## Read the country-articles dictionary file

In [11]:
with open(country_articles_file_path, 'r') as file:
    country_articles_dict = json.load(file)

## Retrieve articles then dump then into files

In [None]:
extended_country_articles_dict = dict()

for country in country_articles_dict.keys():
    article_dicts = []
    articles = country_articles_dict[country]
    country_directory = os.path.join(corpus_directory, country)
    if not os.path.exists(country_directory):
        os.makedirs(country_directory)
    else:
      continue
    # loop over articles #
    for article in articles:
      article_dict = dict()
      print('Start dumping file for article ', article)
      pageid, content = retrieve_wikipedia_article(article)
      article_file_name = str(pageid)+'.txt'
      article_file_path = os.path.join(country_directory, article_file_name)
      with open(article_file_path, 'w', encoding='utf-8') as file:
          file.write(content)
          file.close()
      article_dict['title'] = article
      article_dict['path'] = os.path.join(country,article_file_name)
      article_dicts.append(article_dict)
    extended_country_articles_dict[country]=article_dicts
        

Start dumping file for article  Béni Yenni District
Start dumping file for article  El Amria
Start dumping file for article  Aït Bouaddou
Start dumping file for article  Ouled Si Slimane District
Start dumping file for article  Guelma Province
Start dumping file for article  Metlili
Start dumping file for article  Robbah
Start dumping file for article  Dar Hassan Pacha
Start dumping file for article  Bir Ould Khelifa
Start dumping file for article  Hammam Bou Hadjar District
Start dumping file for article  University of Laghouat
Start dumping file for article  Remchi
Start dumping file for article  Gharbia, Algeria
Start dumping file for article  Chlef District
Start dumping file for article  Aïn Séfra
Start dumping file for article  Semaoune
Start dumping file for article  Batna (city)
Start dumping file for article  Ouled Salah, Boumerdès
Start dumping file for article  Yahia Beniguecha
Start dumping file for article  Boudjellil
Start dumping file for article  Sidi Abdelli
Start dump

## Dump the extended articles dictionary to file

In [None]:
with open(country_articles_file_path, 'w') as file:
    json.dump(extended_country_articles_dict, file)