# ADA Project Milestone 2 - Wikidata dataset newspaper generation

## Setup and Remote dataset loading

We first import the necessary libraries into the notebook.

In [50]:
import numpy as np
import pandas as pd

import json
import bz2

In [51]:
# Install the pinned pandas release and the `tld` package (used later for
# first-level-domain extraction).
# NOTE(review): pandas is already imported in the cell above, so the pinned
# version presumably only takes effect after a kernel restart — confirm.
%pip install pandas==1.3.0
%pip install tld



We mount the EPFL Google Drive and define the access paths for the different datasets.



In [52]:
# Colab-only: mount Google Drive at /content/drive so that the dataset paths
# rooted under that directory resolve.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [53]:
# Root directory holding every dataset. The active value points at the shared
# Google Drive mounted under /content/drive; the commented-out line is a
# relative-path alternative (presumably for running outside Colab).
#BASE_PATH = "../data/mnt/ada/"
BASE_PATH = "/content/drive/Shareddrives/Improvise ADApt Overcome/Datasets/"

# Per-dataset sub-directories
SPEAKER_PATH = BASE_PATH + "speakers/"
QUOTEBANK_PATH = BASE_PATH + "quotebank/"
WIKI_PATH = BASE_PATH + "wikipedia/"
NEWS_PATH = BASE_PATH + "newspapers/"

# Individual input files
SPEAKER_ATTRS = SPEAKER_PATH + "speaker_attributes.parquet"
WIKIPEDIA_ATTRS = SPEAKER_PATH + "wikidata_labels_descriptions.csv.bz2"
QB_WIKIPEDIA_ATTRS = SPEAKER_PATH + "wikidata_labels_descriptions_quotebank.csv.bz2"
FULL_WIKIDUMP = WIKI_PATH + "wikipedia-latest-all.json.bz2"

# Output of the newspaper URL extraction (one JSON object per line).
# NOTE(review): this constant is used by the extraction cell but was never
# defined, which raises NameError on a fresh kernel. The file name below is a
# placeholder choice — adjust it if another name is already in use.
NEWS_URLS = NEWS_PATH + "news_urls.json"


To classify speakers, we will use the `speaker_attributes` parquet file as is, without further modification. Its size is small enough to be manageable in RAM.

## Newspaper URL extraction

We need to generate a dataset linking news agency URLs with their respective Wikidata entry IDs. This will allow us in the future to find patterns in the groups of news outlets by having more data about them. Once the Wikidata ID is obtained, it will be relatively easy to obtain more information about those media outlets.

Such a table is much smaller than a full Wikipedia dump, since entries are restricted to news organizations. This means the dataset can easily be held in RAM, indexed by URL in pandas, and used as a lookup table for publisher identifiers. This is a parallelizable task, so Quotebank can be split into manageable chunks to add a link between quote ID and publisher for each quote.

In [None]:
import sys
from tqdm import tqdm
from tld import get_fld

def is_media_company(s):
  """Tell whether a Wikidata entity is declared as a media company.

  `s` is a parsed JSON Wikidata entry (a dict). The function looks at the
  entity's "instance of" (P31) statements and returns True when one of them
  targets the media-company item (Q1331793); it returns False otherwise,
  including when the entity has no P31 statement at all."""
  instanceof_prop = "P31"
  media_company = "Q1331793"

  statements = s.get("claims", {}).get(instanceof_prop, [])
  if len(statements) == 0:
    # No "instance-of" statement: cannot be a media company
    return False

  # Collect the target item id ("Q123") of every statement that carries one;
  # statements without a datavalue/id are left out.
  target_ids = [
      statement["mainsnak"]["datavalue"]["value"]["id"]
      for statement in statements
      if statement["mainsnak"].get("datavalue", {}).get("value", {}).get("id")
      is not None
  ]
  return media_company in target_ids


def extract_urls(s):
  """Takes a JSON wikidata entry and returns the list of official websites
  linked to that entry. The websites are returned as unique first-level
  domains, in order of first appearance:
  'https://test.google.com/exam/pl/e' becomes 'google.com'.

  Returns an empty list if the entry has no official-website (P856)
  statement, or if none of its statements yields a usable domain.
  Statements without a value (e.g. "novalue"/"somevalue" snaks) and URLs
  from which no first-level domain can be extracted are skipped."""
  website_prop = "P856"

  urls = []
  for statement in s.get("claims", {}).get(website_prop, []):
    # "novalue"/"somevalue" snaks carry no "datavalue": skip them instead of
    # failing with a KeyError on the lookup.
    value = statement.get("mainsnak", {}).get("datavalue", {}).get("value")
    if not value:
      continue

    # fail_silently makes get_fld return None on a malformed URL instead of
    # raising, so one bad entry does not abort the processing of a whole dump.
    fld = get_fld(value, fail_silently=True)
    if fld is not None and fld not in urls:
      urls.append(fld)

  return urls


def extract_newspaper_urls(inputf, outputf):
  """Takes an input wikidata dump file and writes a list of newspaper website
  URLs. It filters entries based on if they are media companies.

  The input is a bz2-compressed JSON dump read line by line, with one entity
  per line. The output file holds one JSON object per line, each referencing
  a news agency, with
    - "id" wikidata identifier of the news agency (kept as found in the dump)
    - "label" English wikidata label of the news agency, when it has one
    - "websites" list of first-level domains related to that news agency
  Any other top-level key of the entity that is not explicitly stripped below
  is written through unchanged. Entities without a "labels" field, and
  entities rejected by `is_media_company`, are skipped.

  Args:
    inputf: path of the bz2-compressed dump to read.
    outputf: path of the text file to (over)write.
  """
  # Bulky top-level fields that are removed before an entry is written
  dropped_keys = ("aliases", "descriptions", "sitelinks", "claims",
                  "lastrevid", "type")

  # The output is written as UTF-8 explicitly: entries are serialised with
  # ensure_ascii=False, so relying on the platform default encoding could fail
  # on non-ASCII labels.
  with open(outputf, "w", encoding="utf-8") as output_file:
      with bz2.open(inputf, 'rb') as s_file:
          for raw_line in s_file:
              # The dump is a single JSON array spread over lines: "[" and "]"
              # stand alone and every entity line ends with "," except the
              # last one. Strip whitespace and the optional trailing comma
              # rather than blindly cutting two characters, which would
              # truncate the final entity.
              line = raw_line.decode('utf-8').strip().rstrip(",")
              if line in ("", "[", "]"):
                  continue
              s = json.loads(line)

              # Entities without any labels are ignored
              labels = s.get("labels")
              if labels is None:
                  continue
              if labels.get("en") is not None:
                  s["label"] = labels["en"]["value"]
              del s["labels"]

              # Only take media companies into consideration
              if not is_media_company(s):
                  continue

              # Extract official websites (needs "claims", so do it before
              # the claims are dropped)
              s["websites"] = extract_urls(s)

              # Remove leftovers and unnecessary attributes
              for key in dropped_keys:
                  s.pop(key, None)

              output_file.write(json.dumps(s, ensure_ascii=False) + "\n")

# Run the extraction over the full dump. NEWS_URLS (the output path) must be
# defined by the configuration cell before this line is executed.
extract_newspaper_urls(inputf=FULL_WIKIDUMP, outputf=NEWS_URLS)
