In [7]:
import os

import pandas as pd
import pywikibot
from pywikibot import pagegenerators as pg

## 1- Query wikidata with pywikibot

#### First, let's have a look at how to query Wikidata for information about a journal

In [2]:
# Titles of the newspapers whose owner we want to retrieve; each title is
# used as an English Wikipedia page title by the lookup loop.
newspapers = [
    "The New York Times",
    "The Washington Times",
    "Business Insider",
    "Fox News",
    "NBC News",
    "NBC Sports",
    "The Seattle Times",
    "New Haven Register",
]

# Filled by the lookup loop with the owner label found for each newspaper.
owned_by = list()

In [3]:
# Look up the owner (Wikidata property P127) of every newspaper.
# The site and its data repository do not depend on the newspaper, so they
# are created once instead of on every iteration.
site = pywikibot.Site("en", "wikipedia")
repo = site.data_repository()

# Rebuild the result from scratch so that re-running this cell does not
# append every owner a second time.
owned_by = []

for newspaper in newspapers:
    print(newspaper)
    # Resolve the Wikipedia page to the Wikidata item it is linked to.
    page = pywikibot.Page(site, newspaper)
    item = pywikibot.ItemPage.fromPage(page)
    claims = item.get()['claims'].toJSON()

    ownership_claims = claims.get('P127')
    if not ownership_claims:
        # No P127 statement on this item: keep owned_by aligned with
        # newspapers instead of aborting the whole loop with a KeyError.
        owned_by.append(None)
        continue

    # Only the first P127 statement is used; its value is the numeric id of
    # the owning entity, whose English label is what we store.
    qid = ownership_claims[0]['mainsnak']['datavalue']['value']['numeric-id']
    data_found = pywikibot.ItemPage(repo, "Q" + str(qid))
    owned_by.append(data_found.get()['labels']['en'])

The New York Times
The Washington Times
Business Insider
Fox News
NBC News
NBC Sports
The Seattle Times
New Haven Register


In [4]:
# Companies that own our journals: the labels collected by the lookup loop,
# in the same order as the titles in `newspapers`.
owned_by

['The New York Times Company',
 'Unification Church',
 'Insider Inc.',
 'Fox Corporation',
 'NBCUniversal',
 'NBCUniversal',
 'The Seattle Times Company',
 '21st Century Media']

## 2 - Unfortunately, the quotes do not contain the exact names of the journals, so we need a way to link the journal names found in the quotes to their Wikidata pages

#### To cope with this issue, we will get all the aliases from the wiki dump to be able to link them

In [None]:
#load wiki dump
# The location of the dump can be overridden with the WIKIDATA_LABELS_PATH
# environment variable, so the notebook can run on machines where the
# original local path (kept below as the fallback) does not exist.
WIKIDATA_LABELS_PATH = os.environ.get(
    "WIKIDATA_LABELS_PATH",
    "C:\\Users\\hugol\\Desktop\\ada-2021-project-mahj\\wikidata_labels_descriptions_quotebank.csv.bz2",
)
df = pd.read_csv(WIKIDATA_LABELS_PATH, compression='bz2')


In [74]:
# Discard every row of the dump that has a missing value, then show the
# entries whose label mentions "New York".
no_qa = df.dropna()
no_qa.loc[no_qa["Label"].str.contains("New York")]
# The New York Times itself has QID Q9684; that QID is used for the alias lookup.

Unnamed: 0,QID,Label,Description
2315,Q5163079,Conservative Party of New York State,Conservative third party in the United States
2499,Q6540770,Liberal Party of New York,Third party only active in state of New York
2573,Q7013922,New York Republican State Committee,Political party in New York
4667,Q1384,New York,state of the United States of America
5299,Q575380,1801 New York gubernatorial election,gubernatorial election held in April 1801
10905,Q2513114,New York State Right to Life Party,Anti-abortion third party in New York
13513,Q65047185,2022 New York gubernatorial election,election for Governor of New York
19985,Q1540332,1777 New York gubernatorial election,gubernatorial election
21140,Q7014067,New York Democratic Party,"political organization in New York, U.S."
24807,Q2413088,The New York Times Book Review,weekly review of books by the New York Times


In [81]:
#Let's find the aliases for The New York Times now
site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()

# Only The New York Times (Q9684) is looked up here. To cover every entry of
# the dump, wrap the two statements below in `for qid in df.QID:` and pass
# `qid` instead of the literal. They are kept at top level because an
# indented body without its loop header is an IndentationError.
item = pywikibot.ItemPage(repo, "Q9684")
print(item.get()['aliases']['en'])


['New York Times', 'The Gray Lady', 'nytimes.com', 'New-York Daily Times', 'The NY Times', 'N. Y. Times', 'The New York Daily Times', 'New York Times (newspaper)', 'N Y Times', '@nytimes', 'The NYT', 'The Paper of Record', 'NYTimes', 'The Sunday Times', 'NYT', 'The Times', 'Times']


In [82]:
# We have found all the aliases of The New York Times. From the quotes dataset we extracted
# "nytimes.com", which is indeed present in the aliases.

In [None]:
# So now we know that "nytimes.com" corresponds to QID Q9684 in Wikidata.
# We can do the same for all the journals we have extracted from the quotes.