Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
102 lines (69 sloc) 5.76 KB
#!/bin/bash
# step 0, initialise
# Work from ~/scripts, which should also contain the pywikibot checkout
# (pywiki-wp) used for the upload at the end of the script.
# Abort if the cd fails: the cleanup below deletes working/* relative to the
# current directory, so carrying on from somewhere else could remove
# unrelated files.
cd ~/scripts || { echo "sourcefinder: cannot cd to ~/scripts" >&2; exit 1; }
mkdir -p working prepped odnb
# -f: an empty working/ directory (e.g. on the first run) is not an error.
rm -f -- working/*
# The prettified list of ODNB citations, from the sourcefinder repository.
# --fail makes curl return non-zero on an HTTP error instead of saving the
# server's error page as if it were data.
curl --fail "https://raw.githubusercontent.com/generalist/sourcefinder/master/citations.tsv" > odnb/citations.tsv \
  || { echo "sourcefinder: could not download citations.tsv" >&2; exit 1; }
# Wikidata SPARQL query, requested as tab-separated values. URL-decoded, the
# query is:
#   SELECT ?cid ?odnb ?article WHERE {
#     ?cid wdt:P1415 ?odnb .
#     OPTIONAL {
#       ?article schema:about ?cid .
#       ?article schema:inLanguage "en" .
#       ?article schema:isPartOf <https://en.wikipedia.org/> .
#     }
#   }
curl --fail -H "Accept: text/tab-separated-values" "https://query.wikidata.org/bigdata/namespace/wdq/sparql?query=PREFIX%20schema%3A%20%3Chttp%3A%2F%2Fschema.org%2F%3E%0APREFIX%20wikibase%3A%20%3Chttp%3A%2F%2Fwikiba.se%2Fontology%23%3E%0APREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fcid%20%3Fodnb%20%3Farticle%20WHERE%20%7B%0A%20%20%20%20%3Fcid%20wdt%3AP1415%20%3Fodnb%20.%0A%20%20%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%20%3Farticle%20schema%3Aabout%20%3Fcid%20.%0A%20%20%20%20%20%20%3Farticle%20schema%3AinLanguage%20%22en%22%20.%0A%20%20%20%20%20%20%3Farticle%20schema%3AisPartOf%20%3Chttps%3A%2F%2Fen.wikipedia.org%2F%3E%20.%0A%20%20%20%20%7D%0A%7D%20" > working/sparqldownload \
  || { echo "sourcefinder: Wikidata SPARQL download failed" >&2; exit 1; }
# Strip carriage returns (the download has ^M line endings).
tr -d '\r' < working/sparqldownload > working/odnb-sparql
# working/odnb-sparql (which takes a couple of secs to generate) now holds the
# SPARQL result, one row per Wikidata item:
# COL 1 - Wikidata item URL
# COL 2 - ODNB number
# COL 3 - enwiki URL if this exists
grep "wikipedia.org" working/odnb-sparql | sed "s/<https:\/\/en.wikipedia.org\/wiki\///g" | sed "s/>//g" | sed "s/%20/_/g" | sed "s/%2C/,/g" | sed "s/%C3%A9/é/g" | sed "s/%C3%B6/ö/g" | sed "s/%C3%93/Ó/g" | sed "s/%27/'/g" | sed "s/%28/(/g" | sed "s/%29/)/g" | sed "s/%E2%80%93/–/g" | sed "s/%C3%AD/í/g" | sed "s/%C5%99/ř/g" | sed "s/%C3%A1/á/g" | sed "s/%C3%9A/Ú/g" | sed "s/%C3%A4/ä/g" | sed "s/%C3%86/Æ/g" | sed "s/%C3%AB/ë/g" | sed "s/%C3%A4/ä/g" | sed "s/%C5%84/ń/g" | sed "s/%C3%BC/ü/g" | sed "s/%C3%BC/ü/g" | cut -f 2-3 | sed "s/^101[0]*/id=/g" | tail -500 > working/odnb-trimmed
# tail -500
# for the purposes of testing , only look at last 500 entries
# this seems to be sorting them randomly, though, which is weird!
cut -f 2 working/odnb-trimmed > working/odnb-namelist
# get the TSV table of WP articles with ODNB cites from quarry- https://quarry.wmflabs.org/query/2337 for all normal doi's
# first, ID the most recent version of the Quarry query: take the line of the
# query page that mentions qrun_id, then its 4th comma-separated field, then
# the 3rd space-separated word of that field.
curl "https://quarry.wmflabs.org/query/2337" > working/quarry
RUNID=$(grep qrun_id working/quarry | cut -d , -f 4 | cut -d ' ' -f 3)
# Without a run id the download below cannot work, and every article would
# then be reported as lacking a citation - stop here instead.
if [[ -z "$RUNID" ]]; then
  echo "sourcefinder: could not find a qrun_id in working/quarry" >&2
  exit 1
fi
# Download that run's results, turn the ODNB DOI link prefix into "id=", and
# keep columns 2-3.
curl "https://quarry.wmflabs.org/run/$RUNID/output/0/tsv?download=true" \
  | uniq \
  | sed 's|//dx.doi.org/10.1093%2Fref:odnb%2F|id=|g' \
  | cut -f 2-3 \
  > working/doilist
cut -f 2 working/doilist | sort -u > working/doi-names
# working/doilist is now id=xxxx, tab, pagename
# so now we have
# working/odnb-trimmed - sample of "all articles matching a Wikidata ODNB ID"
# working/odnb-namelist - just the page names from above
# citations.tsv - the prettified list of ODNB citations
# working/doilist - all articles with extant DOI links that reflect an ODNB template
# working/doi-names - just the page names from above
# so! we want to make a list of all articles in the WD list, and for each one, does it have a nice citation?
# this is the absolute crudest first pass - all the articles which have no citeODNB or similar, whether it's relevant or not

#######################################
# Print the rows of a TSV file whose second column is NOT listed in a names
# file. Names are compared as exact, whole strings - never as regular
# expressions or substrings - so "John_Smith" being cited does not hide
# "John_Smith_(painter)", and '.', '(' etc. in titles have no special meaning.
# Arguments: $1 - file with one page name per line (pages to exclude)
#            $2 - TSV file; column 2 is the page name
# Outputs:   the surviving rows of $2, unchanged, on stdout
#######################################
rows_without_citation() {
  # FILENAME == ARGV[1] (rather than NR == FNR) stays correct when $1 is empty.
  awk -F '\t' '
    FILENAME == ARGV[1] { cited[$0] = 1; next }
    !($2 in cited)
  ' "$1" "$2"
}

#######################################
# Print exactly one line for an ODNB id: the first line of the citations file
# that contains "<id>|" as a literal string, or a placeholder if there is none.
# Always printing a single line keeps the output aligned row-for-row with the
# candidate list when the two are pasted together.
# Arguments: $1 - id in "id=xxxx" form
#            $2 - citations file
#######################################
first_citation_for() {
  local id=$1 citations=$2
  if [[ -z "$id" ]] || ! grep -m 1 -F -- "${id}|" "$citations"; then
    printf "\tcitation not available\n"
  fi
}

# working/odnb-candidates - ID and pagename for everything which definitely
# doesn't have the template. One awk pass replaces the per-name grep loop.
# (working/odnb-trimmed already has "id=" in column 1 and carriage returns
# were stripped when working/odnb-sparql was written, so no further sed
# clean-up is needed here.)
rows_without_citation working/doi-names working/odnb-trimmed > working/odnb-candidates
# just the page names of the candidates
cut -f 2 working/odnb-candidates > working/articles-no-odnb
# look up the prettified citation for each candidate, one output line per row
while IFS= read -r odnb_id; do
  first_citation_for "$odnb_id" odnb/citations.tsv
done < <(cut -f 1 working/odnb-candidates) > working/candidates-cited
# columns after paste: id, pagename, <citations.tsv col 1>, <citations.tsv col 2>
# keep pagename and the citation text
paste working/odnb-candidates working/candidates-cited | cut -f 2,4 > working/output
#######################################
# Turn "pagename<TAB>citation" lines (stdin) into wikitable rows (stdout).
# Each kept line becomes
#   | [[pagename]]<TAB>||<nowiki>citation</nowiki>
# followed by a "|-" row separator.
# for the time being, this omits any without citations ("citation not
# available") or with malformed citations (no title, i.e. "title=|").
# Filtering happens before formatting, so an omitted entry does not leave an
# orphan "|-" line behind.
#######################################
format_table_rows() {
  awk '
    index($0, "title=|") || index($0, "citation not available") { next }
    {
      gsub(/\t/, "]]\t||<nowiki>")
      print "| [[" $0 "</nowiki>"
      print "|-"
    }
  '
}

# Build the page for pagefromfile.py: a {{-start-}} ... {{-stop-}} section
# with the page title in ''' ''' on the first line inside it.
# The single redirect truncates any report left over from a previous run, so
# no separate rm is needed (and a missing file on the first run is no error).
# $RUNID is the Quarry run id found earlier in the script.
{
  # insert suitable headers here
  cat <<EOF
{{-start-}}
'''User:AGbot/Articles_with_no_ODNB_citation'''
Report of Wikipedia articles matched to ODNB subjects, which do not currently appear to have a {{tl|cite ODNB}} citation template matching the subject of the article. They may contain a differently-formed citation. Generated on $(date +%F), from [https://quarry.wmflabs.org/query/2337 Quarry run #$RUNID].

This is a trial run with only a few hundred entries.

Source code available at [https://github.com/generalist/sourcefinder/blob/master/sourcefinder github].

{|
|-
EOF
  format_table_rows < working/output
  printf '%s\n' "|}" "{{-stop-}}"
} > prepped/articles-no-odnb-table
# and then run pywikibot to upload 'table' to the appropriate location
# Abort if the pywikibot directory is missing, rather than trying to run
# login.py / pagefromfile.py from whatever directory we are currently in.
cd ~/scripts/pywiki-wp || { echo "sourcefinder: cannot cd to ~/scripts/pywiki-wp" >&2; exit 1; }
python login.py
# the -file: path is relative to pywiki-wp, hence ../prepped
python pagefromfile.py -force -notitle -file:../prepped/articles-no-odnb-table
# next steps -
# 1. have the report look for "other" citations, eg a URL or DOI, or just a text ref to the ODNB
# # (nb text ref can't be done with Quarry! will need to analyse a DB dump for this)
# then flag these up for possible conversion
# 2. do something about special characters in title (what about apostrophes?)