#!/bin/bash
#
# sourcefinder
#
# Builds a report of Wikipedia articles that are matched to ODNB subjects on
# Wikidata but do not appear to carry a matching ODNB citation template, and
# uploads it with pywikibot.

# Step 0: initialise.
#
# Every path below is relative to ~/scripts (which, hopefully, also contains
# pywikibot).  Stop if we cannot get there, otherwise the rm further down
# would empty a "working" directory somewhere else.
cd ~/scripts || exit 1

mkdir -p working prepped odnb

# Start from an empty scratch directory; -f keeps rm quiet when there is
# nothing to delete.
rm -f working/*

# Prettified list of ODNB citations, kept in the sourcefinder repository.
citations_url="https://raw.githubusercontent.com/generalist/sourcefinder/master/citations.tsv"
curl "$citations_url" > odnb/citations.tsv

# SPARQL query against the Wikidata query service, requested as TSV.  It takes
# a couple of seconds to run and returns:
# COL 1 - Wikidata item URL
# COL 2 - ODNB number
# COL 3 - enwiki URL if this exists
sparql_url="https://query.wikidata.org/bigdata/namespace/wdq/sparql?query=PREFIX%20schema%3A%20%3Chttp%3A%2F%2Fschema.org%2F%3E%0APREFIX%20wikibase%3A%20%3Chttp%3A%2F%2Fwikiba.se%2Fontology%23%3E%0APREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0A%0ASELECT%20%3Fcid%20%3Fodnb%20%3Farticle%20WHERE%20%7B%0A%20%20%20%20%3Fcid%20wdt%3AP1415%20%3Fodnb%20.%0A%20%20%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%20%3Farticle%20schema%3Aabout%20%3Fcid%20.%0A%20%20%20%20%20%20%3Farticle%20schema%3AinLanguage%20%22en%22%20.%0A%20%20%20%20%20%20%3Farticle%20schema%3AisPartOf%20%3Chttps%3A%2F%2Fen.wikipedia.org%2F%3E%20.%0A%20%20%20%20%7D%0A%7D%20"
curl -H "Accept: text/tab-separated-values" "$sparql_url" > working/sparqldownload

# The download contains carriage returns (^M); strip every one of them.
< working/sparqldownload tr -d '\r' > working/odnb-sparql

# percent_decode: filter (stdin -> stdout) that turns every well-formed %HH
# escape into the byte it encodes, so page titles come out as plain UTF-8
# regardless of which characters they contain.
percent_decode() {
  local line
  # Double any literal backslash so that printf %b passes it through, then
  # rewrite %HH as \xHH.  A % that is not followed by two hex digits is left
  # exactly as it was.
  sed -e 's/\\/\\\\/g' -e 's/%\([0-9A-Fa-f][0-9A-Fa-f]\)/\\x\1/g' |
    while IFS= read -r line || [[ -n "$line" ]]; do
      printf '%b\n' "$line"
    done
}

# Build working/odnb-trimmed ("id=NNNN<TAB>page_name") from the rows of the
# SPARQL download that have an enwiki article:
#   - strip the <https://en.wikipedia.org/wiki/ ... > wrapper from the URL
#   - turn %20 into "_" so that titles use underscores
#   - keep columns 2-3 (ODNB number, page name)
#   - replace the leading "101" plus zero padding of the ODNB number by "id="
#   - decode the remaining percent-escapes in the title
#
# tail -500: for the purposes of testing, only look at the last 500 entries.
# The query has no ORDER BY, so which rows end up last can differ between
# runs.
grep "wikipedia.org" working/odnb-sparql \
  | sed -e 's|<https://en\.wikipedia\.org/wiki/||g' -e 's/>//g' -e 's/%20/_/g' \
  | cut -f 2-3 \
  | sed 's/^101[0]*/id=/' \
  | tail -500 \
  | percent_decode > working/odnb-trimmed

# Just the page names, one per line.
cut -f 2 working/odnb-trimmed > working/odnb-namelist

# Get the TSV table of WP articles with ODNB cites from Quarry
# (https://quarry.wmflabs.org/query/2337), for all normal DOIs.

# First, identify the most recent run of the Quarry query: the run id is
# scraped from the line of the query page that mentions qrun_id.
curl "https://quarry.wmflabs.org/query/2337" > working/quarry

RUNID=$(grep qrun_id working/quarry | cut -d , -f 4 | cut -d ' ' -f 3)

# Without a run id the download URL below is malformed and everything
# derived from it (including the uploaded report) would be wrong, so stop.
if [[ -z "$RUNID" ]]; then
  printf 'sourcefinder: no qrun_id found in working/quarry\n' >&2
  exit 1
fi

# Rewrite the DOI link prefix to "id=" and keep columns 2-3, so that
# working/doilist is: id=xxxx, tab, pagename.
curl "https://quarry.wmflabs.org/run/$RUNID/output/0/tsv?download=true" \
  | uniq \
  | sed 's|//dx\.doi\.org/10\.1093%2Fref:odnb%2F|id=|g' \
  | cut -f 2-3 > working/doilist

# Distinct page names that already have an ODNB DOI link.
cut -f 2 working/doilist | sort | uniq > working/doi-names

# working/doilist is now: id=xxxx, tab, pagename

# So at this point we have:
# working/odnb-trimmed - sample of "all articles matching a Wikidata ODNB ID"
# working/odnb-namelist - just the page names from odnb-trimmed
# odnb/citations.tsv - the prettified list of ODNB citations
# working/doilist - all articles with extant DOI links that reflect an ODNB template
# working/doi-names - just the page names from doilist

# So! We want to go through all articles in the WD list and ask, for each
# one, whether it has a nice citation.
#
# This is the absolute crudest first pass - all the articles which have no
# cite ODNB or similar, whether it's relevant or not.

# rows_without_citation CITED_NAMES ROWS
#   Print each line of the tab-separated file ROWS whose second column (the
#   page name) does not appear as a whole line in the file CITED_NAMES.
#   Names are compared as exact strings, so "John_Smith" being cited does not
#   hide "John_Smith_(poet)", and characters such as "." are not treated as
#   regular-expression syntax.  An empty CITED_NAMES lets every row through.
rows_without_citation() {
  awk -F '\t' 'FILENAME == ARGV[1] { cited[$0]; next } !($2 in cited)' "$1" "$2"
}

# working/odnb-candidates - ID and page name for everything which definitely
# doesn't have the template.  A single pass replaces the former per-article
# grep loop, which was very slow.
rows_without_citation working/doi-names working/odnb-trimmed > working/odnb-candidates

# working/articles-no-odnb - just the page names of those candidates.
cut -f 2 working/odnb-candidates > working/articles-no-odnb

# lookup_citation ID FILE
#   Print the first line of FILE that contains the literal text "ID|", or the
#   placeholder "<TAB>citation not available" when there is none (or ID is
#   empty, or FILE cannot be read).  Exactly one line is printed per call, so
#   the output stays aligned line-for-line with the list of candidates even
#   when FILE holds several rows for the same ID.
lookup_citation() {
  if [[ -n "$1" ]] && grep -m 1 -F -- "$1|" "$2"; then
    return 0
  fi
  printf '\tcitation not available\n'
}

# One citation line (or placeholder) per candidate, in candidate order.
cut -f 1 working/odnb-candidates | while IFS= read -r odnb_id; do
  lookup_citation "$odnb_id" odnb/citations.tsv
done > working/candidates-cited

# Side by side the columns are: id, page name, citations.tsv column 1,
# citations.tsv column 2.  Keep the page name and the citation text.
paste working/odnb-candidates working/candidates-cited | cut -f 2,4 > working/output

# format_table_rows: filter (stdin -> stdout).
#   Input lines are "page name<TAB>citation".  For the time being, rows with
#   no citation ("citation not available") or a malformed one (empty title,
#   "title=|") are omitted.  Every remaining row becomes a wiki table row
#   linking the page and showing the citation inside <nowiki>, followed by a
#   "|-" row separator.  Filtering happens before formatting so that a
#   dropped row does not leave a stray separator behind.
format_table_rows() {
  grep -v -F -e 'title=|' -e 'citation not available' \
    | awk -F '\t' '{ printf "| [[%s]]\t||<nowiki>%s</nowiki>\n|-\n", $1, $2 }'
}

# Write the page in pywikibot's {{-start-}} ... {{-stop-}} format.  The
# single redirect truncates any report left over from a previous run, so no
# separate rm is needed.
{
  cat <<EOF
{{-start-}}
'''User:AGbot/Articles_with_no_ODNB_citation'''
Report of Wikipedia articles matched to ODNB subjects, which do not currently appear to have a {{tl|cite ODNB}} citation template matching the subject of the article. They may contain a differently-formed citation. Generated on $(date +%F), from [https://quarry.wmflabs.org/query/2337 Quarry run #$RUNID].

This is a trial run with only a few hundred entries.

Source code available at [https://github.com/generalist/sourcefinder/blob/master/sourcefinder github].

{|
|-
EOF
  format_table_rows < working/output
  printf '%s\n' '|}' '{{-stop-}}'
} > prepped/articles-no-odnb-table

# And then run pywikibot to upload the table to the appropriate location.
# The scripts are run from the pywikibot directory and the report is
# addressed relative to it, so give up if that directory is not there.
cd ~/scripts/pywiki-wp || exit 1

python login.py
python pagefromfile.py -force -notitle -file:../prepped/articles-no-odnb-table

# Next steps:
# 1. Have the report look for "other" citations, e.g. a URL or DOI, or just a
#    text reference to the ODNB, then flag these up for possible conversion.
#    (NB: a text reference can't be found with Quarry! This will need
#    analysis of a DB dump.)
# 2. Do something about special characters in titles (what about apostrophes?)