Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
161 lines (125 sloc) 6.99 KB
#!/bin/bash
# Script for creating new MP entries for Wikidata from HoP (History of
# Parliament) items which are known not to exist.
#
# Usage: run with no arguments to read ./source.tsv and write ./quickstatements,
# or pass explicit paths:  SCRIPT [SOURCE_TSV [OUTPUT_FILE]]
# OUTPUT_FILE is removed and rebuilt from scratch on every run.
#
# SOURCE_TSV is tab-separated, one MP per line, with the following columns:
# 1 - HoP URL - only one for now. This is the path that gets appended to
#     http://www.historyofparliamentonline.org/volume/
# 2 - name (if blank, will guess from the slug)
# 3 - parliamentary body ("UK", "GB", "ENG"), plus IRE & SCO for pre-union
#     parliaments. Several codes may share the column, with any separator.
# 4 - born - a year (1800), a day (1800-01-31), or "unk" when unknown
# 5 - died - same formats as born
# 6 - second HoP URL if present
# 7 - third HoP URL if present
# Dates can support either year or day resolution, and any containing a "c"
# (e.g. c1650) will be qualified as "circa".
#
# The output is suitable to be sent to
# http://tools.wmflabs.org/wikidata-todo/quick_statements.php
#
# Known limitations:
# - the name guessing code only works for the simplest case - /smith-john
# - unknown ("unk" or blank) dates produce no date statement at all; no
#   "unknown value" claim is written
# - dates that are neither YYYY nor YYYY-MM-DD are reported on stderr and skipped

readonly HOP_BASE_URL="http://www.historyofparliamentonline.org/volume/"

# Print the arguments as one tab-separated QuickStatements line.
# printf '%s' is used so backslashes in the data are never interpreted.
emit() {
  local IFS=$'\t'
  printf '%s\n' "$*"
}

# Print WORD with its first character upper-cased (nothing for an empty word).
capitalise() {
  local word=$1 first
  [[ -n $word ]] || return 0
  first=$(printf '%s' "${word:0:1}" | tr '[:lower:]' '[:upper:]')
  printf '%s' "${first}${word:1}"
}

# Guess "Forename Surname" from a slug whose third /-separated component
# looks like surname-forename[-...]. Prints nothing if there is no such
# component, and the surname alone if there is no forename part.
guess_name() {
  local slug=$1 leaf surname forename rest
  local -a parts=()
  IFS=/ read -r -a parts <<< "$slug"
  leaf=${parts[2]:-}
  [[ -n $leaf ]] || return 0
  IFS=- read -r surname forename rest <<< "$leaf"
  if [[ -n $forename ]]; then
    printf '%s %s' "$(capitalise "$forename")" "$(capitalise "$surname")"
  else
    printf '%s' "$(capitalise "$surname")"
  fi
}

# emit_memberships BODIES REF
# Create a P39 (position) and a P27 (nationality) line for each parliament
# code found in BODIES. The nationality is not absolutely perfect and will
# miss some edge cases, but better than nothing.
emit_memberships() {
  local bodies=$1 ref=$2
  local entry code position country
  for entry in \
    'ENG Q18018860 Q179876' \
    'GB Q18015642 Q161885' \
    'UK Q16707842 Q174193' \
    'IRE Q18019039 Q215530' \
    'SCO Q18928999 Q230791'; do
    # ENG = English (to 1707), GB = British (1707-1801), UK = 1801 onwards,
    # IRE = Ireland (to 1801), SCO = Scottish (to 1707)
    read -r code position country <<< "$entry"
    if [[ $bodies == *"$code"* ]]; then
      emit LAST P39 "$position" S854 "$ref"
      emit LAST P27 "$country" S854 "$ref"
    fi
  done
}

# emit_date PROPERTY RAW REF
# Write one date statement (P569 born / P570 died) sourced to REF.
# RAW is a year or a YYYY-MM-DD day; a "c" anywhere marks it as circa and adds
# the P1480/Q5727902 qualifier. Blank or "unk" values write nothing.
emit_date() {
  local property=$1 raw=$2 ref=$3
  local year_re='^[0-9]+$'
  local day_re='^[0-9]+-[0-9]{1,2}-[0-9]{1,2}$'
  local stamp
  local -a qualifier=()

  if [[ -z $raw || $raw == unk ]]; then
    return 0
  fi
  if [[ $raw == *c* ]]; then
    raw=${raw/c/}
    qualifier=(P1480 Q5727902)
  fi
  if [[ $raw =~ $year_re ]]; then
    stamp="+${raw}-00-00T00:00:00Z/09" # /09 = year precision
  elif [[ $raw =~ $day_re ]]; then
    stamp="+${raw}T00:00:00Z/11" # /11 = day precision
  else
    printf 'warning: skipping unparseable %s date "%s"\n' "$property" "$2" >&2
    return 0
  fi
  emit LAST "$property" "$stamp" S854 "$ref" "${qualifier[@]}"
}

# emit_description ADJECTIVE BORN DIED
# Write the English description, adding whichever of the dates are known.
emit_description() {
  local adjective=$1 born=$2 died=$3
  local text="$adjective Member of Parliament"
  if [[ -n $born && -n $died ]]; then
    text+=" ($born-$died)"
  elif [[ -n $born ]]; then
    text+=" (born $born)"
  elif [[ -n $died ]]; then
    text+=" (died $died)"
  fi
  emit LAST Den "\"$text\""
}

# convert_line LINE
# Turn one line of the source TSV into the statements for one new item, on
# stdout. Returns 1 (writing nothing) when the line has no slug in column 1.
convert_line() {
  local rest=$1
  local -a cols=()
  # Split by hand: `IFS=$'\t' read` would collapse runs of tabs and so shift
  # every column after an empty one.
  while [[ $rest == *$'\t'* ]]; do
    cols+=("${rest%%$'\t'*}")
    rest=${rest#*$'\t'}
  done
  cols+=("$rest")

  local slug=${cols[0]:-} name=${cols[1]:-} bodies=${cols[2]:-}
  local born_raw=${cols[3]:-} died_raw=${cols[4]:-}
  local alt_one=${cols[5]:-} alt_two=${cols[6]:-}
  local ref adjective born died

  if [[ -z $slug ]]; then
    if [[ -n $1 ]]; then
      printf 'warning: skipping line with no HoP URL: %s\n' "$1" >&2
    fi
    return 1
  fi
  ref="\"${HOP_BASE_URL}${slug}\""

  # Year part only, for the description; "unk" becomes empty.
  born=${born_raw%%-*}
  born=${born/unk/}
  died=${died_raw%%-*}
  died=${died/unk/}

  # Adjective is English if served in an English parliament, otherwise
  # British. Not great for Wales but otherwise fine. Will mean some
  # post-Union MPs are 'English' not 'British' but this is reasonable.
  if [[ $bodies == *ENG* ]]; then
    adjective="English"
  else
    adjective="British"
  fi

  emit CREATE
  # this is a person
  emit LAST P31 Q5 S854 "$ref"
  # this person is a man (they all were)
  emit LAST P21 Q6581097 S854 "$ref"
  # this is the primary HoP identity
  emit LAST P1614 "\"$slug\""
  emit_memberships "$bodies" "$ref"
  emit_date P569 "$born_raw" "$ref"
  emit_date P570 "$died_raw" "$ref"

  # English label: the given name, or a guess from the slug if none was given
  if [[ -z $name ]]; then
    name=$(guess_name "$slug")
  fi
  if [[ -n $name ]]; then
    emit LAST Len "\"$name\""
  fi

  # now the alternate slugs
  if [[ -n $alt_one ]]; then
    emit LAST P1614 "\"$alt_one\""
  fi
  if [[ -n $alt_two ]]; then
    emit LAST P1614 "\"$alt_two\""
  fi

  emit_description "$adjective" "$born" "$died"
  return 0
}

# main [SOURCE_TSV [OUTPUT_FILE]]
# Rebuild OUTPUT_FILE from SOURCE_TSV and report how many records were
# written. Returns 1 if SOURCE_TSV cannot be read.
main() {
  local source_file=${1:-source.tsv}
  local output_file=${2:-quickstatements}
  local line count=0

  # make sure it's cleaned up from a previous run
  rm -f -- "$output_file"
  if [[ ! -r $source_file ]]; then
    printf 'error: cannot read %s\n' "$source_file" >&2
    return 1
  fi

  # `|| [[ -n $line ]]` keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n $line ]]; do
    line=${line%$'\r'} # tolerate CRLF files
    if convert_line "$line"; then
      count=$((count + 1))
    fi
  done < "$source_file" > "$output_file"

  echo "$count records to create"
}

main "$@"