In [50]:
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
import xml.etree.ElementTree as et
from zlib import crc32 
import xml.dom.minidom as md
import html
import yaml

In [13]:
# Load the thesis spreadsheet, then narrow it to University of Waterloo
# records that actually carry a URI (rows whose URI is missing are dropped).
all_theses = pd.read_excel('ThesesDissertations.xlsx')
from_waterloo = all_theses['Institution'] == 'University of Waterloo'
thesis_df = all_theses[from_waterloo].dropna(subset=['URI'])

In [14]:
# Scrape the full metadata record of every thesis whose URI is a handle.net
# link. Each successfully parsed page becomes a dict mapping the first cell of
# every metadata table row to the second, plus a 'link' entry for the thesis
# file. Pages that cannot be fetched or parsed are reported and skipped.
meta_data = []
for url in thesis_df[thesis_df['URI'].str.startswith("http://hdl.handle.net/")]['URI']:
    # Rewrite the handle.net URI into the repository's full-record page URL.
    record_url = url.replace('http://hdl.handle.net/','https://uwspace.uwaterloo.ca/handle/').strip()+'?show=full'
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(record_url) as fp:
            mystr = fp.read().decode("utf8")

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed (and no warning is emitted).
        soup = BeautifulSoup(mystr, "html.parser")
        table = soup.find("table", attrs={"class":"ds-includeSet-table detailtable table table-striped table-hover"})
        link = soup.find("div", attrs={"class":"file-link col-xs-6 col-xs-offset-6 col-sm-2 col-sm-offset-0"}).find('a').attrs['href']

        thesis = {}
        for tr in table.findAll('tr'):
            cells = tr.findAll('td')
            thesis[cells[0].text] = cells[1].text
        thesis['link'] = 'https://uwspace.uwaterloo.ca/'+link
        meta_data.append(thesis)
    except Exception as exc:
        # Best effort: report the page that failed (and why) and carry on.
        # Catching Exception rather than everything keeps Ctrl-C / kernel
        # interrupt working during a long scrape.
        print(f"{record_url} ({type(exc).__name__}: {exc})")

    

https://uwspace.uwaterloo.ca/handle/10315/39093?show=full


In [55]:
def handle_HTML_entities(file_name: str):
    """Rewrite an XML file in place with HTML entities in element text decoded.

    The file is parsed, the ``text`` of every element that has non-empty text
    is passed through ``html.unescape``, and the tree is written back to the
    same path as UTF-8 with an XML declaration.

    Args:
        file_name: Path of the XML file to read and overwrite.
    """
    document = et.parse(file_name)

    # Elements without text (None or empty string) are left untouched.
    for node in document.iter():
        if node.text:
            node.text = html.unescape(node.text)

    with open(file_name, "wb") as out_file:
        document.write(out_file, encoding='UTF-8', xml_declaration=True, method='xml')

def compute_hash(value: bytes) -> str:
    """Return the CRC-32 of ``value`` as eight lowercase hexadecimal digits."""
    # The mask keeps the checksum in the unsigned 32-bit range before formatting.
    return "%08x" % (crc32(value) & 0xFFFFFFFF)

def generate_bibkey(title, author, year, bibkey_list):
    """Build a bibkey that does not collide with any key in ``bibkey_list``.

    Candidate keys have the form ``<name parts>-<year>-<title words>``, joined
    with ``-``. The first candidate uses one name part and one title word; each
    further candidate adds one more of each until an unused key is found. Name
    parts are all family names followed by all given names.

    If even the longest candidate is already taken (for example the same
    title, author and year seen twice), a numeric suffix ``-2``, ``-3``, ... is
    appended to it. Without this fallback the search would never terminate.

    Args:
        title: Title string; split on whitespace into words.
        author: Iterable of dicts with optional ``'family'`` and ``'given'``
            keys; a missing key contributes an empty string.
        year: Year, converted with ``str``.
        bibkey_list: Container of keys already in use; it is not modified.

    Returns:
        A key that is not in ``bibkey_list``.
    """
    author_list = [item.get('family', '') for item in author] + [item.get('given', '') for item in author]
    title_list = title.split()

    # Beyond this count the slices stop growing, so no new candidate appears.
    max_count = max(len(author_list), len(title_list), 1)
    for count in range(1, max_count + 1):
        bibkey = '-'.join(author_list[:count] + [str(year)] + title_list[:count])
        if bibkey not in bibkey_list:
            return bibkey

    # Every natural candidate is taken: disambiguate the longest one.
    suffix = 2
    while f"{bibkey}-{suffix}" in bibkey_list:
        suffix += 1
    return f"{bibkey}-{suffix}"
    
# Group the scraped records as year -> "department, type" -> handle URI -> record.
paper_dict = {}

for record in meta_data:
    issue_year = record['dc.date.issued'][:4]
    volume_name = record['uws-etd.degree.department'] +', '+ record['dc.type']
    # setdefault creates the year and volume levels the first time they are seen.
    volume_papers = paper_dict.setdefault(issue_year, {}).setdefault(volume_name, {})
    volume_papers[record['dc.identifier.uri']] = record

# Build one XML file per publication year. Each file holds a <collection>
# with one <volume> per "department, type" group; a volume carries a <meta>
# header followed by one <paper> element per thesis in that group.
for year, year_dict in paper_dict.items():
    tag_collection = et.Element("collection")
    tag_collection.set('id', "G"+str(year)[-2:])

    # Bibkeys already issued in this collection; generate_bibkey avoids them.
    bibkey_list = []

    for index, (volume, papers) in enumerate(year_dict.items()):
        tag_vol = et.SubElement(tag_collection, "volume")
        tag_vol.set('id', str(index+1))

        # The volume-level publisher is read from the group's first paper.
        first_item = next(iter(papers.values()))

        tag_meta = et.SubElement(tag_vol, "meta")
        tag_subelement = et.SubElement(tag_meta, "booktitle")
        tag_subelement.text = volume
        tag_subelement = et.SubElement(tag_meta, "publisher")
        tag_subelement.text = first_item['dc.publisher']
        tag_subelement = et.SubElement(tag_meta, "address")
        tag_subelement.text = ""
        tag_subelement = et.SubElement(tag_meta, "year")
        tag_subelement.text = str(year)

        for idx, paper in enumerate(papers.values()):
            tag_paper = et.SubElement(tag_vol, "paper")
            tag_paper.set('id', str(idx+1))
            tag_subelement = et.SubElement(tag_paper, "title")
            tag_subelement.text = paper['dc.title']

            # The author field is split on commas: part 0 is used as the last
            # name and part 1 as the first name. A value with no comma yields
            # an empty first name instead of raising IndexError and aborting
            # the whole export.
            name_parts = paper['dc.contributor.author'].split(',')
            last_name = name_parts[0].strip()
            first_name = name_parts[1].strip() if len(name_parts) > 1 else ''

            tag_subelement = et.SubElement(tag_paper, "author")
            tag_subsubelement = et.SubElement(tag_subelement, "first")
            tag_subsubelement.text = first_name
            tag_subsubelement = et.SubElement(tag_subelement, "last")
            tag_subsubelement.text = last_name

            tag_subelement = et.SubElement(tag_paper, "abstract")
            tag_subelement.text = paper['dc.description.abstract'].replace('\r\n', ' ')

            # The <url> text is "G<yy>-<volume number><3-digit paper number>";
            # its 'hash' attribute is the CRC-32 of that text.
            tag_subelement = et.SubElement(tag_paper, "url")
            tag_subelement.text =  f'G{str(year)[-2:]}-{index+1}{(idx+1):03}'
            tag_subelement.set('hash', compute_hash(str.encode(tag_subelement.text)))

            # The <doi> element stores the handle URI with its resolver prefix removed.
            tag_subelement = et.SubElement(tag_paper, "doi")
            tag_subelement.text = paper['dc.identifier.uri'].replace('http://hdl.handle.net/', '')

            tag_subelement = et.SubElement(tag_paper, "bibkey")
            bibkey = generate_bibkey(paper['dc.title'], [{'given': first_name,'family': last_name}], year, bibkey_list)
            tag_subelement.text = bibkey
            bibkey_list.append(bibkey)

    # Serialise the tree, re-read it through minidom to pretty-print it, then
    # decode any HTML entities left in the element text.
    tree = et.ElementTree(tag_collection)
    file_name = "G"+str(year)[-2:]+".xml"
    with open (file_name, "wb") as files :
        tree.write(files, encoding='UTF-8', xml_declaration=True)

    xml_pretty_str = md.parse(file_name).toprettyxml(encoding='UTF-8').decode()
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(xml_pretty_str)

    handle_HTML_entities(file_name)

# Collect one canonical-name entry per distinct author and save them as YAML.
dict_file = []
for item in meta_data:
    # The author field is split on commas: part 0 is used as the last name and
    # part 1 as the first name. A value with no comma yields an empty first
    # name instead of raising IndexError.
    name_parts = item['dc.contributor.author'].split(',')
    last_name = name_parts[0].strip()
    first_name = name_parts[1].strip() if len(name_parts) > 1 else ''

    # The id is "<first>-<last>" with periods removed from both parts.
    element = {'canonical' : {'first': first_name, 'last': last_name}, 'id':first_name.replace('.', '')+'-'+last_name.replace('.', '')}
    if element not in dict_file: dict_file.append(element)

# yaml.dump returns None when given a stream, so its result is not kept.
with open(r'name_variants.yaml', 'w') as file:
    yaml.dump(dict_file, file, default_flow_style=None)