In [2]:
import csv
import json
import os
import uuid
import tqdm
import networkx as nx
import re

## Record Chunker

In [3]:
data = []

with open('./business_data/source_data/Place-Complete-v2-2023-09-13.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        data.append(row)
    file.close()
    
#with open('./business_data/source_data/Producing-Objects-v1-2023-08-07.csv') as file:
#    reader = csv.DictReader(file)
#    for row in reader:
#        data.append(row)
#    file.close()

In [4]:
data.sort(key=lambda x: x['ResourceID']) # If needed, reorder

In [5]:
def save_csv_files(data, file_prefix): ## Thank you ChatGPT
    num_records_per_file = 1000
    num_files = len(data) // num_records_per_file + 1

    for i in range(num_files):
        start_index = i * num_records_per_file
        end_index = start_index + num_records_per_file
        records = data[start_index:end_index]

        filename = f"{file_prefix}_{i+1}.csv"
        with open(filename, 'w') as file:
            fieldnames = list(data[0].keys())
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            for row in records:
                writer.writerow(row)
        
        

In [6]:
save_csv_files(data, 'Place')

In [262]:
# concatenate organisations and supplied csvs

# orgs

organisations = []

with open('./business_data/source_data/Organisation-Complete-v3-2023-06-30.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        organisations.append(row)
    file.close()
    
with open('./business_data/source_data/OrganisationsFromObjects-v1-2023-07-11.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        organisations.append(row)
    file.close()

In [263]:
with open('./business_data/source_data/Organisation-Complete-v4-2023-07-11.csv', 'w') as file:
    writer = csv.DictWriter(file, fieldnames=list(organisations[0].keys()))
    writer.writeheader()
    organisations.sort(key=lambda row: row['ResourceID'])
    for row in organisations:
        writer.writerow(row)
    file.close()

In [71]:
# Supplied

supplying = []

with open('business_data/source_data/SuppliedFromMisc-v2-2023-06-14.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        supplying.append(row)
    file.close()
    
with open('business_data/source_data/SuppliedFromRelations-v1-2023-06-14.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        supplying.append(row)
    file.close()

In [73]:
fieldnames = []
for s in supplying:
    for k in s.keys():
        if k not in fieldnames:
            fieldnames.append(k)
            
print(fieldnames)

['Supplying Start Date Value', 'Supplying Start Date Type', 'Supplying Start Date Confidence Note', 'Supplying Start Date Confidence Probability Value', 'Supplying Start Date Confidence Type', 'Supplying Start Date - Additional Notes', 'Supplying Start Date Confidence Assessment - Additional Notes', 'Supplying End Date Value', 'Supplying End Date Type', 'Supplying End Date Confidence Note', 'Supplying End Date Confidence Probability Value', 'Supplying End Date Confidence Type', 'Supplying End Date - Additional Notes', 'Supplying End Date Confidence Assessment - Additional Notes', 'Supplied Objects of Type', 'Supplied Specific Object(s)', 'Supplied Object(s) Produced By', 'Supplying - Additional Notes', 'Supplying - Original SIMON Text', 'ResourceID', 'Supplied By', 'Supplied To', 'Supplying End Date', 'Supplying End Date Confidence Assessment', 'Supplying Start Date', 'Supplying Start Date Confidence Assessment', 'Supplying - Original SIMON Text ']


In [None]:
with open('./business_data/source_data/SupplyingComplete-v1-2023-06-14.csv', 'w') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    supplying.sort(key=lambda row: row['ResourceID'])
    for row in supplying:
        writer.writerow(row)
    file.close()

In [7]:
# Concatenate Memberships

guild_memberships = []

misc_memberships = []

with open('./business_data/source_data/Guild-Memberships-v1-2023-06-06.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        guild_memberships.append(row)
    file.close()

with open('./business_data/source_data/MembershipFromMisc-v2-2023-06-19-csv.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        misc_memberships.append(row)
    file.close()

In [8]:
len(guild_memberships)

1841

In [9]:
len(misc_memberships)

143

In [10]:
guild_memberships[0].keys()

dict_keys(['ResourceID', 'Membership Start Date', 'Membership Start Date Confidence Assessment', 'Membership Start Date Confidence Type', 'Membership Start Date Confidence Note', 'Membership Start Date Confidence Probability Value', 'Membership Start Date Confidence Assessment - Additional Notes', 'Membership Start Date - Additional Notes', 'Membership Start Date Type', 'Membership Start Date Value', 'Membership - Original SIMON Text', 'Membership Type', 'Membership End Date', 'Membership End Date Type', 'Membership End Date Confidence Assessment', 'Membership End Date Confidence Note', 'Membership End Date Confidence Probability Value', 'Membership End Date Confidence Assessment - Additional Notes', 'Membership End Date Confidence Type', 'Membership End Date - Additional Notes', 'Membership End Date Value', 'Membership - Additional Notes', 'Member of Organisation', 'Member'])

In [11]:
misc_memberships[0].keys()

dict_keys(['\ufeffMember - Makers_ref', 'Member - Name', 'Member of Organisation - Name', 'Member of Organisation - UUID', 'Membership Start Date Value', 'Membership Start Date Type', 'Membership Start Date Confidence Note', 'Membership Start Date Confidence Probability Value', 'Membership Start Date Confidence Type', 'Membership Start Date  - Additional Notes', 'Membership Start Date Confidence Assessment  - Additional Notes', 'Membership End Date Value', 'Membership End Date Type', 'Membership End Date Confidence Note', 'Membership End Date Confidence Probability Value', 'Membership End Date Confidence Type', 'Membership End Date - Additional Notes', 'Membership End Date Confidence Assessment  - Additional Notes', 'Membership Type', 'Membership - Additional Notes', 'Membership - Original SIMON Text', 'ResourceID', 'Member', 'Member of Organisation'])

In [12]:
set(misc_memberships[0].keys()).intersection(set(guild_memberships[0].keys()))

{'Member',
 'Member of Organisation',
 'Membership - Additional Notes',
 'Membership - Original SIMON Text',
 'Membership End Date - Additional Notes',
 'Membership End Date Confidence Note',
 'Membership End Date Confidence Probability Value',
 'Membership End Date Confidence Type',
 'Membership End Date Type',
 'Membership End Date Value',
 'Membership Start Date Confidence Note',
 'Membership Start Date Confidence Probability Value',
 'Membership Start Date Confidence Type',
 'Membership Start Date Type',
 'Membership Start Date Value',
 'Membership Type',
 'ResourceID'}

In [13]:
guild_memberships[0]

{'ResourceID': 'bd56d3d2-bf1d-3948-9754-32529c66e3cd',
 'Membership Start Date': '',
 'Membership Start Date Confidence Assessment': '',
 'Membership Start Date Confidence Type': '',
 'Membership Start Date Confidence Note': '',
 'Membership Start Date Confidence Probability Value': '',
 'Membership Start Date Confidence Assessment - Additional Notes': '',
 'Membership Start Date - Additional Notes': '',
 'Membership Start Date Type': '',
 'Membership Start Date Value': '',
 'Membership - Original SIMON Text': '',
 'Membership Type': '',
 'Membership End Date': '',
 'Membership End Date Type': '',
 'Membership End Date Confidence Assessment': '',
 'Membership End Date Confidence Note': '',
 'Membership End Date Confidence Probability Value': '',
 'Membership End Date Confidence Assessment - Additional Notes': '',
 'Membership End Date Confidence Type': '',
 'Membership End Date - Additional Notes': '',
 'Membership End Date Value': '',
 'Membership - Additional Notes': '',
 'Member of 

## Duplicate checker

In [118]:
files = {}

directory = os.fsencode('./business_data/')
    
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        files[filename] = []
        with open(os.path.join(directory, file)) as file:
            reader = csv.DictReader(file)
            for row in reader:
                files[filename].append(row)
            file.close()
        continue
    else:
        continue

In [103]:
# Check for duplicate rows

for filename, records in files.items():
    total_values = len(records)
    unique_values = len(set([json.dumps(x) for x in records]))
    if total_values != unique_values:
        print('{filename}: total: {total_values} unique: {unique_values}'.format(filename=filename, total_values=total_values, unique_values=unique_values))
    

Education.csv: total: 243 unique: 238
Membership_2.csv: total: 984 unique: 983
Supplying.csv: total: 68 unique: 67
Membership_1.csv: total: 1000 unique: 999
Business Appointment.csv: total: 211 unique: 202


In [104]:
# Check for duplicate keys within files

for filename, records in files.items():
    resource_ids = [x['ResourceID'] for x in records]
    total_values = len(records)
    unique_values = len(set([json.dumps(x) for x in records]))
    total_ids = len(resource_ids)
    unique_ids = len(set(resource_ids))
    if unique_ids != unique_values:
        print('{filename}: total_ids: {total_ids} unique_ids: {unique_ids}'.format(filename=filename, total_ids=total_ids, unique_ids=unique_ids))
    

Supplying.csv: total_ids: 68 unique_ids: 66


In [105]:
# Check for duplicate keys across all files

all_resource_ids = []

for filename, records in files.items():
    resource_ids = [x['ResourceID'] for x in records]
    all_resource_ids.extend(resource_ids)

print('total: ' + str(len(all_resource_ids)))
print('unique: ' + str(len(set(all_resource_ids))))

total: 82663
unique: 80081


In [108]:
# We have duplicate keys. Find out where they are

files_with_ids = {}

for filename, records in files.items():
    files_with_ids[filename] = [x['ResourceID'] for x in records]

In [110]:
for filename, ids in tqdm.tqdm(files.items()):
    for resource_id in ids:
        for file_to_check, ids_to_check in files.items():
            if file_to_check == filename:
                continue
            if resource_id in ids_to_check:
                print(file_to_check)
                

100%|███████████████████████████████████████████| 92/92 [01:21<00:00,  1.13it/s]


In [91]:
# Advertisings are the duplicates - let's strip out the duplicate rows

advertising = []

with open('./business_data/source_data/Advertising-v2-2023-06-29.csv') as file:
    reader = csv.reader(file)
    for row in reader:
        if row not in advertising:
            advertising.append(row)
    file.close()

In [93]:
advertising[0]

['MakerID',
 'Advertised Maker',
 'Advertised Maker Occupation',
 'Advertising Start Date Value',
 'Advertising Start Date Type',
 'Advertising Start Date Confidence Note',
 'Advertising Start Date Confidence Probability Value',
 'Advertising Start Date Confidence Type',
 'Advertising Start Date  - Additional Notes',
 'Advertising Start Date Confidence Assessment  - Additional Notes',
 'Advertising End Date Value',
 'Advertising End Date Type',
 'Advertising End Date Confidence Note',
 'Advertising End Date Confidence Probability Value',
 'Advertising End Date Confidence Type',
 'Advertising End Date - Additional Notes',
 'Advertising End Date Confidence Assessment  - Additional Notes',
 'Advertised Objects in Scientific Domain',
 'Advertised Objects of Material',
 'Advertised Objects for use in Occupation',
 'Produced Advertisement',
 'Advertising - Additional Notes',
 'Advertising - Original SIMON Text',
 'ResourceID']

In [95]:
with open('./business_data/source_data/Advertising-v3-2023-06-29.csv', 'w') as file:
    writer = csv.writer(file)
    for row in advertising:
        writer.writerow(row)
    file.close()

In [113]:
total = 0

for filename, records in files.items():
    fields = len(records[0].keys()) * len(records)
    total = total + fields

In [114]:
total

2331627

In [146]:
# Let's make a graph of the whole thing

G = nx.Graph()

# Add nodes
for filename, records in tqdm.tqdm(files.items()):
    props = list(records[0].keys())
    for r in records:
        resource_id = r['ResourceID']
        node_dict = {}
        node_type = filename[0:-4]
        node_dict['Entity Type'] = re.sub(r'_\d+', '', node_type)
        for p in props:
            if p == 'ResourceID':
                continue
            node_dict[p] = r[p]
        G.add_nodes_from([(resource_id, node_dict)])

100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [00:00<00:00, 224.78it/s]


In [147]:
# Add edges

for filename, records in tqdm.tqdm(files.items()):
    props = list(records[0].keys())
    for r in records:
        resource_id = r['ResourceID']
        for p in props:
            if 'ontologyProperty' in r[p]:
                relation = json.loads(r[p])
                target_node = relation[0]['resourceId']
                ontology_property = relation[0]['ontologyProperty']
                G.add_edge(resource_id, target_node, ontology_property=ontology_property)
            #try:
            #    relation = json.loads(r[p])
            #    target_node = relation['resourceId']
            #    ontology_property = relation['ontologyProperty']
            #    G.add_edge(resource_id, target_node, ontology_property=ontology_property)
            #except Exception as e:
            #    continue

100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [00:00<00:00, 228.29it/s]


In [148]:
len(G.edges)

109744

In [149]:
nx.write_gexf(G, './SEMSIM-total-graph-v1-2023-07-01.gexf')

In [135]:
filename

'Advertising_2.csv'

In [141]:
filename.replace(r'_d', '').replace('.csv', '')

'Advertising_2'

In [142]:
str.replace?

[0;31mSignature:[0m [0mstr[0m[0;34m.[0m[0mreplace[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mold[0m[0;34m,[0m [0mnew[0m[0;34m,[0m [0mcount[0m[0;34m=[0m[0;34m-[0m[0;36m1[0m[0;34m,[0m [0;34m/[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return a copy with all occurrences of substring old replaced by new.

  count
    Maximum number of occurrences to replace.
    -1 (the default value) means replace all occurrences.

If the optional argument count is given, only the first count occurrences are
replaced.
[0;31mType:[0m      method_descriptor

In [145]:
re.sub(r'_\d+', '', filename)

'Advertising.csv'

In [150]:
lines = []
with open('./resource_import.log') as file:
    for line in file:
        lines.append(line)
    file.close()

In [167]:
lines[0].search(r"The related resource with id '[a-zA-Z0-9-]")

AttributeError: 'str' object has no attribute 'search'

In [176]:
m = re.search(r"The related resource with id '[a-zA-Z0-9-]+", lines[0])

In [175]:
re.match(r"The related resource with id '[a-zA-Z0-9-]+", lines[0])

In [172]:
m.group(0).split('\'')[1]

'eaf35b64-e131-32bc-a107-d0269a446ea1'

In [182]:
presumed_missing_ids = []

for l in lines:
    m = re.search(r"The related resource with id '[a-zA-Z0-9-]+", l)
    if m:
        resource_id = m.group(0).split('\'')[1]
        if resource_id not in presumed_missing_ids:
            presumed_missing_ids.append(resource_id)

    

In [208]:
len(presumed_missing_ids)

9778

In [180]:
all_ids = []

for filename, records in tqdm.tqdm(files.items()):
    for r in records:
        resource_id = r['ResourceID']
        if resource_id not in all_ids:
            all_ids.append(resource_id)

100%|███████████████████████████████████████████████████████████████████████████████████| 91/91 [00:22<00:00,  3.97it/s]


In [181]:
len(all_ids)

80081

In [183]:
actually_missing_ids = []

for r_id in presumed_missing_ids:
    if r_id not in all_ids:
        actually_missing_ids.append(r_id)

In [184]:
len(actually_missing_ids)

84

In [198]:
unstitched_records = {}

records_to_create = []

for filename, records in tqdm.tqdm(files.items()):
    fn = filename[0:-4]
    fn = re.sub(r'_\d+', '', fn)
    for row in records:
        row_str = '|'.join([x for x in row.values()])
        relations = re.findall(r'[a-zA-Z0-9-]{36}', row_str)
        for r in relations:
            if r in actually_missing_ids:
                for key, value in row.items():
                    if r in value:
                        records_to_create.append(value)
                if fn not in unstitched_records.keys():
                    unstitched_records[fn] = []
                unstitched_records[fn].append((row, r))
            

100%|███████████████████████████████████████████████████████████████████████████████████| 91/91 [00:00<00:00, 97.42it/s]


AttributeError: 'dict' object has no attribute 'sort'

In [199]:
for filename, records in unstitched_records.items():
    print(filename, len(records))

At Address 174
Advertising 124
Marriage 4
Membership 19
Person 4
Supplying 1
Business Appointment 23
Employment 6
Takeover 37
Apprenticeship 21
Object 19
Royal Appointment 163


In [210]:
missing_makers = []

for r in unstitched_records['At Address']:
    maker_id = r[0]['maker_id']
    if maker_id not in missing_makers:
        missing_makers.append(maker_id)

In [211]:
for r in unstitched_records['Advertising']:
    maker_id = r[0]['MakerID']
    if maker_id not in missing_makers:
        missing_makers.append(maker_id)

In [214]:
for r in unstitched_records['Royal Appointment']:
    maker_id = r[0]['\ufeffRoyal Appointment - Maker Appointed - Makers_ref']
    if maker_id not in missing_makers:
        missing_makers.append(maker_id)

In [215]:
len(missing_makers)

206

In [219]:
unstitched_records['Takeover'][0]

({'ResourceID': '728ed30a-3903-381c-8ddd-37d2d93d20b3',
  'Taken Over By': '[{"inverseOntologyProperty": "http://www.toolsofknowledge.org/SIMEOn#tookOverIn", "ontologyProperty": "http://www.toolsofknowledge.org/SIMEOn#takenOverBy", "resourceId": "355f984e-fc18-3ca2-9e55-475a99eded5f", "resourceXresourceId": ""}]',
  'Took Over': '[{"inverseOntologyProperty": "http://www.toolsofknowledge.org/SIMEOn#takenOverIn", "ontologyProperty": "http://www.toolsofknowledge.org/SIMEOn#tookOver", "resourceId": "d3e3d3d0-02ca-3b2e-9996-349ebf92afce", "resourceXresourceId": ""}]',
  'Takeover - Additional Notes ': '',
  'Takeover Date': '',
  'Takeover Date Confidence Assessment': '',
  'Takeover Date Confidence Type': '',
  'Takeover Date Confidence Note ': '',
  'Takeover Date Confidence Probability Value ': '',
  'Takeover Date Confidence Assessment - Additional Notes ': '',
  'Takeover Date - Additional Notes ': '',
  'Takeover Date Type': '',
  'Takeover Date Value': ''},
 '355f984e-fc18-3ca2-9e55-

In [207]:
len(missing_makers)

83

In [221]:
len(missing_makers)

206

In [235]:
did_not_load = []

for filename, records in files.items():
    if ('Person' in filename):
        for row in records:
            id_to_check = row['Simon ID']
            if id_to_check in missing_makers:
                did_not_load.append(id_to_check)
                
for filename, records in files.items():
    if ('Organisation' in filename):
        for row in records:
            id_to_check = row['SIMON ID']
            if id_to_check in missing_makers:
                did_not_load.append(id_to_check)

In [236]:
len(did_not_load)

123

In [237]:
not_in_data = []

for m in missing_makers:
    if m not in did_not_load:
        not_in_data.append(m)
    

In [238]:
len(not_in_data)

83

In [24]:
# Create a csv of all the concepts and ids in the package

files = []

directory = os.fsencode('./reference_data/concepts/')
    
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename != 'e7cafa53-d330-3705-b19b-665e4862b0bf.xml':
        continue
    if filename.endswith(".xml"):
        with open(os.path.join(directory, file)) as file:
            file_string = file.read()
            files.append(file_string)
            file.close()

In [25]:
preflabels = []

for f in files:
    matches = re.findall(r'<skos:prefLabel xml:lang="en">\{.*\}', f)
    if len(matches):
        for m in matches:
            preflabels.append(m)

In [26]:
string = '<skos:prefLabel xml:lang="en">{"id": "6ec458a2-36cb-424e-84a6-0773529b05a0", "value": "Fine"}'
re.findall(r'<skos:prefLabel xml:lang="en">\{.*\}', string)

['<skos:prefLabel xml:lang="en">{"id": "6ec458a2-36cb-424e-84a6-0773529b05a0", "value": "Fine"}']

In [27]:
terms = []

for pf in preflabels:
    pf_obj = json.loads(pf[30:])
    terms.append(pf_obj)

In [28]:
with open('./educator-role-types-v1-2023-10-09.csv', 'w') as file:
    fieldnames = list(terms[0].keys())
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for row in terms:
        writer.writerow(row)
    file.close()

In [301]:
preflabels[0]

'<skos:prefLabel xml:lang="en">{"id": "554ca2fa-c779-4b75-bff5-db5ac6443055", "value": "tea"}'

## Reconciliation data

In [2]:
data = []

with open('./business_data/source_data/Place-Complete-v2-2023-09-13.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        data.append(row)
    file.close()

In [3]:
data[0].keys()

dict_keys(['ResourceID', 'Place Name', 'Place - Additional Notes', 'Place - Getty Identifier', 'Place - Geonames Identifier', 'Location', 'Place Documented In Source'])

In [4]:
def save_csv(resource_list, filepath):
    with open(filepath, 'w') as file:
        fieldnames = list(resource_list[0].keys())
        writer = csv.DictWriter(file, fieldnames)
        writer.writeheader()
        for row in resource_list:
            writer.writerow(row)
        file.close()

In [5]:
places = [{'Place Name': x['Place Name'], 'ResourceID': x['ResourceID']} for x in data]

In [6]:
len(places)

2388

In [9]:
save_csv(places, './places-simple-v1-2023-09-26.csv')

In [22]:
data = []

with open('./business_data/source_data/Organisation-Complete-v4-2023-07-11.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        data.append(row)
    file.close()

In [12]:
orgs = [{'Organisation Name': x['Organisation Name'], 'ResourceID': x['ResourceID']} for x in data]

In [13]:
orgs[0]

{'Organisation Name': 'MINNS & CO., Sidney',
 'ResourceID': '00018eea-dcc6-3160-9ccc-7f5a773299eb'}

In [14]:
save_csv(orgs, './organisations-simple-v1-2023-09-26.csv')

In [15]:
data = []

with open('./business_data/source_data/Person-v5-2023-06-08.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        data.append(row)
    file.close()

In [16]:
persons = [{'Given Name': x['Given Name'], 'Surname': x['Surname'], 'ResourceID': x['ResourceID']} for x in data]

In [18]:
save_csv(persons, './persons-simple-v1-2023-09-26.csv')

In [24]:
orgs2 = [{'Simon ID': x['SIMON ID'], 'ResourceID': x['ResourceID']} for x in data]

In [25]:
save_csv(orgs2, './organisations-simonId-to-ResoourceID-v1-2023-09-29.csv')