Dataframes are created from the CSV files images.csv, secties_totaal.csv and registers_totaal.csv (all downloaded from www.soundtoll.nl). <br>
Columns are selected and column names are translated. <br>
The method adds a 'reg_id' and a 'section_id', derived from registers_totaal.csv and secties_totaal.csv, to the images data. <br>
The method uses the information about microfilm numbers and scan numbers for registers and sections to add the corresponding reg_id and section_id to the images table. <br>
The main result of the notebook is a CSV file 'img.csv' that contains data about customs entries and their images, registers and sections; the notebook also writes 'registers.csv', 'sections.csv' and 'remarks_images.csv'. <br>

In [2]:
import requests
import pandas as pd
import json

In [4]:
# Local copies of the three source files used by this notebook:
# registers_totaal.csv, secties_totaal.csv and images.csv.
# Adjust each path below to wherever the downloaded files are stored.
images = r"C:\STRO10\images.csv\images.csv"
sections = r"C:\STRO10\secties_totaal.csv\secties_totaal.csv"
registers = r"C:\STRO10\registers_totaal.csv\registers_totaal.csv"

## 1. Preparation

In [9]:
# Locations of the preparation files in the STRO 2.0 GitHub repository.
# Each URL addresses one repository directory through the GitHub "contents" API.
owner = 'dhofu'
repo = 'stro20'
column_selection = 'STRO_20_column_selection'
rename_columns = 'STRO_20_column_names'
corrections = 'STRO_20_corrections'

# One URL per directory, built from the same template.
url_selection, url_rename, url_corrections = (
    f"https://api.github.com/repos/{owner}/{repo}/contents/{directory}"
    for directory in (column_selection, rename_columns, corrections)
)

In [11]:
# Request the listing of each preparation directory from the GitHub API.
# The timeout keeps the notebook from hanging on a stalled connection.
response_column_selection = requests.get(url_selection, timeout=30)
response_rename_columns = requests.get(url_rename, timeout=30)
response_corrections = requests.get(url_corrections, timeout=30)

# Fail here on an HTTP error (e.g. rate limiting or a wrong path) instead of
# later, when the error payload would be iterated as if it were a file listing.
for response in (response_column_selection, response_rename_columns, response_corrections):
    response.raise_for_status()

In [13]:
# Fetch the JSON files that list which columns to keep from the 'registers',
# 'sections' and 'images' CSV files.
# NOTE: a key that stays None makes pd.read_csv(usecols=None) load every column.
selection_mappings = {
    'usecols_registers': None,
    'usecols_sections': None,
    'usecols_images': None,
    'usecols_remarks': None
}
for file in response_column_selection.json():
    # Only files prefixed with one of the three source names are relevant.
    if file['name'].startswith(('registers_', 'sections_', 'images_')):
        # The mapping key is the part of the file name after the first
        # underscore and before the first following dot.
        key = file['name'].split('_', 1)[1].split('.', 1)[0]
        download = requests.get(file['download_url'], timeout=30)
        # Surface a failed download instead of parsing an error page as JSON.
        download.raise_for_status()
        selection_mappings[key] = download.json()

usecols_registers = selection_mappings['usecols_registers']
usecols_sections = selection_mappings['usecols_sections']
usecols_images = selection_mappings['usecols_images']
usecols_remarks = selection_mappings['usecols_remarks']

In [19]:
# Fetch the JSON files that map the original column names of the 'registers',
# 'sections' and 'images' CSV files to their translated names.
rename_mappings = {
    'rename_registers': None,
    'rename_sections': None,
    'rename_images': None,
    'rename_remarks': None
}

for file in response_rename_columns.json():
    # Only files prefixed with one of the three source names are relevant.
    if file['name'].startswith(('registers_', 'sections_', 'images_')):
        # The mapping key is the part of the file name after the first
        # underscore and before the first following dot.
        key = file['name'].split('_', 1)[1].split('.', 1)[0]
        download = requests.get(file['download_url'], timeout=30)
        # Surface a failed download instead of parsing an error page as JSON.
        download.raise_for_status()
        rename_mappings[key] = download.json()

rename_registers = rename_mappings['rename_registers']
rename_sections = rename_mappings['rename_sections']
rename_images = rename_mappings['rename_images']
rename_remarks = rename_mappings['rename_remarks']

In [78]:
# Look up the download URLs of the correction files for the registers and
# sections dataframes in the directory listing of the corrections folder.
correction_urls = {
    entry['name']: entry['download_url'] for entry in response_corrections.json()
}

# Fail immediately with a clear message when an expected file is absent;
# otherwise the variable would stay undefined (or keep a stale value from an
# earlier run) and only break in a later cell.
expected_corrections = (
    'registers_changeLastScan.csv',
    'sections_changeSectionFirstScanNumber.csv',
    'sections_changeSectionLastScanNumber.csv',
)
missing_corrections = [name for name in expected_corrections if name not in correction_urls]
if missing_corrections:
    raise LookupError(
        f"correction file(s) not found in the repository listing: {missing_corrections}"
    )

corrections_reg = correction_urls['registers_changeLastScan.csv']
corr_sec_first = correction_urls['sections_changeSectionFirstScanNumber.csv']
corr_sec_last = correction_urls['sections_changeSectionLastScanNumber.csv']

## 2. Process registers_totaal.csv

In [80]:
# Build the registers dataframe from registers_totaal.csv: keep only the
# selected columns, translate the column names, and turn the row index into
# a regular column called 'reg_id'.
df_registers = pd.read_csv(
    registers,
    sep=",",
    quotechar='"',
    usecols=usecols_registers,
    header=0,
    encoding="utf-8",
)
df_registers = df_registers.rename(columns=rename_registers)
df_registers = df_registers.reset_index(names="reg_id")

In [82]:
# Load the corrections for inconsistent register values.
# corrections_reg is the GitHub download URL looked up earlier; pandas reads it directly.
df_corr_reg = pd.read_csv(corrections_reg, sep=';', encoding='utf-8')
# 'Unnamed: 0' is dropped before use (presumably a saved index column without a header).
df_corr_reg = df_corr_reg.drop(columns=['Unnamed: 0'])

In [83]:
# uncomment to have a look at the dataframe
# df_corr_reg.head()

In [84]:
# Overwrite last_scan in df_registers with the corrections stored in df_corr_reg.
# The correction columns are addressed by position: position 1 holds the
# corrected last scan and position 2 the reg_id of the register to update.
# Rows are applied in file order, so a later correction for the same reg_id wins.
for corrected_last_scan, target_reg_id in zip(df_corr_reg.iloc[:, 1], df_corr_reg.iloc[:, 2]):
    df_registers.loc[df_registers['reg_id'] == target_reg_id, 'last_scan'] = corrected_last_scan

In [88]:
# Write the corrected registers table to a semicolon-separated CSV file;
# the dataframe index is stored under the column header 'index'.
df_registers.to_csv(
    r"C:\STRO20\registers.csv",
    sep=';',
    quotechar='"',
    index_label='index',
    encoding='utf-8',
)

## 3. Process secties_totaal.csv

In [90]:
# Build the sections dataframe from secties_totaal.csv: keep only the
# selected columns and translate the column names.
df_sections = pd.read_csv(
    sections,
    sep=",",
    quotechar='"',
    usecols=usecols_sections,
    encoding="utf-8",
)
df_sections = df_sections.rename(columns=rename_sections)

In [92]:
# Load the corrections for inconsistent first-scan numbers of sections,
# read directly from the GitHub download URL in corr_sec_first.
df_corr_sec_first = pd.read_csv(corr_sec_first, sep=';', encoding='utf-8')
# 'Unnamed: 0' is dropped before use (presumably a saved index column without a header).
df_corr_sec_first = df_corr_sec_first.drop(columns=['Unnamed: 0'])

In [551]:
# uncomment to have a look at the dataframe
# df_corr_sec_first.head()

In [94]:
# Overwrite section_first_scan in df_sections with the corrections stored in
# df_corr_sec_first. The correction columns are addressed by position:
# position 1 holds the corrected first scan and position 2 the section_id.
# Rows are applied in file order, so a later correction for the same section wins.
for corrected_first_scan, target_section_id in zip(
    df_corr_sec_first.iloc[:, 1], df_corr_sec_first.iloc[:, 2]
):
    df_sections.loc[
        df_sections['section_id'] == target_section_id, 'section_first_scan'
    ] = corrected_first_scan

In [96]:
# Load the corrections for inconsistent last-scan numbers of sections,
# read directly from the GitHub download URL in corr_sec_last.
df_corr_sec_last = pd.read_csv(corr_sec_last, sep=';', encoding='utf-8')
# 'Unnamed: 0' is dropped before use (presumably a saved index column without a header).
df_corr_sec_last = df_corr_sec_last.drop(columns=['Unnamed: 0'])

In [98]:
# uncomment to view the dataframe
# df_corr_sec_last.head()

In [100]:
# Overwrite section_last_scan in df_sections with the corrections stored in
# df_corr_sec_last. The correction columns are addressed by position:
# position 1 holds the corrected last scan and position 2 the section_id.
# Rows are applied in file order, so a later correction for the same section wins.
for corrected_last_scan, target_section_id in zip(
    df_corr_sec_last.iloc[:, 1], df_corr_sec_last.iloc[:, 2]
):
    df_sections.loc[
        df_sections['section_id'] == target_section_id, 'section_last_scan'
    ] = corrected_last_scan

In [102]:
# uncomment to verify results
# df_sections.loc[df_sections['section_id'] == 3307]

In [104]:
# Write the corrected sections table to a semicolon-separated CSV file;
# the dataframe index is stored under the column header 'index'.
df_sections.to_csv(
    r"C:\STRO20\sections.csv",
    sep=';',
    quotechar='"',
    index_label='index',
    encoding='utf-8',
)

## 4. Process images.csv

In [107]:
# Build the images dataframe from images.csv: keep only the selected columns
# and translate the column names.
df_images = pd.read_csv(
    images,
    sep=",",
    quotechar='"',
    usecols=usecols_images,
    header=0,
    encoding="utf-8",
)
df_images = df_images.rename(columns=rename_images)

In [109]:
# Read images.csv a second time, now keeping the columns listed in
# usecols_remarks, and translate their names with rename_remarks.
df_remarks_images = pd.read_csv(
    images,
    sep=",",
    quotechar='"',
    usecols=usecols_remarks,
    encoding="utf-8",
    low_memory=False,
)
df_remarks_images = df_remarks_images.rename(columns=rename_remarks)
df_remarks_images = df_remarks_images.reset_index(drop=True)

In [111]:
# Keep only the rows in which no value is missing.
df_remarks_images = df_remarks_images.dropna()

In [115]:
# df_remarks_images

In [117]:
# Split each filename on '-', '_' and '.' to separate the microfilm number and
# the scan number. Exactly four parts are expected per filename; the first and
# the last part are not used.
filename_parts = df_images['filename'].str.split(r'-|_|\.', expand=True)
filename_parts.columns = ['Sonttolregisters', 'mf_nr', 'scan', 'jpg']

# Store both numbers as integers on the images dataframe.
df_images['scannr'] = filename_parts['scan'].astype(int)
df_images['microfilm_number'] = filename_parts['mf_nr'].astype(int)

In [119]:
# uncomment to verify the results
# df_images.loc[df_images['ce_id'] == 1169737]

## 5. Process images and registers to add reg_id to images

In [121]:
# Left-join the registers onto the images via the microfilm number.
# Each image is paired with every register on the same microfilm, so most of
# these pairs are false matches; they are filtered out in the next step.
df_imgreg = df_images.merge(df_registers, how="left", on="microfilm_number")

In [123]:
# Keep only the image/register pairs where the image's scan number lies within
# the register's scan range (first_scan and last_scan inclusive).
scan_in_register = (
    (df_imgreg['scannr'] >= df_imgreg['first_scan'])
    & (df_imgreg['scannr'] <= df_imgreg['last_scan'])
)
df_imgreg2 = df_imgreg[['ce_id', 'scannr', 'reg_id']].loc[scan_in_register]
# NOTE(review): the index is renumbered 0..n-1 here and later lined up with
# df_images by position, which assumes exactly one surviving row per image.
df_imgreg2 = df_imgreg2.reset_index(drop=True)

In [125]:
# uncomment to verify the results
# df_imgreg2.loc[df_imgreg2['ce_id'] == 1169737]

In [127]:
# Only the reg_id column of the cleaned dataframe is needed.
# It is attached to the images dataframe later by matching on the index.
df_imgreg3 = df_imgreg2.loc[:, ['reg_id']]

## 6. Process images and sections to add section_id to images

In [130]:
# Left-join the sections onto the images via the microfilm number.
# Each image is paired with every section on the same microfilm, so most of
# these pairs are false matches; they are filtered out in the next step.
df_imgsec = df_images.merge(df_sections, how="left", on="microfilm_number")

In [131]:
# Keep only the image/section pairs where the image's scan number lies within
# the section's scan range (section_first_scan and section_last_scan inclusive).
scan_in_section = (
    (df_imgsec['scannr'] <= df_imgsec['section_last_scan'])
    & (df_imgsec['scannr'] >= df_imgsec['section_first_scan'])
)
df_imgsec2 = df_imgsec.loc[scan_in_section]
# When one (ce_id, scannr) pair falls into several sections, keep the first match.
df_imgsec2 = df_imgsec2.drop_duplicates(subset=['ce_id', 'scannr'], keep='first')
# NOTE(review): the index is renumbered 0..n-1 here and later lined up with
# df_images by position, which assumes exactly one surviving row per image.
df_imgsec2 = df_imgsec2.reset_index(drop=True)

In [132]:
# uncomment to verify the results
# df_imgsec2.loc[df_imgsec2['ce_id'] == 1169737]

In [133]:
# Only the section_id column of the cleaned dataframe is needed.
# It is attached to the images dataframe later by matching on the index.
df_imgsec3 = df_imgsec2.loc[:, ['section_id']]

## 7. Concatenate images, registers and sections into one dataframe

In [136]:
# Attach reg_id and section_id to the images table column-wise.
# pd.concat(axis=1) aligns rows on the index, and df_imgreg3 / df_imgsec3 carry
# a renumbered 0..n-1 index. The ids therefore land on the right image only if
# each helper frame holds exactly one row per image, in the same order.
# A differing row count means the scan-range filter dropped or duplicated
# images, which would shift the ids onto the wrong rows without any error.
import warnings

for frame_name, frame in (('df_imgreg3', df_imgreg3), ('df_imgsec3', df_imgsec3)):
    if len(frame) != len(df_images):
        warnings.warn(
            f"{frame_name} has {len(frame)} rows but df_images has {len(df_images)}; "
            "reg_id/section_id may be attached to the wrong images"
        )

df_img = pd.concat([df_images, df_imgreg3, df_imgsec3], axis=1)

In [145]:
# uncomment to review the dataframe
# df_img

## 8. Save dataframes to CSV

In [148]:
# Write the combined images table to a semicolon-separated CSV file;
# the dataframe index is stored under the column header 'index'.
df_img.to_csv(
    r"C:\STRO20\img.csv",
    sep=';',
    quotechar='"',
    index_label='index',
    encoding='utf-8',
)

In [149]:
# Write the remarks table to a semicolon-separated CSV file;
# the dataframe index is stored under the column header 'index'.
df_remarks_images.to_csv(
    r"C:\STRO20\remarks_images.csv",
    sep=';',
    quotechar='"',
    index_label='index',
    encoding='utf-8',
)