In [25]:
import pandas as pd
import csv

In [26]:
# Om te beginnen laden we de gecureerde versie van de WIC-opvarenden database in
opvarenden_curated = pd.read_excel('validated 6.xlsx')

Tijdelijk: we verwijderen gesplitste data ('/') uit de dataset. 

In [27]:
# Filter alle rijen met '/' erin ...
columns_to_check_excluding_role = ['name', 'location', 'ship_name', 'organization', 'captain', 'Bestemming', 'final_creditor_name', 'final_debt_amount_int']
rows_to_remove = opvarenden_curated[opvarenden_curated[columns_to_check_excluding_role].apply(lambda x: x.str.contains('/')).any(axis=1)]

# ... en verwijder die!
opvarenden_curated = opvarenden_curated.drop(rows_to_remove.index)

In [28]:
# Eerst proberen we de namen van schuldeisers te normaliseren aan de hand van een vooraf voorbereide lijst
normalized_creditors = {}
with open('normalized_creditors_gdk.csv', mode='r') as infile:
    reader = csv.reader(infile, delimiter=';')
    next(reader) # header
    for row in reader:
        if row[1].strip():  # Check  of row[1] niet leeg is
            normalized_creditors[row[0]] = row[1]

opvarenden_curated['normalized_creditor_name'] = opvarenden_curated['final_creditor_name'].map(normalized_creditors)
opvarenden_curated['normalized_creditor_name'] = opvarenden_curated['normalized_creditor_name'].fillna(opvarenden_curated['final_creditor_name'])

In [29]:
# Nu hetzelfde voor schepen
normalized_ships = {}
with open('normalized_ships_gdk.csv', mode='r') as infile:
    reader = csv.reader(infile, delimiter=';')
    next(reader) # header
    for row in reader:
        normalized_ships[row[0]] = row[1]

opvarenden_curated['normalized_ship_name'] = opvarenden_curated['ship_name'].map(normalized_ships)
opvarenden_curated['normalized_ship_name'] = opvarenden_curated['normalized_ship_name'].fillna(opvarenden_curated['ship_name'])

In [30]:
# Nu laden we de locaties in 
temp_locations = pd.read_excel('normalized_locations_gdk.xlsx')

# Eerst maken we een lijst met alle unieke locaties
unique_locations = temp_locations[['place_standardized', 'country_code', 'geonames_uri', 'latitude', 'longitude']].drop_duplicates()

# er zitten nog wat NaNs in
unique_locations = unique_locations.dropna(subset=['place_standardized', 'geonames_uri'])

# We gebruiken betere namen voor de kolommen
Locations = unique_locations.rename(columns={
    'place_standardized': 'label',
    'country_code': 'country',
    'geonames_uri': 'geonames_uri',
    'latitude': 'latitude',
    'longitude': 'longitude'
})

# We geven iedere locatie een id
Locations.insert(0, 'location_id', range(1, 1 + len(Locations)))

# Dict om (label, geonames_uri) naar location_id te mappen
location_to_index = { (row['label'], row['geonames_uri']): row['location_id'] for _, row in Locations.iterrows() }

# wic_location geeft een link naar de locatie 
temp_locations['wic_location'] = temp_locations.apply(lambda row: location_to_index.get((row['place_standardized'], row['geonames_uri']), None) if pd.notna(row['place_standardized']) and pd.notna(row['geonames_uri']) else None, axis=1)

# Tijdelijk? Voor locaties zonder Geonames koppeling, maar wel een country_code
new_locations = pd.DataFrame([
    {'location_id': 2000, 'label': 'Frankrijk (land)', 'country': 'FR', 'geonames_uri': 'http://sws.geonames.org/3017382/', 'latitude': 46.603354, 'longitude': 1.888334},
    {'location_id': 2001, 'label': 'Ierland (land)', 'country': 'IE', 'geonames_uri': 'http://sws.geonames.org/2963597/', 'latitude': 53.41291, 'longitude': -8.24389},
    {'location_id': 2002, 'label': 'Groot-Brittannië (land)', 'country': 'GB', 'geonames_uri': 'http://sws.geonames.org/2635167/', 'latitude': 54.7023545, 'longitude': -3.2765753},
    {'location_id': 2003, 'label': 'Nederland (land)', 'country': 'NL', 'geonames_uri': 'http://sws.geonames.org/2750405/', 'latitude': 52.5, 'longitude': 5.75},
    {'location_id': 2004, 'label': 'België (land)', 'country': 'BE', 'geonames_uri': 'http://sws.geonames.org/2802361/', 'latitude': 50.5, 'longitude': 4.5},
    {'location_id': 2005, 'label': 'Noorwegen (land)', 'country': 'NO', 'geonames_uri': 'http://sws.geonames.org/3144096/', 'latitude': 60.472024, 'longitude': 8.468946},
    {'location_id': 2006, 'label': 'Zweden (land)', 'country': 'SE', 'geonames_uri': 'http://sws.geonames.org/2661886/', 'latitude': 60.128161, 'longitude': 18.643501},
    {'location_id': 2007, 'label': 'Duitsland (land)', 'country': 'DE', 'geonames_uri': 'http://sws.geonames.org/2921044/', 'latitude': 51.0834196, 'longitude': 10.4234469},
    {'location_id': 2008, 'label': 'Denemarken (land)', 'country': 'DK', 'geonames_uri': 'http://sws.geonames.org/2623032/', 'latitude': 56.26392, 'longitude': 9.501785},
    {'location_id': 2009, 'label': 'Finland (land)', 'country': 'FI', 'geonames_uri': 'http://sws.geonames.org/660013/', 'latitude': 61.92411, 'longitude': 25.748151},
])
Locations = pd.concat([Locations, new_locations], ignore_index=True)

# zorg dat wic_locatie een integer is
temp_locations['wic_location'] = temp_locations['wic_location'].astype('Int64')

# Reset index
Locations = Locations.reset_index(drop=True)

In [31]:
# Nu gaan we aparte tabellen maken voor entiteiten, te beginnen met personen

In [32]:
# We maken een df voor Personen en voegen notaris Henrick Schaeff handmatig toe
Persons = pd.DataFrame(columns=['person_id', 'name', 'uri', 'role', 'location_string', 'location_uri'])
henrick = pd.DataFrame({
    'person_id': [1],
    'name': ['Henrick Schaeff'],
    'uri': [''],
    'role': ['notary'],
    'location_string': [''],
    'location_uri': ['']
})
Persons = pd.concat([Persons, henrick], ignore_index=True)

# df voor akten
Deeds = pd.DataFrame(columns=['deed_id', 'deed_uri', 'notary_id', 'deed_date'])

# df voor transacties
Transactions = pd.DataFrame(columns=['deed_id', 'sailor_id', 'creditor_id', 'final_debt_amount_int'])

# We voeden bovenstaande dfs met data uit de ingeladen WIC-opvarenden database
person_id_counter = Persons['person_id'].max() + 1
deed_id_counter = 1

for index, row in opvarenden_curated.iterrows():
    sailors = row['name'].split('/')
    sailor_uri = row['sailor_uri'] if pd.notna(row['sailor_uri']) else ''
    creditors = str(row['normalized_creditor_name']).split('/') if pd.notna(row['normalized_creditor_name']) else []
    deed_uri = row['deed_uri'] if pd.notna(row['deed_uri']) else ''
    final_debt_amount_int = row['final_debt_amount_int']
    deed_date = row['correct_deed_date'] if pd.notna(row['correct_deed_date']) else row['deed_date']
    location_string = row['location'] if pd.notna(row['location']) else ''
    location_uri = row['location_uri'] if pd.notna(row['location_uri']) else ''
    
    # Sailors gaan altijd naar Persons (geen check of ze dubbel zijn)
    for sailor in sailors:
        person = pd.DataFrame({
            'person_id': [person_id_counter],
            'name': [sailor],
            'uri': [sailor_uri if len(sailors) == 1 else ''],
            'role': ['sailor'],
            'location_string': [location_string],
            'location_uri': [location_uri]
        })
        Persons = pd.concat([Persons, person], ignore_index=True)
        sailor_id = person_id_counter
        person_id_counter += 1
    
    # Voor schuldeisers (die heel vaak voorkomen en gestandaardiseerde namen hebben) wel een check
    for creditor in creditors:
        existing_creditor = Persons[(Persons['name'] == creditor) & (Persons['role'] == 'creditor')]
        if existing_creditor.empty:
            person = pd.DataFrame({
                'person_id': [person_id_counter],
                'name': [creditor],
                'uri': [''],
                'role': ['creditor'],
                'location_string': [''],
                'location_uri': ['']
            })
            Persons = pd.concat([Persons, person], ignore_index=True)
            creditor_id = person_id_counter
            person_id_counter += 1
        else:
            creditor_id = existing_creditor['person_id'].values[0]
    
    # Voeg de akte toe aan Deeds
    deed = pd.DataFrame({
        'deed_id': [deed_id_counter],
        'deed_uri': [deed_uri],
        'notary_id': [1],  # Henrick Schaeff is 1
        'deed_date': [deed_date]
    })
    Deeds = pd.concat([Deeds, deed], ignore_index=True)
    deed_id = deed_id_counter
    deed_id_counter += 1
    
    # Transacties tussen opvarenden en schuldeisers gaan naar Transactions
    for sailor in sailors:
        for creditor in creditors:
            transaction = pd.DataFrame({
                'deed_id': [deed_id],
                'sailor_id': [sailor_id],
                'creditor_id': [creditor_id],
                'final_debt_amount_int': [final_debt_amount_int]
            })
            Transactions = pd.concat([Transactions, transaction], ignore_index=True)

In [33]:
# Voeg de locaties toe aan de Persons DataFrame
Persons['location_standardized'] = None

# Dict om deed_id naar deed_uri te mappen voor snelle toegang
deed_id_to_uri = Deeds.set_index('deed_id')['deed_uri'].to_dict()

# We itereren over iedere persoon in Persons ...
for idx, person in Persons[Persons['role'] == 'sailor'].iterrows():
    sailor_id = person['person_id']
    # ... zoeken naar de gekoppelde transacties ...
    sailor_transactions = Transactions[Transactions['sailor_id'] == sailor_id]
    for _, transaction in sailor_transactions.iterrows():
        deed_id = transaction['deed_id']
        deed_uri = deed_id_to_uri.get(deed_id, None)
        if deed_uri:
            # ... en zoeken naar de locatie van de schuldeiser
            temp_location_matches = temp_locations[(temp_locations['deed_uri'] == deed_uri) & (temp_locations['name'] == person['name'])]
            if not temp_location_matches.empty:
                # als er meerdere matches zijn, kies de eerste
                wic_location = temp_location_matches.iloc[0]['wic_location']
                country_code = temp_location_matches.iloc[0]['country_code']
                if pd.notna(wic_location):
                    Persons.at[idx, 'location_standardized'] = wic_location
                    break # we hebben een match gevonden, dus we kunnen stoppen

                # Tijdelijke (?) oplossing voor locaties zonder Geonames koppeling
                if pd.isna(wic_location) & pd.notna(country_code):
                    if country_code == 'FR':
                        Persons.at[idx, 'location_standardized'] = 2000
                    elif country_code == 'IE':
                        Persons.at[idx, 'location_standardized'] = 2001
                    elif country_code == 'GB':
                        Persons.at[idx, 'location_standardized'] = 2002
                    elif country_code == 'NL':
                        Persons.at[idx, 'location_standardized'] = 2003
                    elif country_code == 'BE':
                        Persons.at[idx, 'location_standardized'] = 2004
                    elif country_code == 'NO':
                        Persons.at[idx, 'location_standardized'] = 2005
                    elif country_code == 'SE':
                        Persons.at[idx, 'location_standardized'] = 2006
                    elif country_code == 'DE':
                        Persons.at[idx, 'location_standardized'] = 2007
                    elif country_code == 'DK':
                        Persons.at[idx, 'location_standardized'] = 2008
                    elif country_code == 'FI':
                        Persons.at[idx, 'location_standardized'] = 2009

In [34]:
# Nu gaan we individuele reizen van schepen in een aparte tabel zetten

In [35]:
# We werken met oude datums, dus deed date moet een string zijn
opvarenden_curated['deed_date'] = opvarenden_curated['deed_date'].astype(str)

# Initialiseer counter voor Voyages
global_voyage_id = 0

# Functie om afzonderlijke reizen te detecteren
def detect_voyages(group):
    global global_voyage_id
    group = group.sort_values('deed_date').reset_index(drop=True)
    if pd.notna(group['normalized_ship_name'].iloc[0]):  # Is er een genormaliseerde schipnaam?
        group['deed_date_period'] = group['deed_date'].apply(lambda x: pd.Period(x, freq='D'))
        group['voyage_id'] = group['deed_date_period'].diff().apply(lambda x: x.n > 180 if pd.notna(x) else False).cumsum() # meer dan 180 dagen verschil? nieuwe reis!
        group['voyage_id'] += global_voyage_id
        global_voyage_id = group['voyage_id'].max() + 1  # id ophogen
    else:
        group['voyage_id'] = pd.NA  # NA als er geen genormaliseerde schipnaam is
    return group

# We voeren deze functie uit voor iedere combinatie van schip en organisatie
opvarenden_curated_updated = opvarenden_curated.groupby(['normalized_ship_name', 'organization']).apply(detect_voyages).reset_index(drop=True)

# Aanmaken Voyages df
Voyages = opvarenden_curated_updated.groupby('voyage_id').agg(
    ship_name=('normalized_ship_name', 'first'),
    organization=('organization', 'first'),
    first_deed_date=('deed_date', 'first'),
    last_deed_date=('deed_date', 'last')
).reset_index()

# Updaten opvarenden_curated met nieuwe informatie
opvarenden_curated = opvarenden_curated_updated


  opvarenden_curated_updated = opvarenden_curated.groupby(['normalized_ship_name', 'organization']).apply(detect_voyages).reset_index(drop=True)


In [36]:
# Nu voegen we de Voyage_ids to aan Transactions
Transactions['voyage_id'] = None

# Dict om deed_id naar uri te mappen voor snelle lookup
deed_id_to_uri = pd.Series(Deeds.deed_uri.values, index=Deeds.deed_id).to_dict()

# Dict omt uri naar voayge id te mappen voor snelle lookup
deed_uri_to_voyage_id = pd.Series(opvarenden_curated_updated.voyage_id.values, index=opvarenden_curated_updated.deed_uri).to_dict()

# Update voyage_id in Transactions
for index, row in Transactions.iterrows():
    deed_id = row['deed_id']
    if deed_id in deed_id_to_uri:
        deed_uri = deed_id_to_uri[deed_id]
        if deed_uri in deed_uri_to_voyage_id:
            Transactions.at[index, 'voyage_id'] = deed_uri_to_voyage_id[deed_uri]

In [37]:
# Zorg dat location_standardized een integer is
Persons['location_standardized'] = Persons['location_standardized'].astype('Int64')

Nu hebben we alle afzonderlijke dataframes. Tijd om een SQLite-database te maken!

In [38]:
import sqlite3

conn = sqlite3.connect('wic-opvarenden.db')
cursor = conn.cursor()

# Drop tables als ze al bestaan
cursor.execute('DROP TABLE IF EXISTS Deeds')
cursor.execute('DROP TABLE IF EXISTS Locations')
cursor.execute('DROP TABLE IF EXISTS Persons')
cursor.execute('DROP TABLE IF EXISTS Transactions')
cursor.execute('DROP TABLE IF EXISTS Voyages')

# Maak benodigde tables aan (inclusief foreign key-relaties)
cursor.execute('''
    CREATE TABLE IF NOT EXISTS Deeds (
        deed_id INTEGER PRIMARY KEY,
        deed_uri TEXT,
        notary_id INTEGER,
        deed_date TEXT,
        FOREIGN KEY (notary_id) REFERENCES Persons(person_id)
    )
''')

cursor.execute('''
    CREATE TABLE IF NOT EXISTS Locations (
        location_id INTEGER PRIMARY KEY,
        label TEXT,
        country TEXT,
        geonames_uri TEXT,
        latitude REAL,
        longitude REAL
    )
''')

cursor.execute('''
    CREATE TABLE IF NOT EXISTS Persons (
        person_id INTEGER PRIMARY KEY,
        name TEXT,
        uri TEXT,
        role TEXT,
        location_string TEXT,
        location_uri TEXT,
        location_standardized INTEGER,
        FOREIGN KEY (location_standardized) REFERENCES Locations(location_id)
    )
''')

cursor.execute('''
    CREATE TABLE IF NOT EXISTS Transactions (
        deed_id INTEGER,
        sailor_id INTEGER,
        creditor_id INTEGER,
        final_debt_amount_int INTEGER,
        voyage_id INTEGER,
        FOREIGN KEY (deed_id) REFERENCES Deeds(deed_id),
        FOREIGN KEY (sailor_id) REFERENCES Persons(person_id),
        FOREIGN KEY (creditor_id) REFERENCES Persons(person_id),
        FOREIGN KEY (voyage_id) REFERENCES Voyages(voyage_id)
    )
''')

cursor.execute('''
    CREATE TABLE IF NOT EXISTS Voyages (
        voyage_id INTEGER PRIMARY KEY,
        ship_name TEXT,
        organization TEXT,
        first_deed_date TEXT,
        last_deed_date TEXT
    )
''')

# Dat uit df inlezen in de database
Deeds.to_sql('Deeds', conn, if_exists='append', index=False)
Locations.to_sql('Locations', conn, if_exists='append', index=False)
Persons.to_sql('Persons', conn, if_exists='append', index=False)
Transactions.to_sql('Transactions', conn, if_exists='append', index=False)
Voyages.to_sql('Voyages', conn, if_exists='append', index=False)

# Commit en close
conn.commit()
conn.close()