In [2]:
# Import necessary modules
import pandas as pd
import numpy as np
import difflib

def within_range(checknumber, range_start, range_end):
    """" 
    Function to check if a BGB booking year falls within the timespan
    in DAS referred to in the introduction to this script
    
    """
    if (checknumber >= range_start) & (checknumber <= (range_end + 20)):
        return True
    else:
        return False

In [3]:
# First, load the entire Zeemonsterrollen database
gzm = pd.ExcelFile('zeemonsterrollen.xlsx')

# Load the match list between BGB and DAS
bgbmatch = pd.ExcelFile('Matching_results.xlsx')

In [4]:
# Parse the Excel sheets we will be using
zeemons = gzm.parse('Database GZM (MvR 2014)')
zeemons = zeemons.set_index('ID')

bgbnomatch = bgbmatch.parse('No match')

In [5]:
# Loop over all entries without a corresponding DAS Ship ID

gzm_ships = []

for entry in zeemons[zeemons['DAS SHIP ID'].isna()].index:
    jaar = zeemons.loc[entry, 'JAAR']
    shipname = zeemons.loc[entry, 'NAAM SCHIP (GESTANDAARDISEERD)']
    this_entry = [jaar, shipname]
    gzm_ships.append(this_entry)


fulldata = pd.DataFrame.from_records(gzm_ships, columns=['Year', 'Shipname'])

In [6]:
fulldata

Unnamed: 0,Year,Shipname
0,1691,SINT NICOLAAS
1,1691,GRIJPVOGEL
2,1691,BATAVIA
3,1691,STANDVASTIGHEID
4,1691,WIJK OP ZEE
...,...,...
1707,1790,TRITON
1708,1790,AFRIKAAN
1709,1791,ORANJEBOOM
1710,1791,CORNELIA ADRIANA


In [7]:
summary = fulldata.groupby('Shipname')['Year'].agg({'min', 'max'}).reset_index()
summary['Shipname'] = summary['Shipname'].str.lower()
summary

Unnamed: 0,Shipname,min,max
0,aardenhout,1696,1706
1,achilles,1714,1719
2,adam,1712,1719
3,adam en eva,1718,1718
4,adriana,1736,1754
...,...,...,...
398,zwaardvis,1697,1708
399,zwarte arend,1696,1696
400,zwerver,1732,1741
401,[eiland edam],1703,1703


In [8]:
gzm_shiplist = list(summary['Shipname'].unique())
gzm_shiplist.remove('[onbekend]')

In [9]:
bgbnomatch['BGB ship name'] = bgbnomatch['BGB ship name'].str.lower()

In [10]:
bgbnomatch

Unnamed: 0.1,Unnamed: 0,BGB Shipvoyage ID,BGB Voyage ID,BGB ship ID,BGB ship name,BGB Booking year
0,0,139,99448,3150,kleine pallas,1790
1,1,145,99454,3153,langmoedigheid,1790
2,2,149,99458,3154,vredelief,1790
3,3,151,99460,3155,wilhelmina,1790
4,4,153,99462,3156,jonge wilhelmina,1790
...,...,...,...,...,...,...
3983,3983,20338,116787,3412,onbekend,1731
3984,3984,20339,117012,3412,onbekend,1723
3985,3985,20340,117815,3412,onbekend,1729
3986,3986,20341,117832,3412,onbekend,1729


In [11]:
counter = 0

for entry in bgbnomatch.index:
    bgbyear = int(bgbnomatch.loc[entry, 'BGB Booking year'])
    bgb_shipname = bgbnomatch.loc[entry, 'BGB ship name']
    checking = difflib.get_close_matches(bgb_shipname, gzm_shiplist, n=3, cutoff=0.85)
    
    if checking:
        try:
            minyear_gzb = int(summary.loc[summary['Shipname'] == bgb_shipname]['min'])
        except:
            minyear_gzb = 0
        
        try:
            maxyear_gzb = int(summary.loc[summary['Shipname'] == bgb_shipname]['max'])
        except:
            maxyear_gzb = 0

        
        if within_range(bgbyear, minyear_gzb, maxyear_gzb):
            counter += 1
            print(counter)
            print(bgb_shipname)
            print([checking[0]])
            print(bgbyear)
            print(minyear_gzb, maxyear_gzb)
            print('-------------')

1
kleine pallas
['kleine pallas']
1790
1779 1790
-------------
2
cornelia adriana
['cornelia adriana']
1780
1778 1791
-------------
3
cornelia adriana
['cornelia adriana']
1780
1778 1791
-------------
4
kleine pallas
['kleine pallas']
1780
1779 1790
-------------
5
kleine pallas
['kleine pallas']
1790
1779 1790
-------------
6
rembang
['rembang']
1790
1714 1789
-------------
7
kleine pallas
['kleine pallas']
1787
1779 1790
-------------
8
kleine pallas
['kleine pallas']
1787
1779 1790
-------------
9
kleine pallas
['kleine pallas']
1787
1779 1790
-------------
10
charlotta christina
['charlotte christina']
0
0 0
-------------
11
cornelia adriana
['cornelia adriana']
1780
1778 1791
-------------
12
kleine pallas
['kleine pallas']
1780
1779 1790
-------------
13
rembang
['rembang']
1787
1714 1789
-------------
14
cornelia adriana
['cornelia adriana']
1787
1778 1791
-------------
15
kleine pallas
['kleine pallas']
1787
1779 1790
-------------
16
kleine pallas
['kleine pallas']
1787
1779 1

133
rijder
['rijder']
1761
1758 1760
-------------
134
schildpad
['schildpad']
1761
1760 1760
-------------
135
rijder
['rijder']
1760
1758 1760
-------------
136
schildpad
['schildpad']
1760
1760 1760
-------------
137
anthonia dorothea
['anthonia dorothea']
1760
1745 1746
-------------
138
rijder
['rijder']
1761
1758 1760
-------------
139
wereld
['wereld']
1760
1751 1758
-------------
140
wereld
['wereld']
1757
1751 1758
-------------
141
schildpad
['schildpad']
1761
1760 1760
-------------
142
postiljon
['postiljon']
1761
1755 1782
-------------
143
aurora
['aurora']
1757
1756 1756
-------------
144
schildpad
['schildpad']
1761
1760 1760
-------------
145
rijder
['rijder']
1761
1758 1760
-------------
146
anthonia dorothea
['anthonia dorothea']
1761
1745 1746
-------------
147
wereld
['wereld']
1757
1751 1758
-------------
148
aletta adriana
['aletta adriana']
1761
1754 1754
-------------
149
anthonia dorothea
['anthonia dorothea']
1761
1745 1746
-------------
150
rijder
['rijder']

1703
1698 1755
-------------
308
pepertuin
['pepertuin']
1703
1693 1746
-------------
309
pepertuin
['pepertuin']
1703
1693 1746
-------------
310
onbeschaamdheid
['onbeschaamdheid']
1739
1733 1733
-------------
311
geertruida maria
['geertruida maria']
1738
1731 1752
-------------
312
sousang
['sousang']
1738
1712 1723
-------------
313
bouro
['bouro']
1738
1712 1719
-------------
314
jacob willem
['jacob willem']
1738
1736 1741
-------------
315
snip
['snip']
1732
1729 1736
-------------
316
jacob willem
['jacob willem']
1739
1736 1741
-------------
317
jonge dirk
['jonge dirk']
1739
1739 1739
-------------
318
lamatjang
['lamatjang']
1739
1739 1739
-------------
319
verlangen
['verlangen']
1739
1735 1736
-------------
320
sousang
['sousang']
1735
1712 1723
-------------
321
goudmijn
['goudmijn']
1735
1734 1734
-------------
322
snip
['snip']
1732
1729 1736
-------------
323
beschermer
['beschermer']
1732
1709 1734
-------------
324
beschermer
['beschermer']
1732
1709 1734
----------

461
waijer
['waijer']
1704
1697 1714
-------------
462
doradus
['doradus']
1704
1696 1697
-------------
463
herderin
['herderin']
1704
1698 1755
-------------
464
beschermer
['beschermer']
1726
1709 1734
-------------
465
beschermer
['beschermer']
1726
1709 1734
-------------
466
tienhoven
['tienhoven']
1726
1723 1729
-------------
467
tienhoven
['tienhoven']
1726
1723 1729
-------------
468
haai
['haai']
1726
1722 1722
-------------
469
neira
['neira']
1703
1701 1711
-------------
470
aardenhout
['aardenhout']
1703
1696 1706
-------------
471
aardenhout
['aardenhout']
1703
1696 1706
-------------
472
andromeda
['andromeda']
1707
1694 1710
-------------
473
pepertuin
['pepertuin']
1707
1693 1746
-------------
474
pepertuin
['pepertuin']
1707
1693 1746
-------------
475
neira
['neira']
1707
1701 1711
-------------
476
cheribon
['cheribon']
1707
1696 1717
-------------
477
waijer
['waijer']
1707
1697 1714
-------------
478
andromeda
['andromeda']
1707
1694 1710
-------------
479
andromed

623
colombo
['colombo']
1723
1717 1762
-------------
624
colombo
['colombo']
1723
1717 1762
-------------
625
cananoor
['cananoor']
1723
1719 1723
-------------
626
dijkveld
['dijkveld']
1723
1705 1705
-------------
627
tienhoven
['tienhoven']
1723
1723 1729
-------------
628
uitgang
['uitgang']
1723
1720 1721
-------------
629
bouro
['bouro']
1723
1712 1719
-------------
630
bouro
['bouro']
1723
1712 1719
-------------
631
uitgang
['uitgang']
1723
1720 1721
-------------
632
langerak
['langerak']
1723
1713 1735
-------------
633
colombo
['colombo']
1729
1717 1762
-------------
634
snip
['snip']
1729
1729 1736
-------------
635
langerak
['langerak']
1729
1713 1735
-------------
636
olijftak
['olijftak']
1729
1728 1729
-------------
637
bergwerker
['bergwerker']
1729
1728 1729
-------------
638
beschermer
['beschermer']
1729
1709 1734
-------------
639
beschermer
['beschermer']
1729
1709 1734
-------------
640
beschermer
['beschermer']
1729
1709 1734
-------------
641
beschermer
['besch

In [12]:
gzm_shiplist

['aardenhout',
 'achilles',
 'adam',
 'adam en eva',
 'adriana',
 'afrika',
 'afrikaan',
 'agnieta',
 'aletta adriana',
 'allerlande',
 'ameij',
 'amstelstroom',
 'andromeda',
 'anna',
 'anthonia',
 'anthonia dorothea',
 'appelboom',
 'arend',
 'aurora',
 'awers',
 'bagger',
 'bagger; platvis; steenbrasem',
 'bakker',
 'baloebaroe',
 'banda',
 'banka',
 'bantam',
 'barcas',
 'baros',
 'basra',
 'bastaard ',
 'batavia',
 'batavier',
 'beieren',
 'bennebroek',
 'bergwerker',
 'beschermer',
 'beverwaart',
 'beverwijk',
 'biliton',
 'blauwe berg',
 'blik',
 'bliksem',
 'boekenrode',
 'bombardier',
 'bombaria',
 'bonij',
 'boot',
 'bouro',
 'brak',
 'brandenburg',
 'breedte',
 'bruinvis',
 'buis',
 'burg',
 'cacop',
 'cacop; klipvis; witvis',
 'cadirij',
 'calpetij',
 'cananoor',
 'casuaris',
 'catharina',
 'ceilon',
 'ceram',
 'charlotte christina',
 'cheribon',
 'cicero',
 'cochin',
 'colombo',
 'constantia',
 'cornelia',
 'cornelia adriana',
 'cornelia hillegonda',
 'cornelia jacoba',
 '

In [22]:
summary

Unnamed: 0,Shipname,min,max
0,aardenhout,1696,1706
1,achilles,1714,1719
2,adam,1712,1719
3,adam en eva,1718,1718
4,adriana,1736,1754
...,...,...,...
398,zwaardvis,1697,1708
399,zwarte arend,1696,1696
400,zwerver,1732,1741
401,[eiland edam],1703,1703


In [26]:
summary[((summary['max'] - summary['min']) > 40)]

Unnamed: 0,Shipname,min,max
17,arend,1724,1783
61,catharina,1742,1786
68,colombo,1717,1762
69,constantia,1727,1791
86,draak,1696,1769
132,haas,1697,1753
133,haasje,1699,1740
142,herderin,1698,1755
150,hoop,1699,1790
172,johanna,1715,1782


In [27]:
summary[((summary['max'] - summary['min']) < 40)]

Unnamed: 0,Shipname,min,max
0,aardenhout,1696,1706
1,achilles,1714,1719
2,adam,1712,1719
3,adam en eva,1718,1718
4,adriana,1736,1754
...,...,...,...
397,zuster,1736,1736
398,zwaardvis,1697,1708
399,zwarte arend,1696,1696
400,zwerver,1732,1741


In [34]:
GLOB_ships = summary[((summary['max'] - summary['min']) < 40)]

In [35]:
GLOB_ships

Unnamed: 0,Shipname,min,max
0,aardenhout,1696,1706
1,achilles,1714,1719
2,adam,1712,1719
3,adam en eva,1718,1718
4,adriana,1736,1754
...,...,...,...
397,zuster,1736,1736
398,zwaardvis,1697,1708
399,zwarte arend,1696,1696
400,zwerver,1732,1741


In [36]:
GLOB_ships.insert(0, 'GloB_shipID', range(1857, 1857 + len(GLOB_ships)))

In [37]:
GLOB_ships

Unnamed: 0,GloB_shipID,Shipname,min,max
0,1857,aardenhout,1696,1706
1,1858,achilles,1714,1719
2,1859,adam,1712,1719
3,1860,adam en eva,1718,1718
4,1861,adriana,1736,1754
...,...,...,...,...
397,2236,zuster,1736,1736
398,2237,zwaardvis,1697,1708
399,2238,zwarte arend,1696,1696
400,2239,zwerver,1732,1741


In [39]:
GLOB_ships['GloB_shipID'] = 'GLOB_ship' + GLOB_ships['GloB_shipID'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GLOB_ships['GloB_shipID'] = 'GLOB_ship' + GLOB_ships['GloB_shipID'].astype(str)


In [40]:
GLOB_ships

Unnamed: 0,GloB_shipID,Shipname,min,max
0,GLOB_ship1857,aardenhout,1696,1706
1,GLOB_ship1858,achilles,1714,1719
2,GLOB_ship1859,adam,1712,1719
3,GLOB_ship1860,adam en eva,1718,1718
4,GLOB_ship1861,adriana,1736,1754
...,...,...,...,...
397,GLOB_ship2236,zuster,1736,1736
398,GLOB_ship2237,zwaardvis,1697,1708
399,GLOB_ship2238,zwarte arend,1696,1696
400,GLOB_ship2239,zwerver,1732,1741


In [52]:
GLOB_ships.to_excel('Glob_ships.xlsx')

In [45]:
das = pd.ExcelFile('das.xlsx')
das_ships = das.parse('shipNameVariant')
das_ships = das_ships.set_index('shipNameVariantID')
das_voyages = das.parse('das_voyage')
das_voyages = das_voyages.set_index('voyId')

In [46]:
das_ships

Unnamed: 0_level_0,shipID,shipNameVariant,shipNameVariantRemark
shipNameVariantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DAS_snv0001,DAS_ship0001,'s Heer Arendskerke,
DAS_snv0002,DAS_ship0002,'s Lands Welvaren,
DAS_snv0003,DAS_ship0003,'s-Graveland,
DAS_snv0004,DAS_ship0004,'s-Graveland,
DAS_snv0005,DAS_ship0005,'s-Gravenhage,
...,...,...,...
DAS_snv1893,DAS_ship1852,Schaapherder,
DAS_snv1894,DAS_ship1853,Senhor De Bonfim E Sancta Maria,
DAS_snv1895,DAS_ship1854,Toevalligheid,
DAS_snv1896,DAS_ship1855,Batavier,


In [47]:
das_voyages

Unnamed: 0_level_0,url,voyNumberDAS,heenreis,terugreis,shipID,shipName,voyMasterID,voyMasterRemark,voyChamberID,voyDepartureEDTF,...,voyCapeDepartureEDTF_remark,voyArrivalDateEDTF,voyArrivalDateEDTF_remark,voyArrivalPlaceID,voyInvoiceValue,voyChamber2ID,voyParticulars,voyCorrespondingNumber,voyRGPDeel,voymaster_VOCOPVid
voyId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
91055,http://resources.huygens.knaw.nl/das/detailVoy...,0001.1,1,0,DAS_ship0054,DAS_snv0056,das_mas2328,,,1595-04-02,...,,1596-06-06,,DAS_pl024,,,Equipped (0001-0004) by the Compagnie van Verr...,,166,
91056,http://resources.huygens.knaw.nl/das/detailVoy...,0002.1,1,0,DAS_ship0337,DAS_snv0348,das_mas1789,,,1595-04-02,...,,1596-06-06,,DAS_pl024,,,Mau became master of the HOLLANDIA on 26-10-15...,5001,166,
91057,http://resources.huygens.knaw.nl/das/detailVoy...,0003.1,1,0,DAS_ship0624,DAS_snv0643,das_mas1589,,,1595-04-02,...,,1596-06-06,,DAS_pl024,,,Jan Dignumsz. died on 29-05-1595 and Mau was h...,5002,166,
91058,http://resources.huygens.knaw.nl/das/detailVoy...,0004.1,1,0,DAS_ship0953,DAS_snv0981,das_mas1868,,,1595-04-02,...,,1596-06-06,,DAS_pl024,,,Jan Jansz. died on 25-12-1596 and Hendrik Jans...,5003,166,
91059,http://resources.huygens.knaw.nl/das/detailVoy...,0005.1,1,0,DAS_ship0857,DAS_snv0882,das_mas2641,,,1598-03-25,...,,1599-03,,DAS_pl006,,,Fleet (0005-0007) equipped by the Compagnie Te...,5010,166,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99180,http://resources.huygens.knaw.nl/das/detailVoy...,8397.1,0,1,DAS_ship1809,DAS_snv1850,das_mas2139,,DAS_cham006,1795,...,,1795-09-18,,DAS_pl068,365.373,DAS_cham006,,4772,167,
99181,http://resources.huygens.knaw.nl/das/detailVoy...,8398.2,0,1,DAS_ship0967,DAS_snv0995,das_mas1999,,DAS_cham001,1795,...,,,,,,,Date of arrival at the Cape unknown. On 18-05-...,4674,167,
99182,http://resources.huygens.knaw.nl/das/detailVoy...,8399.1,0,1,DAS_ship0739,DAS_snv0758,das_mas1560,,DAS_cham006,1795,...,,,,,50.232,DAS_cham006,Almost all data concerning this homeward-bound...,4774,167,
99183,http://resources.huygens.knaw.nl/das/detailVoy...,8400.1,0,1,DAS_ship0915,DAS_snv0942,das_mas2665,,DAS_cham001,1795,...,,,,,85.711,DAS_cham001,Almost all data concerning this homeward-bound...,4777,167,


In [48]:
# Now I want a list of individual ships from DAS and the years between which they were employed
# Converting the date to datetime is not possible, since many voyages took place before 1677 (out of bounds)
# This means we lose vectorization advantages anyway, so I'll generate a dataframe using a Python loop 

# Create an empty list to hold the data on ships and dates
das_ship_dates = []

# Populate the list with data from DAS
for voyage in das_voyages.index:
    current_ship_id = das_voyages.loc[voyage, 'shipID']
    current_ship_name_id = das_voyages.loc[voyage, 'shipName']
    current_ship_departure = das_voyages.loc[voyage, 'voyDepartureEDTF']
    current_ship_arrival = das_voyages.loc[voyage, 'voyArrivalDateEDTF']
    current_ship_name = das_ships.loc[current_ship_name_id, 'shipNameVariant']
    
    # Convert dates to 4 digits (datetime not possible w/o workarounds)
    current_ship_departure = str(current_ship_departure)
    current_ship_departure = current_ship_departure[:4]
    current_ship_arrival = str(current_ship_arrival)
    current_ship_arrival = current_ship_arrival[:4]

    # Construct a list with data on this voyage
    this_voyage = (voyage, current_ship_name, current_ship_id, current_ship_name_id, current_ship_departure, current_ship_arrival)
    
    # Append that list to the aforementioned list (of lists)
    das_ship_dates.append(this_voyage)
    
# Create a Pandas dataframe from the list of lists
fulldata = pd.DataFrame.from_records(das_ship_dates, columns=['DasID', 'Shipname', 'DasShipID', 'DasShipNameVariant', 'Startyear', 'Endyear'])
fulldata = fulldata.set_index('DasID')

# Convert the yearcolumns to numeric values (they include some messed up data)
fulldata['Startyear'] = pd.to_numeric(fulldata['Startyear'], errors='coerce')
fulldata['Endyear'] = pd.to_numeric(fulldata['Endyear'], errors='coerce')

# Now we can drop NaNs (error coercion made these NaN)
# And subsequently convert from float to int (due to messed up data, to_numeric couldn't do this)

fulldata['Startyear'] = fulldata['Startyear'].astype(pd.Int32Dtype())
fulldata['Endyear'] = fulldata['Endyear'].astype(pd.Int32Dtype())

In [49]:
# We now have all the data to make a dataframe with: DAS IDs, shipnames, first year the ship was active,
# and last year the ship was active (on intercontinental voyages)
# Caveat: NaN-values are excluded
summarydas = fulldata.groupby(['DasShipNameVariant']).agg({'Shipname':'last', 'DasShipID':'last', 'Startyear':'min', 'Endyear':'max'})
summarydas

Unnamed: 0_level_0,Shipname,DasShipID,Startyear,Endyear
DasShipNameVariant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DAS_snv0001,'s Heer Arendskerke,DAS_ship0001,1725,1742
DAS_snv0002,'s Lands Welvaren,DAS_ship0002,1763,1773
DAS_snv0003,'s-Graveland,DAS_ship0003,1659,1660
DAS_snv0004,'s-Graveland,DAS_ship0004,1723,1726
DAS_snv0005,'s-Gravenhage,DAS_ship0005,1628,1635
...,...,...,...,...
DAS_snv1893,Schaapherder,DAS_ship1852,1690,1693
DAS_snv1894,Senhor De Bonfim E Sancta Maria,DAS_ship1853,1782,
DAS_snv1895,Toevalligheid,DAS_ship1854,1745,1745
DAS_snv1896,Batavier,DAS_ship1855,1736,1758


In [51]:
summarydas.to_excel('summarydas.xlsx')

In [13]:
len(gzm_shiplist)

402