In [2]:
import os 
import pandas as pd
%load_ext autoreload
%autoreload 2
from demografi import mergefunc
from demografi import cleanfunc

In [18]:

#File paths to standardized datasets - manual addition of more datasets from project updates is possible
#by adding the corresponding .csv path with file_paths.append(...) directly below the 'file_paths' list.
#Every dataset also gets an entry in the name array, which is used to name additional variables from
#each dataset, e.g. "event_parish_1787s". The entries are derived from the file names: a leading
#'census' is removed ('census1787s.csv' gives '1787s'), any other file keeps its full stem.
#Example: after adding source 11: PR Burials as the last path, a custom name can be chosen by
#copying this code snippet to the end of this code cell: name_array[-1] = 'NAME_YOU_CHOOSE'

#OBS: datasets in 'file_paths' must be in correct numerical order according to the link-lives source enumeration. See the project Readme.


#Root folder of the local data - the only line to change when the data is stored somewhere else
DATA_DIR = 'C:/Users/juliu/Data'

#Years of the standardized census files, listed in the order required by the OBS note above
census_years = ['1787', '1801', '1834', '1840', '1845', '1850', '1860', '1880', '1885', '1901']

file_paths = [f'{DATA_DIR}/Kilder2/census{year}s.csv' for year in census_years]

CBPt = f'{DATA_DIR}/Kilder1/CBPt.csv' #Copenhagen Burial Protocols. Standardized version can be used instead
lifecourses = f'{DATA_DIR}/life_courses.csv' #The life-courses set, containing life-course id's


def dataset_label(file_path):
    """Return the naming label for one dataset file.

    The label is the file name without directory and extension; a leading
    'census' is stripped so that '.../census1787s.csv' gives '1787s'.
    Unlike a fixed character slice of the full path, the result does not
    depend on where the data folder is located.
    """
    stem = os.path.splitext(os.path.basename(file_path))[0]
    prefix = 'census'
    return stem[len(prefix):] if stem.startswith(prefix) else stem


#One label per entry of 'file_paths', in the same order
name_array = [dataset_label(file_path) for file_path in file_paths]


In [19]:
#Checking the paths
def assert_paths_exist(paths):
    """Assert that every given filesystem path exists.

    Parameters
    ----------
    paths : str or iterable of str
        A single path, or several paths. A lone string is treated as one
        path rather than being iterated character by character.

    Raises
    ------
    AssertionError
        For the first path that does not exist; the message names that path.
    """
    candidates = [paths] if isinstance(paths, str) else paths
    for candidate in candidates:
        assert os.path.exists(candidate), f"Path does not exist: {candidate}"

# Validate every configured input location (the census list first, then the two
# single files) so a wrong path fails here rather than in a later cell
for location in (file_paths, CBPt, lifecourses):
    assert_paths_exist(location)

In [None]:
#Cleaning step: cleanfunc (imported from the demografi .py module) is called once per census
#file, each path paired with the label at the same position in name_array.
#The results are collected in the same order as file_paths.
census_datasets = [
    cleanfunc(csv_path, label)
    for csv_path, label in zip(file_paths, name_array)
]

In [None]:
#Identifying and separating the Copenhagen Burial Protocol observations in the life-courses dataset

#The loaded table gets its own name: 'lifecourses' stays bound to the file path, so this cell
#and the path check can be re-run without restarting the kernel
lifecourses_raw = pd.read_csv(lifecourses)

#Each life course lists its linked records as comma-separated strings in 'source_ids' and 'pa_ids'.
#They are split into one column per position (source0..source12, pa_id0..pa_id12) in order to
#operate on them. The split must yield exactly this many columns, otherwise the assignment raises.
N_LINK_SLOTS = 13
source_columns = [f'source{slot}' for slot in range(N_LINK_SLOTS)]
pa_id_columns = [f'pa_id{slot}' for slot in range(N_LINK_SLOTS)]

#Work on a copy so that the raw table is left unmodified
lifecourses_split = lifecourses_raw.copy()
lifecourses_split[source_columns] = lifecourses_split.source_ids.str.split(',', expand=True).astype('Int64')
lifecourses_split[pa_id_columns] = lifecourses_split.pa_ids.str.split(',', expand=True).astype('str')


#Identifying and separating out the observations present in the Copenhagen Burial Protocols.
#The split integer columns are compared for an exact match with source id 10; a substring
#search in the raw 'source_ids' text would also hit any other id containing the digits "10".
#Empty positions (NA) count as "no match".
CBP_SOURCE_ID = 10
has_cbp_record = (
    lifecourses_split[source_columns]
    .eq(CBP_SOURCE_ID)
    .fillna(False)
    .astype(bool)
    .any(axis=1)
)

#Independent copy, so later cells work on a real frame and not on a view of lifecourses_split
lifecourses2 = lifecourses_split.loc[has_cbp_record].copy()



In [None]:
#Merging the information from each cleaned census dataset onto the CBP life courses with mergefunc.
#The third argument is the dataset's position in census_datasets, which is why the
#OBS: datasets must be in correct numerical order according to the link-lives source enumeration. See: Readme.me
merged_datasets = [
    mergefunc(lifecourses2, census, source_index)
    for source_index, census in enumerate(census_datasets)
]

#Stacking the per-source merge results into 1 dataset
concatenated_dataset = pd.concat(merged_datasets)

print(concatenated_dataset.shape)


In [None]:
#Cleaning the merged dataset

#Collapsing to one row per life_course_id. Every column - the grouping column included - is
#aggregated with 'first' (pandas' groupby 'first' takes the first non-null entry per group),
#so a value found in any of the stacked rows of a life course is kept.
aggregation_functions = dict.fromkeys(concatenated_dataset.columns, 'first')

concatenated_dataset_grouped = (
    concatenated_dataset
    .groupby(['life_course_id'])
    .aggregate(aggregation_functions)
)

#Removing unneeded variables: the raw id strings, the per-position helper columns
#(source0..source12, pa_id0..pa_id12) and the merge key 'pa_id'
helper_columns = (
    ['pa_ids', 'link_ids']
    + [f'source{slot}' for slot in range(13)]
    + [f'pa_id{slot}' for slot in range(13)]
    + ['pa_id']
)
cleanedcensus0 = concatenated_dataset_grouped.drop(helper_columns, axis=1)

#Columns without a single value carry no information and are removed as well
cleanedcensus0 = cleanedcensus0.dropna(axis=1, how='all')

In [None]:
#Initializing, cleaning, and adding transcribed version of the Copenhagen Burial Protocols

#The loaded table gets its own name: 'CBPt' stays bound to the file path, so this cell
#and the path check can be re-run without restarting the kernel
CBPt_data = pd.read_csv(CBPt)

#'pa_id' is converted to text before it is handed to mergefunc
CBPt_data['pa_id'] = CBPt_data['pa_id'].astype(str)

#10 is the source number passed to mergefunc for the Copenhagen Burial Protocols
CBPtmerg = mergefunc(lifecourses2, CBPt_data, 10)

#Removing the raw id strings, the per-position helper columns and the record ids
CBPtmerg = CBPtmerg.drop(
    ['pa_ids', 'link_ids', 'source_ids']
    + [f'source{slot}' for slot in range(13)]
    + [f'pa_id{slot}' for slot in range(13)]
    + ['pa_id', 'id'],
    axis=1,
)

#Both tables are indexed by life_course_id and joined on it; how='left' keeps every
#life course of the census table, whether or not a CBP row is attached
cleanedcensus1 = cleanedcensus0.set_index('life_course_id')
CBPtmerg1 = CBPtmerg.set_index('life_course_id')

finaldata = (
    cleanedcensus1
    .merge(CBPtmerg1, on='life_course_id', how='left')
    #Columns present in both tables get _x (census side) / _y (CBP side) suffixes:
    #the census-side copies of n_sources and sex are kept under their plain names
    .rename(columns={'n_sources_x':'n_sources', 'sex_x':'sex'})
    .drop(['n_sources_y', 'sex_y'], axis=1)
    #Variables not carried into the final dataset
    .drop(['firstnames', 'lastname', 'birthname', 'number', 'dateOfBirth', 'yearOfBirth', 'street_unique'], axis=1)
)

In [None]:
# Dropping duplicates wrongly identified as unique by life_course_id: rows that agree on
# all three key columns below are collapsed, keeping the first occurrence
duplicate_key = ['name_cl', 'chapel', 'dateOfDeath']
finaldata = finaldata.drop_duplicates(subset=duplicate_key, keep='first')

In [None]:
#Creating .csv file in local directory (the current working directory); the index is not written.
#NOTE(review): finaldata appears to carry life_course_id in its index (both tables are indexed
#by it before the merge), so index=False would leave it out of the file - confirm this is intended
finaldata.to_csv(path_or_buf='finaldata.csv', index=False)