In [None]:
# Applying the death-cause categories

# ICD-10 chapter ranges -> chapter title.
# Each key is an inclusive "START-END" range of three-character ICD-10 categories.
death_cause_categories = {
    'A00-B99': 'Certain infectious and parasitic diseases',
    'C00-D49': 'Neoplasms',
    'D50-D89': 'Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism',
    'E00-E89': 'Endocrine, nutritional and metabolic diseases',
    'F01-F99': 'Mental, Behavioral and Neurodevelopmental disorders',
    'G00-G99': 'Diseases of the nervous system',
    'H00-H59': 'Diseases of the eye and adnexa',
    'H60-H95': 'Diseases of the ear and mastoid process',
    'I00-I99': 'Diseases of the circulatory system',
    'J00-J99': 'Diseases of the respiratory system',
    'K00-K95': 'Diseases of the digestive system',
    'L00-L99': 'Diseases of the skin and subcutaneous tissue',
    'M00-M99': 'Diseases of the musculoskeletal system and connective tissue',
    'N00-N99': 'Diseases of the genitourinary system',
    'O00-O9A': 'Pregnancy, childbirth and the puerperium',
    'P00-P96': 'Certain conditions originating in the perinatal period',
    'Q00-Q99': 'Congenital malformations, deformations and chromosomal abnormalities',
    'R00-R99': 'Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified',
    'S00-T88': 'Injury, poisoning and certain other consequences of external causes',
    'U00-U85': 'Codes for special purposes',
    'V00-Y99': 'External causes of morbidity',
    'Z00-Z99': 'Factors influencing health status and contact with health services'
}


def map_icd10_to_category(icd10_code):
    """Return the ICD-10 chapter title for a single ICD-10 code.

    The code is normalised before lookup: surrounding whitespace is removed,
    letters are upper-cased and only the three-character category is kept
    (``' b99.9'`` -> ``'B99'``). Comparing the full code lexicographically
    would push sub-categories at a range's upper bound past it
    (``'B99.9' > 'B99'``) and send them to "Other".

    Parameters
    ----------
    icd10_code : object
        An ICD-10 code such as ``'I21'`` or ``'I21.0'``; missing values
        (anything ``pd.isna`` accepts as missing) are allowed.

    Returns
    -------
    str or float
        The chapter title from ``death_cause_categories``; ``"Other"`` when
        the value is not shaped like an ICD-10 category or lies outside every
        listed range; ``nan`` when the input is missing.
    """
    if pd.isna(icd10_code):
        return float('nan')

    category_key = str(icd10_code).strip().upper()[:3]

    # An ICD-10 category is letter + digit + alphanumeric. Rejecting anything
    # else keeps free text such as "Cancer" from matching a range by accident
    # ('C00' <= 'Cancer' <= 'D49' is true for plain string comparison).
    looks_like_category = (
        len(category_key) == 3
        and 'A' <= category_key[0] <= 'Z'
        and '0' <= category_key[1] <= '9'
        and ('0' <= category_key[2] <= '9' or 'A' <= category_key[2] <= 'Z')
    )
    if not looks_like_category:
        return "Other"

    for code_range, category in death_cause_categories.items():
        # A key may hold several comma-separated intervals or single codes.
        for code_interval in code_range.split(','):
            code_interval = code_interval.strip()
            if '-' in code_interval:
                start, end = code_interval.split('-')
                # Both sides are three characters long, so string order
                # equals ICD-10 category order (note '9' < 'A' for 'O9A').
                if start <= category_key <= end:
                    return category
            elif category_key == code_interval:
                return category
    return "Other"

# Add a "deathcause category" column by mapping each row's ICD-10 code to its category
# with map_icd10_to_category.
# NOTE(review): the only merge visible in this excerpt that brings in an "ICD-10 Code"
# column stores its result in `finaldata2` (in a later cell), not `finaldata` — confirm
# that `finaldata` already has this column when this cell runs on a fresh kernel.
finaldata["deathcause category"] = finaldata["ICD-10 Code"].apply(map_icd10_to_category)

In [None]:
# Migrant algorithm and coding

# Spellings and abbreviations that mark a birthplace as local. They are only
# honoured when the text does not contain "amt".
# NOTE(review): presumably this keeps e.g. "københavns amt" from counting as the city — confirm.
migrant_patterns_noamt = [
    'københavn', 'kjöbenhavn', 'kjøbenh', 'kjbhvn', 'kiøbh', 'kiøben', 'købhvn',
    'kiöbh', 'kiöeben', 'khavn ', 'khvn', 'kbhv', 'kjøbh', 'kbh',
    'i sognet', 'heri sognet',
]

# Place names that mark a birthplace as local whether or not "amt" appears in the text.
migrant_patterns_all = [
    'frederiksberg', 'christianshavn', 'kristianshavn', 'sundby', 'sunby', 'valby',
    'brønshøj', 'brøndshøj', 'brønshøi', 'brøndshøi', 'utterslev', 'vanløse',
    'bispebjerg', 'emdrup', 'husum', 'fredrikbg', 'fredriksberg',
]


def is_migrant(value):
    """Classify a birthplace text as local (0) or migrant (1).

    Parameters
    ----------
    value : object
        Birthplace text. Non-string values are converted with ``str`` before
        matching; missing values are passed through.

    Returns
    -------
    int or float
        ``np.nan`` when ``value`` is missing; ``0`` when the text matches one
        of the local place-name patterns; ``1`` otherwise.
    """
    if pd.isna(value):
        return np.nan

    # Coerce once so the "amt" guard and the pattern search see the same text.
    # The original tested `'amt' not in value` on the raw value, which raises
    # TypeError for non-string input.
    text = str(value)

    def _matches_any(patterns):
        return any(re.search(pattern, text, flags=re.IGNORECASE) for pattern in patterns)

    # The guard is case-insensitive to agree with the case-insensitive patterns.
    if 'amt' not in text.lower() and _matches_any(migrant_patterns_noamt):
        return 0
    if _matches_any(migrant_patterns_all):
        return 0
    return 1

In [None]:

# Creating the dataset containing information on all citizens living in Copenhagen in 1860
# from the 1860 census

# The path must be a string literal; the original unquoted path was a SyntaxError.
# Forward slashes avoid backslash escapes in the literal.
# NOTE(review): this is a machine-specific absolute path — consider a configurable data
# directory so the notebook runs on other machines.
census1860 = pd.read_csv('C:/Users/juliu/Data/Kilder2/census1860s.csv')

# 0 = birthplace matches a local pattern, 1 = migrant, NaN = birthplace missing (see is_migrant).
census1860['migrant'] = census1860['birth_place_cl'].apply(is_migrant)

# Keep rows in district 'københavn', plus rows in district 'sokkelund' whose parish is
# 'staden københavn'.
filter_kobenhavn = census1860['event_district'] == 'københavn'

filter_sokkelund = (census1860['event_district'] == 'sokkelund') & (census1860['event_parish'] == 'staden københavn')

filtered_df2 = census1860[filter_kobenhavn | filter_sokkelund]

# Attach the matching `finaldata` columns to each census row (left join: every census row
# is kept, matched or not).
df = pd.merge(filtered_df2, finaldata, on=['name_cl', 'birth_place_cl'], how='left')

# Keep only the first row for each (name, birthplace) pair.
# NOTE(review): this also collapses different census rows that share the same name and
# birthplace — confirm that is intended.
df = df.drop_duplicates(subset=['name_cl', 'birth_place_cl'])

In [None]:
# Dataframe to contain the ICD-10 coding

# Lists to hold the causes of death and ICD-10 codes
causes = []
codes = []

# Each usable line is expected to look like:
#   Processed cause: <cause text>, ICD-10 code: <code>[, <code> ...]
# NOTE(review): the file is opened with the platform default encoding — confirm it matches
# the encoding processed_output.txt was written with.
with open('processed_output.txt', 'r') as file:
    for line in file:
        # Position of the separator between the cause text and the code(s)
        code_index = line.find(", ICD-10 code")
        if code_index == -1:
            # Blank or malformed line: with index -1 the slices below would
            # silently produce a garbage cause/code pair, so skip it.
            continue

        # Cause of death: text before the separator, minus the leading label if present
        cause = line[:code_index]
        if cause.startswith('Processed cause: '):
            cause = cause[len('Processed cause: '):]

        # ICD-10 code(s): everything after ", ICD-10 code: "
        code = line[code_index + len(', ICD-10 code: '):]

        causes.append(cause.strip())
        codes.append(code.strip())

# Create a DataFrame from the lists
classdf = pd.DataFrame({'Cause of Death': causes, 'ICD-10 Code': codes})

print(classdf)

# When several comma-separated codes were given, keep only the last one. Strip it so a
# value like "A00, B01" yields "B01" rather than " B01"; a leading space sorts before
# every letter and would break range-based matching of the code.
classdf['ICD-10 Code'] = classdf['ICD-10 Code'].str.rsplit(',').str[-1].str.strip()

In [None]:
# Merging the ICD-10 coding into the finaldata set

# No longer used in this cell: the former progress bar wrapped a single blocking
# pd.merge call and could only jump from 0 % to 100 % after the work was done.
# The import is kept in case cells outside this excerpt rely on `tqdm`.
from tqdm import tqdm

# Row count before the merge, used for the sanity check below.
total_iterations = len(finaldata)

# Left join: every finaldata row is kept and gains the ICD-10 code for its death cause.
# validate='m:1' raises if a cause appears more than once in classdf.
# indicator=True adds a `_merge` column showing which rows found a match.
# NOTE(review): the result is bound to `finaldata2`, while another cell in this notebook
# reads finaldata["ICD-10 Code"] — confirm which frame downstream cells should use.
finaldata2 = pd.merge(
    finaldata,
    classdf,
    left_on='deathcauses',
    right_on='Cause of Death',
    how='left',
    validate='m:1',
    indicator=True,
)

# Drop the duplicate column (Cause of Death); it repeats `deathcauses` for matched rows.
finaldata2 = finaldata2.drop('Cause of Death', axis=1)

# A many-to-one left join must not add or remove rows.
assert len(finaldata2) == total_iterations, "merge changed the number of rows"