In [35]:
from IPython.display import HTML
HTML('''
    <style> body {font-family: "Roboto Condensed Light", "Roboto Condensed";} h2 {padding: 10px 12px; background-color: #DDE6D5; position: static; color: #ffffff; font-size: 40px;} .text_cell_render p { font-size: 15px; } .text_cell_render h1 { font-size: 30px; } h1 {padding: 10px 12px; background-color: #E5C1CD; color: #ffffff; font-size: 40px;} .text_cell_render h3 { padding: 10px 12px; background-color: #E8F3FF; position: static; color: #ffffff; font-size: 20px;} h4:before{ 
    content: "@"; font-family:"Wingdings"; font-style:regular; margin-right: 4px;} .text_cell_render h4 {padding: 8px; font-family: "Roboto Condensed Light"; position: static; font-style: italic; background-color: #FFB800; color: #ffffff; font-size: 18px; text-align: center; border-radius: 5px;}input[type=submit] {background-color: #E64626; border: solid; border-color: #734036; color: white; padding: 8px 16px; text-decoration: none; margin: 4px 2px; cursor: pointer; border-radius: 20px;}</style>
''')

# Original Dataset Cleaning

In [6]:
import pandas as pd
import country_converter as coco
import numpy as np

In [7]:
#load inital dataset
raw = pd.read_csv("life-expectancy.csv")

In [8]:
#define entity categories
continent = ['Americas', 'Africa', 'Asia', 'Europe', 'Latin America and the Caribbean', 'Northern America', 'Oceania']
income_groups = ['High-income countries', 'Upper-middle-income countries', 'Middle-income countries', 'Lower-middle-income countries', 'Low-income countries', 'No income group available']
development = ['More developed regions', 'Small Island Developing States (SIDS)', 'Less developed regions', 'Less developed regions, excluding China', 'Less developed regions, excluding least developed countries', 'Least developed countries', 'Land-locked Developing Countries (LLDC)']

In [9]:
#create new df for continents, income level, development status
continents = raw[raw['Entity'].isin(continent) == True]
income = raw[raw['Entity'].isin(income_groups) == True]
develop = raw[raw['Entity'].isin(development) == True]

In [10]:
#export as csv
continents.to_csv("continents.csv")
income.to_csv("income.csv")
develop.to_csv("development.csv")

In [11]:
#create new df for countries
countries = raw[raw['Entity'].isin(development) == False]
countries = countries[countries['Entity'].isin(income_groups) == False]
countries = countries[countries['Entity'].isin(continent) == False]

In [31]:
#hide error output for aesthetic purpose
%%capture
#add continent column for countries df
converter = coco.CountryConverter()
countries['Continent'] = converter.convert(names=countries['Code'], src="ISO3", to="continent")

In [13]:
#find countries without continent match
a = countries[countries['Code'].isna() == True]
a = a[['Entity', 'Code']]
grouped = a.groupby(by="Entity").sum()
grouped

Unnamed: 0_level_0,Code
Entity,Unnamed: 1_level_1
England and Wales,0
Northern Ireland,0
Scotland,0


In [14]:
#update continent column
countries['Continent'] = countries['Continent'].str.replace('not found', 'Europe')

In [15]:
#export to csv
countries.to_csv("countries.csv")

# Additional Datasets

## Disease Mortality Rates

In [16]:
# load dataset (https://ghdx.healthdata.org/record/ihme-data/gbd-2021-cause-specific-mortality-1990-2021)
disease = pd.read_excel("disease-mortality-rates.XLSX")

In [17]:
#create a subset with only mortality rate per 100,000 people
disease = disease[['location_type', 'location_name', 'cause_name', '2010 (ASMR)', '2019 (ASMR)', '2020 (ASMR)', '2021 (ASMR)']]

#convert columns to numeric
cols = ['2010 (ASMR)', '2019 (ASMR)', '2020 (ASMR)', '2021 (ASMR)']
for i in cols:
    disease[i] = disease.loc[:, i].str.replace(r"\(.*\)","", regex=True)
    disease[i] = pd.to_numeric(disease.loc[:, i])

In [18]:
disease.loc[:, 'change'] = (disease.loc[:, '2021 (ASMR)'] - disease.loc[:, '2010 (ASMR)'])

In [19]:
#create global dataset
globe = disease[disease['location_type'] == 'Global']

#create global grouped dataset by cause
globe_group = globe[['cause_name', '2010 (ASMR)', '2019 (ASMR)', '2020 (ASMR)', '2021 (ASMR)', 'change']].groupby(by='cause_name').mean().reset_index()
globe_group.sort_values(by='change', ascending=False)

Unnamed: 0,cause_name,2010 (ASMR),2019 (ASMR),2020 (ASMR),2021 (ASMR),change
24,COVID-19,0.00,0.00,58.68,93.98,93.98
140,Other COVID-19 pandemic-related outcomes,0.00,0.00,16.71,32.33,32.33
54,Diabetes mellitus type 2,18.45,19.19,19.11,19.02,0.57
44,Conflict and terrorism,0.76,1.51,1.15,1.26,0.50
32,Chronic kidney disease due to diabetes mellitu...,5.27,5.73,5.73,5.72,0.45
...,...,...,...,...,...,...
95,Ischemic stroke,52.44,45.06,44.57,44.18,-8.26
92,Intracerebral hemorrhage,48.45,39.89,39.45,39.08,-9.37
106,Lower respiratory infections,40.93,34.69,30.43,28.66,-12.27
55,Diarrheal diseases,28.79,17.14,16.13,15.44,-13.35


In [20]:
#create africa dataset
a = ['North Africa and Middle East', 'Central Sub-Saharan Africa', 'Eastern Sub-Saharan Africa', 'Southern Sub-Saharan Africa', 'Western Sub-Saharan Africa']
africa = disease[(disease['location_type'] == 'Region') & (disease['location_name'].isin(a) == True)]

#create grouped africa dataset by cause
ac = africa.copy()
ac = ac[['cause_name', '2010 (ASMR)', '2019 (ASMR)', '2020 (ASMR)', '2021 (ASMR)', 'change']]
africa_group = ac.groupby(by='cause_name').mean().reset_index()

In [21]:
#create column of difference between african vs global average in 2021
africa_group['diff'] = (africa_group['2021 (ASMR)'] - globe_group['2021 (ASMR)'])
africa_group = africa_group.sort_values(by='diff', ascending=False)
africa_group

Unnamed: 0,cause_name,2010 (ASMR),2019 (ASMR),2020 (ASMR),2021 (ASMR),change,diff
24,COVID-19,0.000,0.000,165.974,273.010,273.010,179.030
140,Other COVID-19 pandemic-related outcomes,0.000,0.000,49.518,119.190,119.190,86.860
106,Lower respiratory infections,101.996,85.830,77.468,74.618,-27.378,45.958
60,Drug-susceptible tuberculosis,86.234,57.760,56.028,54.930,-31.304,42.340
83,HIV/AIDS resulting in other diseases,89.890,43.336,41.488,39.564,-50.326,33.294
...,...,...,...,...,...,...,...
176,Pancreatic cancer,3.198,3.480,3.520,3.526,0.328,-2.424
205,Stomach cancer,9.080,8.040,7.964,7.854,-1.226,-3.346
43,Colon and rectum cancer,8.802,8.918,8.956,8.920,0.118,-3.480
214,"Tracheal, bronchus, and lung cancer",11.666,11.296,11.352,11.288,-0.378,-12.212


In [22]:
globe_group[globe_group['cause_name'] == 'Physical violence by other means']

Unnamed: 0,cause_name,2010 (ASMR),2019 (ASMR),2020 (ASMR),2021 (ASMR),change
185,Physical violence by other means,2.12,1.82,1.78,1.76,-0.36


In [23]:
#create dataset for sudan
sudan = disease[disease['location_name'] == 'South Sudan']

#create grouped dataset by cause for sudan and column for difference from global average of 2021
sudan_group = sudan[['cause_name', '2010 (ASMR)', '2019 (ASMR)', '2020 (ASMR)', '2021 (ASMR)', 'change']].groupby(by='cause_name').mean().reset_index()
sudan_group['diff'] = (sudan_group['2021 (ASMR)'] - globe_group['2021 (ASMR)'])
sudan_group.sort_values(by='diff', ascending=False).head(10)

Unnamed: 0,cause_name,2010 (ASMR),2019 (ASMR),2020 (ASMR),2021 (ASMR),change,diff
24,COVID-19,0.0,0.0,244.61,264.78,264.78,170.8
55,Diarrheal diseases,182.13,177.22,171.87,166.64,-15.49,151.2
140,Other COVID-19 pandemic-related outcomes,0.0,0.0,79.86,140.94,140.94,108.61
60,Drug-susceptible tuberculosis,101.04,102.31,102.85,103.54,2.5,90.95
106,Lower respiratory infections,116.25,130.16,117.24,116.7,0.45,88.04
107,Malaria,47.05,74.22,69.31,66.45,19.4,55.93
92,Intracerebral hemorrhage,73.05,80.72,81.52,81.62,8.57,42.54
54,Diabetes mellitus type 2,47.63,55.56,56.06,56.26,8.63,37.24
191,Protein-energy malnutrition,33.59,35.99,34.37,32.78,-0.81,30.17
83,HIV/AIDS resulting in other diseases,79.09,41.97,38.0,33.76,-45.33,27.49


## Life Expectancy Decomposition

In [36]:
#load dataset (https://cloud.ihme.washington.edu/s/6w3TkFaQw63Djnd?)
decomp = pd.read_excel("life-expectancy-decomp.XLSX")
decomp

Unnamed: 0,Location Name,Start Year,End Year,Start Year LE,End Year LE,COVID-19,Chronic respiratory diseases,Diabetes and kidney diseases,Digestive diseases,Enteric infections,...,Nutritional deficiencies,Other COVID-19 pandemic-related outcomes,Other Communicable and maternal disorders,Other NCD,Self-harm and interpersonal violence (except of war and terrorism),Stroke,Transport injuries,Tuberculosis,Unintentional injuries (except of natural disaster),Natural disaster; war and terrorism
0,Global,1990,2021,65.5,71.7,-1.6,0.46,-0.05,0.30,1.13,...,0.22,-0.6,0.62,0.52,0.19,0.78,0.21,0.52,0.41,0.05
1,Global,1990,2000,65.5,67.2,0.0,0.12,-0.02,0.07,0.38,...,0.04,0.0,0.18,0.15,0.00,0.16,0.04,0.13,0.11,0.03
2,Global,2000,2010,67.2,70.5,0.0,0.20,0.00,0.10,0.36,...,0.11,0.0,0.22,0.18,0.11,0.33,0.05,0.22,0.14,-0.07
3,Global,2010,2019,70.5,73.3,0.0,0.11,-0.03,0.11,0.34,...,0.06,0.0,0.15,0.13,0.07,0.26,0.10,0.15,0.12,0.08
4,Global,2019,2021,73.3,71.7,-1.6,0.02,0.00,0.02,0.05,...,0.01,-0.6,0.07,0.05,0.01,0.03,0.02,0.02,0.03,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2720,Troms og Finnmark,1990,2021,75.5,81.7,0.0,-0.18,0.00,0.11,-0.04,...,-0.03,0.0,0.01,0.44,0.08,0.97,0.44,0.02,0.14,0.03
2721,Troms og Finnmark,1990,2000,75.5,77.7,0.0,-0.11,-0.04,0.01,-0.01,...,0.00,0.0,0.01,0.08,0.06,0.26,0.16,0.01,0.07,0.01
2722,Troms og Finnmark,2000,2010,77.7,80.6,0.0,-0.02,0.01,0.09,-0.05,...,0.00,0.0,0.00,0.29,0.05,0.44,0.17,0.01,0.05,-0.02
2723,Troms og Finnmark,2010,2019,80.6,82.4,0.0,0.01,0.05,0.03,0.02,...,-0.03,0.0,-0.01,0.16,-0.03,0.30,0.12,0.00,0.04,0.03


## Death in Armed Conflicts

In [25]:
#load df (Uppsala Conflict Data Program (2023); Natural Earth (2022) – processed by Our World in Data)
conflicts = pd.read_csv("deaths-in-armed-conflicts-by-country.csv")

In [32]:
#hide error output for aesthetic purpose
%%capture
#add continent column to df
converter = coco.CountryConverter()
conflicts['Continent'] = converter.convert(names=conflicts['Code'], src="ISO3", to="continent")

In [None]:
#check for missed countries
conflicts['Continent'].unique()

In [27]:
#remove non-states/former states from df
conflicts = conflicts[conflicts['Continent'] != 'not found']

In [28]:
#create df for african conflicts
africa = conflicts[conflicts['Continent'] == 'Africa']

In [29]:
#create df for case studies
case = conflicts[(conflicts['Entity'] == 'TODO') | (conflicts['Entity'] == 'South Sudan')]
case

Unnamed: 0,Entity,Code,Year,Deaths in ongoing conflicts in a country (best estimate) - Conflict type: all,Continent
5746,South Sudan,SSD,1989,3907,Africa
5747,South Sudan,SSD,1990,4272,Africa
5748,South Sudan,SSD,1991,4206,Africa
5749,South Sudan,SSD,1992,3346,Africa
5750,South Sudan,SSD,1993,5877,Africa
5751,South Sudan,SSD,1994,491,Africa
5752,South Sudan,SSD,1995,1162,Africa
5753,South Sudan,SSD,1996,2374,Africa
5754,South Sudan,SSD,1997,3574,Africa
5755,South Sudan,SSD,1998,3887,Africa


# Healthcare Expenditure
World Health Organization - Global Health Observatory (2024) – processed by Our World in Data

# Potential Topics

## Specific Disease
*Can we explain the reason behind Africa's lagging life expectancy by some specific diseases?*
- *If Africa is being left behind on life expectancy growth, are there any specific diseases that can be accountable? Are they preventable - if it's not affecting other continents, they probably are.*
- *Has the Gates Foundation done much? (lol)*
- *Why did an extra 55 years to see the improved life expectancy outcomes following the discoveries of Louis Pasteur? Is a similar thing happening now?*

## War
*Can we explain a subset of countries life expectancy according to wars or political events like famines, etc and can we explain why the life expectancy will bounce back after these events? Additionally, will these political events have a longer term effect on population (e.g. maybe a country's life expectancy plummets due to a genocide, then the average life expectancy does bounce back but the population of the country will decrease. So the life expectancy may be high, but it is because it only includes the survivors.*
- *Does war have a significant effect on life expectancy from birth? Are there any countries that don't follow this upward trend?*

## How are Regions Left Behind?
- *We know that life expectancy has been going up for a long time, and the trend is found nearly everywhere. But, why is that Africa is still lagging way behind all other continents?*
- *Well, we can see that they had a much later start than every other continent - why?* **Records weren't really kept until 1950 when decolonisation really started. Colonialists weren't particularly interested in setting up medical infrastructure for the Indigenous people, and so they didn't get to feel the benefits of rapidly improving modern medicine until they started creating their own medical infrastructure.**
- *So what are the current effects? What diseases / causes of death are the most prevalent in Africa and why are they able to thrive where they have been eradicated in the rest of the world?* **TODO: look at life expectancy decomp + analyse the diseases that Africa is above average in**
- *Then they had the 'lost decade' - why did they fail to make it?*
- *We can also see, the biggest increase in deaths over time in Africa aside from Covid-19 (which they suffered the worst out of all continents) from **terrorism and conflict*** - *(this is led by Ethiopia/Somalia and the ethnic groups within)*
- *So we might want to think about how terrorism and conflict impacts life expectancy historically*
- *We might also want to think about how it affects overall population growth, and how that matches up with life expectancy*

## Storytelling

1. There is a trend of increasing life expectancy over time, fueled most clearly by the development of germ theory in 1870.
2. However, Africa does not start increasing until much later on - why?
    * Largely colonised, and colonial forces did not create much medical infrastructure for the Indigenous people, so the benefits of modern medicine were not shared. In the 50's, when decolonisation begins, then they start to develop these critical infrastructures.
    * So why are they still not meeting the global standard? Let's look into different diseases which are impacting the continent the most, comparative to other countries as well.
3. But disease is not the whole story, looking on a country level we can see the impact of conflict and terrorism in the region - focus on Sudan & Palestine
    * Can we find a correlation between deaths due to armed conflict and life expectancy?
    * Highlight how excessive conflict is in Africa comparative to other regions

**Pre 2000s / 90s**
1. There is a trend of increasing life expectancy over time, fueled most clearly by the development of germ theory in 1870.
2. However, Africa does not start increasing until much later on - why?
    a. Largely colonised, and colonial forces did not create much medical infrastructure for the Indigenous people, so the benefits of modern medicine were not shared. In the 50's, when decolonisation begins, then they start to develop these critical infrastructures.

**2000s Onward**

3. So why are they still not meeting the global standard? Let's look into different diseases which are impacting the continent the most, comparative to other countries as well.
4. But disease is not the whole story, looking on a country level we can see the impact of conflict and terrorism in the region - focus on Sudan & Palestine
    * Can we find a correlation between deaths due to armed conflict and life expectancy?
    * Highlight how excessive conflict is in Africa comparative to other regions

## TODO

### Analysis
- Create line graph of life expectancy by continent over time, with annotations on 1870 to mark the start of the upward trend and on 1935 to show when Africa begins to rise.
- Find a source (best would be a dataset) explaining medical infrastructure in colonial times / when health outcomes began to improve
- Disease analysis: point to specific diseases that are holding Africa back and their development over time (HIV, COVID-19, etc)
- Year on year change in life expectancy with year on year combat deaths - looking for correlation (time series analysis research)
- Case study on South Sudan 
- Map visualisation of combat deaths by continent (or country?)

### Report
- Convert to RMD
- Data provenance