In [36]:
import pandas as pd

# Population Data

In [37]:
# 1. Load the raw population table and rename the 'Total population'
#    column to 'country'.
rename_dict = {'Total population': 'country'}
pop_df = pd.read_csv('../data/population.csv').rename(columns=rename_dict)

In [39]:
# Preview the first five rows of the renamed population table.
pop_df.head(n=5)

Unnamed: 0,country,year,population
0,Abkhazia,1800,
1,Afghanistan,1800,3280000.0
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,410445.0
4,Algeria,1800,2503218.0


In [40]:
# 2. Count missing values per column (2099 'population' entries are missing).
pop_df.isnull().sum()

country          0
year             0
population    2099
dtype: int64

In [41]:
# 3. Drop every row that contains at least one missing value.
pop_clean = pop_df.dropna(axis=0, how='any')

In [42]:
# 4. Build a boolean mask for the relevant period: rows whose year is
#    strictly greater than 1949, i.e. from 1950 onwards.
year_filt = pop_clean['year'].gt(1949)

In [43]:
# Inspect the rows selected by the year mask.
pop_clean.loc[year_filt]

Unnamed: 0,country,year,population
4126,Afghanistan,1950,7752118.0
4127,Akrotiri and Dhekelia,1950,10661.0
4128,Albania,1950,1263171.0
4129,Algeria,1950,8872247.0
4130,American Samoa,1950,18937.0
...,...,...,...
22256,Zambia,2015,16211767.0
22257,Zimbabwe,2015,15602751.0
22259,South Sudan,2015,12339812.0
22260,Curaçao,2015,157203.0


In [44]:
# 5. Keep the filtered rows under their own name so they can be saved.
pop_final = pop_clean.loc[year_filt]

In [45]:
# Write the cleaned population table to disk without the row index.
pop_final.to_csv(path_or_buf='../data/population_clean.csv', index=False)

# Fertility Data

In [46]:
# 1. Load the raw fertility table and rename the 'Total fertility rate'
#    column to 'country'.
rename_dict = {'Total fertility rate': 'country'}
fert_df = pd.read_csv('../data/fertility_rate.csv').rename(columns=rename_dict)

In [47]:
# Preview the first five rows of the renamed fertility table.
fert_df.head(n=5)

Unnamed: 0,country,year,fertility
0,Abkhazia,1800,
1,Afghanistan,1800,7.0
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,4.6
4,Algeria,1800,6.99


In [48]:
# Show the column labels to confirm the rename took effect.
fert_df.columns

Index(['country', 'year', 'fertility'], dtype='object')

In [49]:
# 2. Count missing values per column (12747 'fertility' entries are missing).
fert_df.isnull().sum()

country          0
year             0
fertility    12747
dtype: int64

In [50]:
# 3. Drop every row that contains at least one missing value.
fert_clean = fert_df.dropna(axis=0, how='any')

In [51]:
# 4. Build a boolean mask for the relevant period: rows whose year is
#    strictly greater than 1949, i.e. from 1950 onwards.
fert_filt = fert_clean['year'].gt(1949)

In [53]:
# Inspect the rows selected by the year mask.
fert_clean.loc[fert_filt]

Unnamed: 0,country,year,fertility
39001,Afghanistan,1950,7.67
39003,Albania,1950,5.80
39004,Algeria,1950,7.65
39007,Angola,1950,6.93
39009,Antigua and Barbuda,1950,4.45
...,...,...,...
56150,Vietnam,2015,1.70
56151,Virgin Islands (U.S.),2015,2.45
56154,Yemen,2015,3.83
56156,Zambia,2015,5.59


In [54]:
# 5. Keep the filtered rows under their own name so they can be saved.
fert_final = fert_clean.loc[fert_filt]

In [57]:
# Write the cleaned fertility table to disk.
# index=False matches how the population table is saved and keeps the row
# labels (leftovers of the unfiltered frame, e.g. 39001) out of the file;
# otherwise they come back as an 'Unnamed: 0' column when the CSV is re-read.
fert_final.to_csv('../data/fertility_rate_cleaned.csv', index=False)

# Life Expectancy Data

In [58]:
# Load the raw life-expectancy table and display it for a first look.
life_exp = pd.read_csv(filepath_or_buffer='../data/life_expectancy.csv')
life_exp

Unnamed: 0.1,Unnamed: 0,Life expectancy,year,life expectancy
0,0,Abkhazia,1800,
1,1,Afghanistan,1800,28.21
2,2,Akrotiri and Dhekelia,1800,
3,3,Albania,1800,35.40
4,4,Algeria,1800,28.82
...,...,...,...,...
56415,56415,Yugoslavia,2016,
56416,56416,Zambia,2016,57.10
56417,56417,Zimbabwe,2016,61.69
56418,56418,Åland,2016,


In [59]:
# Remove the leftover 'Unnamed: 0' column; in the displayed output its values
# simply mirror the row index.
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
column_to_delete = 'Unnamed: 0'
life_exp = life_exp.drop(columns=column_to_delete, errors='ignore')

In [60]:
# Display the frame to confirm the 'Unnamed: 0' column has been removed.
life_exp

Unnamed: 0,Life expectancy,year,life expectancy
0,Abkhazia,1800,
1,Afghanistan,1800,28.21
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,35.40
4,Algeria,1800,28.82
...,...,...,...
56415,Yugoslavia,2016,
56416,Zambia,2016,57.10
56417,Zimbabwe,2016,61.69
56418,Åland,2016,


In [47]:
# NOTE(review): this cell's execution count (47) is out of order and its
# output repeats the previous display of life_exp; likely a stale leftover.
life_exp

Unnamed: 0,Life expectancy,year,life expectancy
0,Abkhazia,1800,
1,Afghanistan,1800,28.21
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,35.40
4,Algeria,1800,28.82
...,...,...,...
56415,Yugoslavia,2016,
56416,Zambia,2016,57.10
56417,Zimbabwe,2016,61.69
56418,Åland,2016,


In [61]:
# 1. Rename the 'Life expectancy' column (capital L) to 'country';
#    the lowercase 'life expectancy' column is left untouched.
life_exp = life_exp.rename(columns={'Life expectancy': 'country'})

In [62]:
# Display the frame to confirm the column is now called 'country'.
life_exp

Unnamed: 0,country,year,life expectancy
0,Abkhazia,1800,
1,Afghanistan,1800,28.21
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,35.40
4,Algeria,1800,28.82
...,...,...,...
56415,Yugoslavia,2016,
56416,Zambia,2016,57.10
56417,Zimbabwe,2016,61.69
56418,Åland,2016,


In [63]:
# 2. Count missing values per column (12563 'life expectancy' entries are missing).
life_exp.isnull().sum()

country                0
year                   0
life expectancy    12563
dtype: int64

In [64]:
# 3. Drop every row that contains at least one missing value.
life_clean = life_exp.dropna(axis=0, how='any')

In [65]:
# 4. Build a boolean mask for the relevant period: rows whose year is
#    strictly greater than 1949, i.e. from 1950 onwards.
life_filt = life_clean['year'].gt(1949)

In [67]:
# Inspect the rows selected by the year mask.
life_clean.loc[life_filt]

Unnamed: 0,country,year,life expectancy
39001,Afghanistan,1950,26.85
39003,Albania,1950,54.48
39004,Algeria,1950,42.77
39007,Angola,1950,30.70
39009,Antigua and Barbuda,1950,57.97
...,...,...,...
56411,Virgin Islands (U.S.),2016,80.82
56414,Yemen,2016,64.92
56416,Zambia,2016,57.10
56417,Zimbabwe,2016,61.69


In [68]:
# 5. Keep the filtered rows under their own name so they can be saved.
life_final = life_clean.loc[life_filt]

In [69]:
# Write the cleaned life-expectancy table to disk.
# index=False matches how the population table is saved and keeps the row
# labels (leftovers of the unfiltered frame, e.g. 39001) out of the file;
# otherwise they come back as an 'Unnamed: 0' column when the CSV is re-read.
life_final.to_csv('../data/life_expectancy_cleaned.csv', index=False)