In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 

## Population Data from https://data.worldbank.org/indicator/SP.POP.TOTL?locations=TH
This dataset might come in handy in calculating per capita values, which is essential to compare values accross countries as no country have the same amount of population which means it is not fair to compare values between each country directly

In [2]:
df = pd.read_csv('../data/PopulationWorldBank.csv', skiprows=4)
df.head(5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,Unnamed: 68
0,Aruba,ABW,"Population, total",SP.POP.TOTL,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,...,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0,106537.0,106445.0,106277.0,
1,Africa Eastern and Southern,AFE,"Population, total",SP.POP.TOTL,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,...,600008424.0,616377605.0,632746570.0,649757148.0,667242986.0,685112979.0,702977106.0,720859132.0,739108306.0,
2,Afghanistan,AFG,"Population, total",SP.POP.TOTL,8622466.0,8790140.0,8969047.0,9157465.0,9355514.0,9565147.0,...,33753499.0,34636207.0,35643418.0,36686784.0,37769499.0,38972230.0,40099462.0,41128771.0,42239854.0,
3,Africa Western and Central,AFW,"Population, total",SP.POP.TOTL,97256290.0,99314028.0,101445032.0,103667517.0,105959979.0,108336203.0,...,408690375.0,419778384.0,431138704.0,442646825.0,454306063.0,466189102.0,478185907.0,490330870.0,502789511.0,
4,Angola,AGO,"Population, total",SP.POP.TOTL,5357195.0,5441333.0,5521400.0,5599827.0,5673199.0,5736582.0,...,28127721.0,29154746.0,30208628.0,31273533.0,32353588.0,33428486.0,34503774.0,35588987.0,36684202.0,


In [3]:
df.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022',
       '2023', 'Unnamed: 68'],
      dtype='object')

## Tidying and Cleaning the Data 

In [4]:
df.nunique() #this indicates the Indicator Name and Code is an irrelevant column as they are all the same (Population)

Country Name      266
Country Code      266
Indicator Name      1
Indicator Code      1
1960              260
                 ... 
2020              263
2021              263
2022              263
2023              263
Unnamed: 68         0
Length: 69, dtype: int64

In [5]:
df.drop(['Indicator Name', 'Indicator Code'], axis='columns', inplace=True)
df.head()

Unnamed: 0,Country Name,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,Unnamed: 68
0,Aruba,ABW,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,59291.0,59522.0,...,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0,106537.0,106445.0,106277.0,
1,Africa Eastern and Southern,AFE,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,153955516.0,158313235.0,...,600008424.0,616377605.0,632746570.0,649757148.0,667242986.0,685112979.0,702977106.0,720859132.0,739108306.0,
2,Afghanistan,AFG,8622466.0,8790140.0,8969047.0,9157465.0,9355514.0,9565147.0,9783147.0,10010030.0,...,33753499.0,34636207.0,35643418.0,36686784.0,37769499.0,38972230.0,40099462.0,41128771.0,42239854.0,
3,Africa Western and Central,AFW,97256290.0,99314028.0,101445032.0,103667517.0,105959979.0,108336203.0,110798486.0,113319950.0,...,408690375.0,419778384.0,431138704.0,442646825.0,454306063.0,466189102.0,478185907.0,490330870.0,502789511.0,
4,Angola,AGO,5357195.0,5441333.0,5521400.0,5599827.0,5673199.0,5736582.0,5787044.0,5827503.0,...,28127721.0,29154746.0,30208628.0,31273533.0,32353588.0,33428486.0,34503774.0,35588987.0,36684202.0,


In [12]:
df_tidy = df.melt(id_vars = ['Country Name', 'Country Code'],
                  var_name = 'Year',
                  value_name = 'Population')
df_tidy

Unnamed: 0,Country Name,Country Code,Year,Population
0,Aruba,ABW,1960,54608.0
1,Africa Eastern and Southern,AFE,1960,130692579.0
2,Afghanistan,AFG,1960,8622466.0
3,Africa Western and Central,AFW,1960,97256290.0
4,Angola,AGO,1960,5357195.0
...,...,...,...,...
17285,Kosovo,XKX,Unnamed: 68,
17286,"Yemen, Rep.",YEM,Unnamed: 68,
17287,South Africa,ZAF,Unnamed: 68,
17288,Zambia,ZMB,Unnamed: 68,


In [19]:
df_tidy[df_tidy.isna().any(axis=1)].head(20) #from this we can see we can remove na as it just means no population data for 
                                             #that year or country name is not classified

Unnamed: 0,Country Name,Country Code,Year,Population
110,Not classified,INX,1960,
196,West Bank and Gaza,PSE,1960,
376,Not classified,INX,1961,
462,West Bank and Gaza,PSE,1961,
642,Not classified,INX,1962,
728,West Bank and Gaza,PSE,1962,
908,Not classified,INX,1963,
994,West Bank and Gaza,PSE,1963,
1174,Not classified,INX,1964,
1260,West Bank and Gaza,PSE,1964,


In [20]:
df_tidy = df_tidy.dropna()
df_tidy

Unnamed: 0,Country Name,Country Code,Year,Population
0,Aruba,ABW,1960,54608.0
1,Africa Eastern and Southern,AFE,1960,130692579.0
2,Afghanistan,AFG,1960,8622466.0
3,Africa Western and Central,AFW,1960,97256290.0
4,Angola,AGO,1960,5357195.0
...,...,...,...,...
17019,Kosovo,XKX,2023,1756374.0
17020,"Yemen, Rep.",YEM,2023,34449825.0
17021,South Africa,ZAF,2023,60414495.0
17022,Zambia,ZMB,2023,20569737.0


In [22]:
df_tidy.to_csv('../data/TidyPopulationWorldBank.csv', index=False)  