In [1]:

# Set up notebook
%pprint
%matplotlib inline
import sys
import os.path as osp, os as os

executable_path = sys.executable
scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts'); assert osp.exists(scripts_folder)
py_folder = osp.abspath(osp.join(os.pardir, 'py')); assert osp.exists(py_folder), "Create the py folder"
ffmpeg_folder = r'C:\ffmpeg\bin'; assert osp.exists(ffmpeg_folder)
shared_folder = osp.abspath(osp.join(os.pardir, 'share')); assert osp.exists(shared_folder)

if (scripts_folder not in sys.path): sys.path.insert(1, scripts_folder)
if (py_folder not in sys.path): sys.path.insert(1, py_folder)
if (ffmpeg_folder not in sys.path): sys.path.insert(1, ffmpeg_folder)
if shared_folder not in sys.path: sys.path.insert(1, shared_folder)

from notebook_utils import NotebookUtilities
nu = NotebookUtilities(
    data_folder_path=osp.abspath(osp.join(os.pardir, 'data')),
    saves_folder_path=osp.abspath(osp.join(os.pardir, 'saves'))
)
nu.delete_ipynb_checkpoint_folders()

from pathlib import Path
root = Path(Path(os.sep).resolve().drive + os.sep)

# Import needed libraries
import re
import pandas as pd
import pyperclip
import ipywidgets as widgets
from IPython.display import display
import inspect

Pretty printing has been turned OFF


In [2]:

# Hand-picked font size for each country, listed from the largest size to the smallest
fontsize_dict = dict([
    ('Iran', 140),
    ('Turkey', 82),
    ('Iraq', 76),
    ('Egypt', 60),
    ('Yemen', 50),
    ('Saudi Arabia', 46),
    ('Oman', 36),
    ('Syria', 36),
    ('Jordan', 18),
    ('Israel', 8),
    ('Kuwait', 8),
    ('United Arab Emirates', 6),
    ('Lebanon', 5),
    ('Qatar', 4),
    ('Bahrain', 1),
])

In [288]:

# DataFrame is used by this cell and by every scraping cell after it, so it is
# imported here together with Index and concat (it was previously missing,
# which raises NameError on a freshly started kernel)
from pandas import DataFrame, Index, concat

# Empty long-format accumulator: one row per (year, country_name) pair
country_populations_df = DataFrame([], columns=['year', 'country_name', 'population'])
wiki_url = 'https://en.wikipedia.org/wiki/'

# NOTE(review): country_areas is not assigned anywhere in the cells visible in
# this notebook, so it must already exist in the kernel - TODO define or load
# it before this cell so that Restart & Run All works
country_areas

[('Saudi Arabia', 331.7788232081368, 34719418.0), ('Iran', 284.01399394254867, 79336000.0), ('Turkey', 120.20356206680844, 83614362.0), ('Egypt', 117.98247035326742, 98154000.0), ('Yemen', 82.57279020407178, 29719300.0), ('Iraq', 81.33432328579566, 40575000.0), ('Oman', 76.64253424717411, 4939200.0), ('Syria', 33.34787577668872, 21377000.0), ('Jordan', 18.1596585328054, 10248069.0), ('United Arab Emirates', 16.627817659608485, 9154000.0), ('Israel', 6.422839435348814, 9100000.0), ('Kuwait', 2.9734908882563205, 4259500.0), ('Lebanon', 2.455264542634866, 6830600.0), ('Qatar', 1.3850950067221495, 2412483.0), ('Bahrain', 0.31085804838628295, 1701000.0)]


### Saudi Arabia

In [289]:

# Pull Saudi Arabia's yearly population figures from its Wikipedia demographics page
country_name = 'Saudi Arabia'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 5 is tied to the page's table order when this was written; confirm it if the page changes
df = nu.get_page_tables(tables_url, verbose=True)[5].copy()

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'unnamed_0': 'year', 'mid_year_population_thousands': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label and store it as an integer
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)

# The source column is expressed in thousands, so remove spaces and scale up by 1000
df.population = df.population.map(lambda s: int(str(s).replace(' ', '')) * 1000).astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)
country_populations_df.sample(5)

[(5, (72, 11)), (3, (12, 11)), (4, (22, 5)), (11, (16, 3)), (1, (15, 3)), (16, (15, 3)), (0, (21, 2)), (14, (8, 2)), (10, (3, 5)), (12, (3, 5)), (2, (5, 2)), (18, (4, 2)), (24, (4, 2)), (25, (4, 2)), (15, (3, 2)), (21, (3, 2)), (23, (3, 2)), (19, (2, 2)), (6, (1, 2)), (7, (1, 2)), (8, (1, 2)), (9, (1, 2)), (13, (1, 2)), (17, (1, 2)), (20, (1, 2)), (22, (1, 2))]


Unnamed: 0,year,country_name,population
33,1983,Saudi Arabia,11746000
63,2013,Saudi Arabia,31482000
0,1950,Saudi Arabia,3090000
66,2016,Saudi Arabia,33416000
20,1970,Saudi Arabia,6106000



### Iran

In [290]:

# Pull Iran's yearly population figures from its Wikipedia demographics page
country_name = 'Iran'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 4 is tied to the page's table order when this was written; confirm it if the page changes
df = nu.get_page_tables(tables_url, verbose=True)[4].copy()

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'unnamed_0': 'year', 'population_on_1_july': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label, then store both columns as integers
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)
df.population = df.population.astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(4, (73, 11)), (5, (65, 9)), (3, (29, 5)), (15, (32, 4)), (13, (22, 5)), (16, (32, 3)), (11, (19, 5)), (18, (5, 13)), (0, (27, 2)), (19, (2, 26)), (1, (8, 6)), (17, (8, 5)), (14, (6, 4)), (10, (3, 4)), (12, (2, 5)), (21, (5, 2)), (24, (4, 2)), (20, (2, 2)), (22, (2, 2)), (23, (2, 2)), (2, (1, 2)), (6, (1, 2)), (7, (1, 2)), (8, (1, 2)), (9, (1, 2))]


Unnamed: 0,year,country_name,population
0,2014,Iran,79962000
1,1950,Saudi Arabia,3090000
2,2015,Saudi Arabia,32750000
3,2007,Saudi Arabia,26400000
4,1976,Saudi Arabia,8320000



### Turkey

In [291]:

# Pull Turkey's yearly population figures from its Wikipedia demographics page
country_name = 'Turkey'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 5 is tied to the page's table order when this was written; confirm it if the page changes
df = nu.get_page_tables(tables_url, verbose=True)[5].copy()

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'unnamed_0': 'year', 'population_31_12': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label, then store both columns as integers
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)
df.population = df.population.astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(16, (93, 19)), (5, (34, 10)), (7, (13, 17)), (8, (13, 17)), (9, (13, 17)), (10, (13, 17)), (31, (44, 5)), (11, (13, 16)), (32, (13, 14)), (33, (36, 5)), (34, (12, 15)), (6, (17, 9)), (27, (24, 5)), (28, (24, 5)), (29, (24, 5)), (30, (24, 5)), (17, (20, 5)), (19, (20, 5)), (21, (20, 5)), (23, (20, 5)), (25, (20, 5)), (13, (9, 9)), (1, (25, 3)), (0, (29, 2)), (35, (7, 7)), (36, (19, 2)), (2, (11, 3)), (3, (11, 3)), (4, (7, 4)), (18, (3, 5)), (20, (3, 5)), (22, (3, 5)), (24, (3, 5)), (26, (3, 5)), (12, (2, 7)), (37, (6, 2)), (42, (4, 2)), (43, (4, 2)), (39, (3, 2)), (41, (3, 2)), (38, (2, 2)), (40, (1, 2)), (14, (1, 1)), (15, (1, 1))]


Unnamed: 0,year,country_name,population
0,2014,Turkey,77695904
1,1973,Iran,30982000
2,1969,Saudi Arabia,5845000
3,1990,Iran,55794000
4,1952,Iran,17614000



### Egypt

In [292]:

# Pull Egypt's yearly population figures from its Wikipedia demographics page
country_name = 'Egypt'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 20 is tied to the page's table order when this was written; confirm it if the page changes
df = nu.get_page_tables(tables_url, verbose=True)[20].copy()

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result;
# str(cn) keeps this working when a header is not a plain string, as the other country cells do
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'unnamed_0': 'year', 'midyear_population': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label, then store both columns as integers
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)
df.population = df.population.astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
# (the mask follows country_name instead of repeating the literal)
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(20, (90, 10)), (25, (28, 7)), (19, (25, 6)), (24, (28, 5)), (9, (18, 5)), (10, (18, 5)), (7, (17, 5)), (8, (17, 5)), (12, (17, 5)), (16, (17, 5)), (6, (16, 5)), (14, (17, 4)), (22, (9, 7)), (27, (2, 30)), (5, (19, 3)), (28, (21, 2)), (0, (20, 2)), (18, (3, 10)), (23, (7, 4)), (13, (4, 6)), (11, (3, 5)), (17, (3, 5)), (26, (3, 5)), (1, (12, 1)), (15, (3, 4)), (21, (3, 4)), (29, (4, 2)), (37, (4, 2)), (38, (4, 2)), (30, (3, 2)), (34, (3, 2)), (36, (3, 2)), (35, (2, 2)), (2, (1, 2)), (3, (1, 2)), (4, (1, 2)), (31, (1, 2)), (32, (1, 2)), (33, (1, 2))]


Unnamed: 0,year,country_name,population
0,2017,Egypt,95203000
1,1953,Iran,18018000
2,2020,Turkey,83614362
3,1962,Saudi Arabia,4459000
4,2021,Saudi Arabia,35950000



### Yemen

In [293]:

# Pull Yemen's yearly population figures from its Wikipedia demographics page
country_name = 'Yemen'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 6 is tied to the page's table order when this was written; confirm it if the page changes.
# The table's last row is dropped - presumably a non-data footer row, TODO confirm
df = nu.get_page_tables(tables_url, verbose=True)[6].copy().iloc[:-1]

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result;
# str(cn) keeps this working when a header is not a plain string, as the other country cells do
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'year_28': 'year', 'population': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label and store it as an integer
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)

# Remove the spaces inside the figures; str(s) also accepts values that were already parsed as numbers
df.population = df.population.map(lambda s: int(str(s).replace(' ', ''))).astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
# (the mask follows country_name instead of repeating the literal)
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(6, (73, 11)), (8, (22, 5)), (9, (22, 5)), (4, (8, 13)), (1, (25, 2)), (2, (15, 3)), (16, (14, 3)), (5, (7, 4)), (10, (7, 4)), (3, (3, 5)), (7, (2, 7)), (15, (4, 2)), (22, (4, 2)), (23, (4, 2)), (17, (2, 2)), (19, (2, 2)), (21, (2, 2)), (24, (2, 2)), (0, (1, 2)), (11, (1, 2)), (12, (1, 2)), (13, (1, 2)), (14, (1, 2)), (18, (1, 2)), (20, (1, 2))]


Unnamed: 0,year,country_name,population
0,1951,Yemen,4783000
1,1991,Saudi Arabia,16654000
2,1960,Iran,21389000
3,1961,Saudi Arabia,4306000
4,2017,Turkey,80810525



### Iraq

In [294]:

# Pull Iraq's population figures from its Wikipedia demographics page
country_name = 'Iraq'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 1 is tied to the page's table order when this was written; confirm it if the page changes.
# The table's last row is dropped - presumably a non-data footer row, TODO confirm
df = nu.get_page_tables(tables_url, verbose=True)[1].copy().iloc[:-1]

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'year': 'year', 'pop': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label and store it as an integer
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)

# Remove the spaces inside the figures before converting them to integers
df.population = df.population.map(lambda s: int(str(s).replace(' ', ''))).astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(3, (73, 10)), (10, (24, 5)), (11, (22, 5)), (1, (22, 3)), (0, (25, 2)), (14, (21, 2)), (9, (7, 4)), (8, (9, 3)), (12, (5, 3)), (15, (4, 2)), (22, (4, 2)), (13, (3, 2)), (16, (3, 2)), (20, (3, 2)), (21, (2, 2)), (4, (1, 2)), (5, (1, 2)), (6, (1, 2)), (7, (1, 2)), (17, (1, 2)), (18, (1, 2)), (19, (1, 2)), (2, (1, 1))]


Unnamed: 0,year,country_name,population
0,1000,Iraq,2000000
1,2017,Yemen,30034000
2,1957,Egypt,25756000
3,2012,Saudi Arabia,30822000
4,1964,Saudi Arabia,4795000



### Oman

In [295]:

# Pull Oman's yearly population figures from its Wikipedia demographics page
country_name = 'Oman'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 11 is tied to the page's table order when this was written; confirm it if the page changes
df = nu.get_page_tables(tables_url, verbose=True)[11].copy()

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result;
# str(cn) keeps this working when a header is not a plain string, as the other country cells do
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'year': 'year', 'population': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label and store it as an integer
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)

# Remove the spaces inside the figures before converting them to integers
df.population = df.population.map(lambda s: int(str(s).replace(' ', ''))).astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(12, (73, 10)), (11, (18, 9)), (7, (26, 5)), (6, (23, 5)), (5, (22, 5)), (4, (15, 5)), (1, (24, 2)), (2, (6, 6)), (3, (9, 3)), (14, (5, 2)), (15, (3, 3)), (16, (4, 2)), (17, (4, 2)), (13, (3, 2)), (0, (1, 2)), (8, (1, 2)), (9, (1, 2)), (10, (1, 2))]


Unnamed: 0,year,country_name,population
0,2016,Oman,4414051
1,2023,Egypt,105174000
2,1992,Iran,59372000
3,1965,Yemen,6097000
4,2015,Egypt,90624000



### Syria

In [296]:

# Pull Syria's yearly population figures from its Wikipedia demographics page
country_name = 'Syria'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 8 is tied to the page's table order when this was written; confirm it if the page changes
df = nu.get_page_tables(tables_url, verbose=True)[8].copy()

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year (the "period" column), then rebuild a
# (year, population) frame without missing values
population_dict = (
    df.rename(columns={'period': 'year', 'population': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label and store it as an integer
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)

# Remove the spaces inside the figures before converting them to integers
df.population = df.population.map(lambda s: int(str(s).replace(' ', ''))).astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(8, (72, 12)), (7, (21, 5)), (4, (22, 3)), (0, (29, 2)), (16, (9, 4)), (13, (15, 2)), (14, (15, 2)), (15, (7, 3)), (1, (10, 2)), (5, (5, 3)), (6, (3, 3)), (17, (4, 2)), (18, (4, 2)), (19, (4, 2)), (2, (1, 2)), (3, (1, 2)), (9, (1, 2)), (10, (1, 2)), (11, (1, 2)), (12, (1, 2))]


Unnamed: 0,year,country_name,population
0,1967,Syria,5723000
1,2016,Turkey,79814871
2,1994,Egypt,56344000
3,1952,Iran,17614000
4,1961,Iran,21984000


In [297]:

# Pull Jordan's yearly population figures from its Wikipedia demographics page
country_name = 'Jordan'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 7 is tied to the page's table order when this was written; confirm it if the page changes
df = nu.get_page_tables(tables_url, verbose=True)[7].copy()

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'unnamed_0': 'year', 'average_population': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label, then store both columns as integers
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)
df.population = df.population.astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(7, (73, 10)), (4, (27, 5)), (5, (25, 5)), (6, (25, 5)), (11, (13, 9)), (8, (10, 7)), (0, (27, 2)), (3, (12, 3)), (2, (7, 4)), (13, (13, 2)), (1, (4, 5)), (12, (4, 2)), (18, (4, 2)), (9, (1, 4)), (10, (1, 4)), (17, (2, 2)), (14, (1, 2)), (15, (1, 2)), (16, (1, 2))]


Unnamed: 0,year,country_name,population
0,1961,Jordan,900800
1,1500,Iraq,1000000
2,1943,Egypt,17842000
3,1945,Egypt,18498000
4,1962,Egypt,29591000



### Jordan


### United Arab Emirates

In [298]:

# Pull the United Arab Emirates' yearly population figures from its Wikipedia
# demographics page (this page title carries a "the_" before the country name)
country_name = 'United Arab Emirates'
tables_url = wiki_url + 'Demographics_of_the_' + country_name.replace(' ', '_')
# NOTE(review): position 8 is tied to the page's table order when this was written; confirm it if the page changes
df = nu.get_page_tables(tables_url, verbose=True)[8].copy()

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'year': 'year', 'population': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label, then store both columns as integers
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)
df.population = df.population.astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(8, (48, 9)), (12, (97, 4)), (6, (24, 5)), (7, (13, 9)), (3, (27, 2)), (5, (8, 6)), (16, (23, 2)), (4, (15, 3)), (13, (19, 2)), (11, (7, 4)), (19, (10, 2)), (20, (9, 2)), (21, (6, 2)), (14, (4, 2)), (15, (4, 2)), (0, (3, 2)), (23, (3, 2)), (17, (2, 2)), (24, (2, 2)), (1, (1, 2)), (2, (1, 2)), (9, (1, 2)), (10, (1, 2)), (18, (1, 2)), (22, (1, 2))]


Unnamed: 0,year,country_name,population
0,2020,United Arab Emirates,9282410
1,1948,Egypt,19529000
2,2020,Yemen,32284000
3,2002,Jordan,5038000
4,2011,Turkey,74724269



### Israel

In [299]:

# Pull Israel's yearly population figures from its Wikipedia demographics page
country_name = 'Israel'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 18 is tied to the page's table order when this was written; confirm it if the page changes
df = nu.get_page_tables(tables_url, verbose=True)[18].copy()

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'year': 'year', 'population': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label, then store both columns as integers
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)
df.population = df.population.astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(18, (74, 10)), (12, (53, 6)), (4, (21, 12)), (13, (31, 7)), (2, (16, 13)), (3, (24, 7)), (7, (26, 5)), (10, (11, 10)), (20, (20, 4)), (0, (30, 2)), (11, (4, 8)), (1, (10, 3)), (6, (7, 4)), (5, (2, 12)), (22, (12, 2)), (16, (5, 3)), (21, (7, 2)), (8, (3, 4)), (19, (3, 4)), (27, (4, 2)), (26, (2, 2)), (9, (1, 3)), (14, (1, 2)), (15, (1, 2)), (17, (1, 2)), (23, (1, 2)), (24, (1, 2)), (25, (1, 2))]


Unnamed: 0,year,country_name,population
0,1969,Israel,2930000
1,1941,Egypt,17208000
2,2007,Iran,72319000
3,1986,Iran,48913000
4,1952,Yemen,4856000



### Kuwait

In [300]:

# Pull Kuwait's yearly population figures from its Wikipedia demographics page
country_name = 'Kuwait'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 5 is tied to the page's table order when this was written; confirm it if the page changes
df = nu.get_page_tables(tables_url, verbose=True)[5].copy()

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'unnamed_0': 'year', 'average_population': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label, then store both columns as integers
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)
df.population = df.population.astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(5, (67, 9)), (10, (62, 4)), (4, (15, 9)), (8, (22, 5)), (1, (24, 2)), (3, (5, 7)), (14, (10, 3)), (9, (7, 4)), (2, (9, 3)), (11, (5, 3)), (19, (4, 2)), (20, (4, 2)), (12, (1, 3)), (13, (1, 3)), (0, (1, 2)), (6, (1, 2)), (7, (1, 2)), (15, (1, 2)), (16, (1, 2)), (17, (1, 2)), (18, (1, 2))]


Unnamed: 0,year,country_name,population
0,1996,Kuwait,1628000
1,2023,Turkey,85372277
2,1935,Egypt,15624000
3,1978,Egypt,43006000
4,1977,Syria,8029000



### Lebanon

In [301]:

# Pull Lebanon's yearly population figures from its Wikipedia demographics page
country_name = 'Lebanon'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 6 is tied to the page's table order when this was written; confirm it if the page changes
df = nu.get_page_tables(tables_url, verbose=True)[6].copy()

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'unnamed_0': 'year', 'mid_year_population_thousands': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label and store it as an integer
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)

# The source column is expressed in thousands, so remove spaces and scale up by 1000
df.population = df.population.map(lambda s: int(str(s).replace(' ', '')) * 1000).astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(6, (72, 12)), (2, (16, 21)), (7, (34, 9)), (4, (17, 6)), (1, (24, 3)), (0, (30, 2)), (5, (9, 3)), (14, (6, 2)), (12, (4, 2)), (13, (4, 2)), (3, (1, 2)), (8, (1, 2)), (9, (1, 2)), (10, (1, 2)), (11, (1, 2)), (15, (1, 2)), (16, (1, 2)), (17, (1, 2))]


Unnamed: 0,year,country_name,population
0,2011,Lebanon,5045000
1,1985,United Arab Emirates,1391000
2,2004,Israel,6870000
3,1985,Iran,47266000
4,1954,Yemen,5011000



### Qatar

In [302]:

# Pull Qatar's yearly population figures from its Wikipedia demographics page
country_name = 'Qatar'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 5 is tied to the page's table order when this was written; confirm it if the page changes.
# The table's last row is dropped - presumably a non-data footer row, TODO confirm
df = nu.get_page_tables(tables_url, verbose=True)[5].copy().iloc[:-1]

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year, then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'unnamed_0': 'year', 'average_population': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label, then store both columns as integers
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)
df.population = df.population.astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(5, (55, 9)), (4, (14, 9)), (6, (22, 5)), (0, (24, 2)), (3, (15, 3)), (7, (8, 4)), (1, (8, 3)), (2, (8, 3)), (10, (11, 2)), (8, (6, 3)), (15, (4, 2)), (9, (2, 3)), (14, (2, 2)), (11, (1, 2)), (12, (1, 2)), (13, (1, 2))]


Unnamed: 0,year,country_name,population
0,2011,Qatar,1733000
1,1970,Kuwait,753000
2,2011,Iran,76343000
3,1951,Egypt,22020000
4,1965,Syria,5368000



### Bahrain

In [303]:

# Pull Bahrain's yearly population figures from its Wikipedia demographics page
country_name = 'Bahrain'
tables_url = wiki_url + 'Demographics_of_' + country_name.replace(' ', '_')
# NOTE(review): position 9 is tied to the page's table order when this was written; confirm it if the page changes
df = nu.get_page_tables(tables_url, verbose=True)[9].copy()

# Collapse each run of non-alphanumeric header characters to "_" and lower-case the result
df.columns = [re.sub("[^A-Za-z0-9]+", "_", str(cn)).lower().strip('_') for cn in df.columns]

# Key the populations by year (the normalized year header here is "19_20_21"),
# then rebuild a (year, population) frame without missing values
population_dict = (
    df.rename(columns={'19_20_21': 'year', 'average_population': 'population'})
    .set_index('year')['population']
    .to_dict()
)
df = (
    DataFrame(population_dict, index=Index(['population']))
    .T.reset_index(drop=False)
    .rename(columns={'index': 'year'})
    .dropna(subset='population')
)
df['country_name'] = country_name

# Keep only the text before any "[" in the year label, then store both columns as integers
df.year = df.year.map(lambda s: int(str(s).split('[')[0].strip())).astype(int)
df.population = df.population.astype(int)

# Replace, rather than append to, any rows left by an earlier run of this cell:
# duplicated (year, country_name) pairs would make a later pivot on them fail
country_populations_df = concat(
    [country_populations_df[country_populations_df['country_name'] != country_name], df]
)

# Preview one row for this country next to four rows from the others
mask_series = (country_populations_df['country_name'] == country_name)
concat([country_populations_df[mask_series].sample(1), country_populations_df[~mask_series].sample(4)], ignore_index=True)

[(9, (58, 9)), (8, (13, 9)), (10, (23, 5)), (7, (21, 5)), (6, (10, 6)), (1, (21, 2)), (2, (8, 5)), (12, (5, 6)), (11, (7, 4)), (5, (9, 3)), (4, (8, 3)), (14, (11, 2)), (3, (8, 2)), (19, (5, 2)), (20, (4, 2)), (21, (4, 2)), (13, (3, 2)), (18, (2, 2)), (0, (1, 2)), (15, (1, 2)), (16, (1, 2)), (17, (1, 2))]


Unnamed: 0,year,country_name,population
0,1991,Bahrain,503052
1,2010,Jordan,6594000
2,1981,Qatar,246000
3,1962,Yemen,5753000
4,2015,Turkey,78741053


In [304]:

# Reshape the long table to one row per year and one column per country
wide_populations_df = country_populations_df.pivot(
    index='year', columns='country_name', values='population'
)

# Coerce every column to numeric (unparseable entries become missing), then
# store the result with the nullable integer dtype
country_populations_by_year_df = wide_populations_df.apply(pd.to_numeric, errors='coerce').astype('Int64')

In [339]:

# Move 'year' out of the index so it can serve as the merge key
df = country_populations_by_year_df.reset_index()

# Left-join the scraped figures onto a gap-free run of years that starts in 1922
all_years_df = DataFrame({'year': range(1922, df.year.max() + 1)})
interpolated_years_df = pd.merge(all_years_df, df, on='year', how='left')

# Hand-entered anchor points, written as (country, year, population)
anchor_points = [
    # The British Political Agent's estimate of Bahrain's population
    ('Bahrain', 1925, 79_000),
    # The estimated population of Egypt based on historical trends
    ('Egypt', 1922, 12_000_000),
    # The estimated population of Iran, considering the devastating effects of the 1917-1919 famine
    ('Iran', 1922, 10_000_000),
    # The estimated population of Iraq extrapolated from the 1920 estimate
    ('Iraq', 1922, 2_800_000),
    # The 1922 Census of Palestine conducted by the Mandatory government
    ('Israel', 1922, 757_182),
    # The estimated population of Transjordan
    ('Jordan', 1922, 225_000),
    # The estimated population of Kuwait and the 1957 census
    ('Kuwait', 1922, 35_000),
    ('Kuwait', 1957, 206_473),
    # The resident population of Lebanon from the 1932 French mandate census
    ('Lebanon', 1932, 861_399),
    # The population estimate for Oman and the 1950 estimate
    ('Oman', 1922, 299_000),
    ('Oman', 1950, 413_000),
    # The estimated population of Syria from the French mandate and 1947 census
    ('Syria', 1922, 1_500_000),
    ('Syria', 1947, 4_500_000),
    # The population of Turkey from the 1927 census
    ('Turkey', 1927, 13_600_000),
    # The estimated population of the United Arab Emirates
    ('United Arab Emirates', 1922, 180_000),
    # The estimated population of Yemen
    ('Yemen', 1922, 1_500_000),
]

# Write each anchor into the row for its year, in the order listed above
for anchor_country, anchor_year, anchor_population in anchor_points:
    mask_series = (interpolated_years_df.year == anchor_year)
    interpolated_years_df.loc[mask_series, anchor_country] = anchor_population

# Fill every remaining gap with a PCHIP curve in both directions, cast back to
# nullable integers, clamp negative values to zero, and index the table by year.
# NOTE(review): Qatar and Saudi Arabia have no anchor in the list above, so
# their earliest years appear to come from extrapolating beyond the scraped
# data - the ratios displayed further down put Qatar's 1922 figure above its
# 2024 figure, which looks implausible; consider adding anchors for them.
interpolated_years_df = (
    interpolated_years_df
    .interpolate(method='pchip', limit_direction='both')
    .astype('Int64')
    .map(lambda x: max(x, 0))
    .set_index('year')
)

In [347]:

# Pickle the interpolated table through the notebook utilities
nu.store_objects(interpolated_years_df=interpolated_years_df)

# Show every year's population as a fraction of the same country's 2024 value
interpolated_years_df.div(interpolated_years_df.loc[2024], axis='columns')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\interpolated_years_df.pkl


Unnamed: 0_level_0,Bahrain,Egypt,Iran,Iraq,Israel,Jordan,Kuwait,Lebanon,Oman,Qatar,Saudi Arabia,Syria,Turkey,United Arab Emirates,Yemen
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1922,0.050112,0.112393,0.111376,0.065091,0.076283,0.019236,0.007124,0.132991,0.057779,1.114913,0.028014,0.068159,0.131562,0.016322,0.043075
1923,0.049898,0.115962,0.112635,0.066825,0.076330,0.019282,0.007153,0.132693,0.057816,1.050602,0.029828,0.083872,0.136981,0.016329,0.047558
1924,0.049770,0.119345,0.114046,0.068632,0.076476,0.019420,0.007240,0.132781,0.057927,0.988768,0.031686,0.098328,0.142493,0.016351,0.051934
1925,0.049727,0.122551,0.115604,0.070512,0.076728,0.019647,0.007385,0.133243,0.058108,0.929368,0.033587,0.111581,0.148095,0.016388,0.056207
1926,0.049770,0.125587,0.117305,0.072463,0.077092,0.019961,0.007589,0.134066,0.058359,0.872354,0.035530,0.123681,0.153787,0.016441,0.060379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020,0.926690,0.942269,0.972202,0.935048,0.928369,0.923834,0.908666,0.958910,0.865918,0.904829,1.091182,0.943916,0.981045,0.841710,0.927091
2021,0.946934,0.955916,0.979253,0.953302,0.947007,0.945293,0.882510,0.947057,0.874885,0.877372,1.089757,0.968953,0.993552,0.894176,0.947135
2022,0.959729,0.970377,0.986236,0.970262,0.973342,0.966238,0.892627,0.945652,0.953419,0.936119,1.079784,0.987786,1.000583,0.932980,0.966029
2023,0.992691,0.985072,0.993152,0.985853,0.991506,0.984534,0.975637,0.961148,0.998203,0.977944,1.052715,0.998705,1.001671,0.968310,0.983681
