# 01 | Collect the Data
# Introduction
In this notebook, we will explore ways to collect our data more automatically. This process will be reproduced as .py files later on.

# Goals:
Targeted objectives in this notebook ar checked as follows:

- [x] Import the raw data
  -  [x] SIPRI dataset and capitals coordinates
- [x] Store the raw data
- Prepare the data
  - Clean each individual table
  - Store the transformed dataset
- Combine both datasets
  - Check if the merging column values match
- Store the final output

# Set up our working environment

In [9]:
# Import required libraries
import pandas as pd
import os

In [16]:
# Create directory folders to store our data
dirname = os.getcwd()

raw_data = f"{dirname}/data/raw/"
transformed_data = f"{dirname}/data/transformed/"
refined_data = f"{dirname}/data/refined/"

paths = [raw_data, transformed_data, refined_data]

for path in paths:
    if not os.path.exists(path):
        os.makedirs(path)


# Import and store raw data
## SIPRI dataset
Let us first import the dataset.

In [2]:
# Define SIPRI URL for military expenditure data
# This URL points to the SIPRI website where the data is hosted
sipri_url = "https://www.sipri.org/sites/default/files/SIPRI-Milex-data-1948-2023.xlsx"

We can now store it accordingly at a `raw` folder as it is.

In [22]:
# Read the data
sipri_raw = pd.read_excel(sipri_url, 5)

In [23]:
sipri_raw.to_excel(f"{raw_data}sipri_data_raw", engine="openpyxl")

## City Capitals Coordinates
We can now import the secondary dataset and store it as a raw file as well.

In [24]:
# Read list of capitals from url
capitals_coords_url = "https://gist.githubusercontent.com/ofou/df09a6834a8421b4f376c875194915c9/raw/355eb56e164ddc3cd1a9467c524422cb674e71a9/country-capital-lat-long-population.csv"
capitals_coords_raw = pd.read_csv(capitals_coords_url)

We can now also store it as a raw `.csv` file.

In [25]:
capitals_coords_raw.to_csv(f"{raw_data}city_capitals_coords.csv", index=False)

Challenges:
- Skip unwanted, descriptive rows.
- Get only the sheets into which we are interested.
- Think on ways to concatenate values.
- Remove region names or pivot them as a new column.

In [7]:
sipri_raw_data = pd.read_excel(
    sipri_url, 5,skiprows=5)

In [8]:
sipri_raw_data.head(10)

Unnamed: 0,Country,Notes,1948,1949,1950,1951,1952,1953,1954,1955,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,,,,,,,,,,,...,,,,,,,,,,
1,Africa,,,,,,,,,,...,,,,,,,,,,
2,North Africa,,,,,,,,,,...,,,,,,,,,,
3,Algeria,§,...,...,...,...,...,...,...,...,...,9724.379972,10412.714003,10217.0817,10073.364021,9583.724288,10303.600575,9708.27744,9112.461105,9145.810174,18263.967968
4,Libya,‡§¶,...,...,...,...,...,...,...,...,...,3755.652496,...,...,...,...,...,...,...,...,...
5,Morocco,§,...,...,...,...,...,...,...,...,...,4048.612524,3268.363376,3327.03189,3461.461531,3696.856945,3721.323837,4830.956394,5378.366535,4995.028074,5184.9284
6,Tunisia,,...,...,...,...,...,...,...,...,...,908.359963,979.30683,987.734705,858.949581,844.227367,1000.922131,1157.372367,1250.45438,1156.186916,1208.20419
7,sub-Saharan Africa,,,,,,,,,,...,,,,,,,,,,
8,Angola,§‖,...,...,...,...,...,...,...,...,...,6846.249313,3608.299115,2764.054937,3062.872914,1983.613748,1470.938717,993.594405,981.451012,1622.763732,1270.158265
9,Benin,§,...,...,...,...,...,...,...,...,...,92.990706,90.896086,79.581568,116.142782,90.212426,68.11239,71.817818,97.424489,110.01037,140.776011


In [7]:
columns_to_drop = data_skiprows.columns[1]
data = data_skiprows.drop(columns=columns_to_drop)

In [8]:
data

Unnamed: 0,Country,1948,1949,1950,1951,1952,1953,1954,1955,1956,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,,,,,,,,,,,...,,,,,,,,,,
1,Africa,,,,,,,,,,...,,,,,,,,,,
2,North Africa,,,,,,,,,,...,,,,,,,,,,
3,Algeria,...,...,...,...,...,...,...,...,...,...,9724.379972,10412.714003,10217.0817,10073.364021,9583.724288,10303.600575,9708.27744,9112.461105,9145.810174,18263.967968
4,Libya,...,...,...,...,...,...,...,...,...,...,3755.652496,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,Syria,...,...,...,...,...,...,...,...,55.586592,...,...,...,...,...,...,...,...,...,...,...
189,Türkiye,...,197.68186,212.970206,231.81398,257.768613,294.03399,332.077082,382.919718,412.074237,...,17576.538471,15668.75,17827.702151,17822.738263,19648.693824,20436.917121,17478.413685,15567.410029,10779.896285,15827.853255
190,United Arab Emirates,...,...,...,...,...,...,...,...,...,...,22755.071477,...,...,...,...,...,...,...,...,...
191,"Yemen, North",...,...,...,...,...,...,...,...,...,...,xxx,xxx,xxx,xxx,xxx,xxx,xxx,xxx,xxx,xxx


In [9]:
columns_to_pivot = data.columns[1:]
data_pivoted = data.melt(id_vars='Country', value_vars=columns_to_pivot, var_name='Year', value_name='military_expenditures_usd')

In [10]:
data_pivoted.loc[data_pivoted['Country'] == 'France']

Unnamed: 0,Country,Year,military_expenditures_usd
160,France,1948,...
353,France,1949,1211.32224
546,France,1950,1342.960581
739,France,1951,2114.457981
932,France,1952,3007.074608
...,...,...,...
13863,France,2019,50118.929212
14056,France,2020,52747.064858
14249,France,2021,56646.996216
14442,France,2022,53638.748769


In [11]:
data_pivoted["Country"].unique()

array([nan, 'Africa', 'North Africa', 'Algeria', 'Libya', 'Morocco',
       'Tunisia', 'sub-Saharan Africa', 'Angola', 'Benin', 'Botswana',
       'Burkina Faso', 'Burundi', 'Cameroon', 'Cape Verde',
       'Central African Republic', 'Chad', 'Congo, DR', 'Congo, Republic',
       "Cote d'Ivoire", 'Djibouti', 'Equatorial Guinea', 'Eritrea',
       'Ethiopia', 'Gabon', 'Gambia, The', 'Ghana', 'Guinea',
       'Guinea-Bissau', 'Kenya', 'Lesotho', 'Liberia', 'Madagascar',
       'Malawi', 'Mali', 'Mauritania', 'Mauritius', 'Mozambique',
       'Namibia', 'Niger', 'Nigeria', 'Rwanda', 'Senegal', 'Seychelles',
       'Sierra Leone', 'Somalia', 'South Africa', 'South Sudan', 'Sudan',
       'Eswatini', 'Tanzania', 'Togo', 'Uganda', 'Zambia', 'Zimbabwe',
       'Americas', 'Central America and the Caribbean', 'Belize',
       'Costa Rica', 'Cuba', 'Dominican Republic', 'El Salvador',
       'Guatemala', 'Haiti', 'Honduras', 'Jamaica', 'Mexico', 'Nicaragua',
       'Panama', 'Trinidad and Toba

In [12]:
data_cleaned = data_pivoted.dropna(subset=['military_expenditures_usd'])
data_cleaned

Unnamed: 0,Country,Year,military_expenditures_usd
3,Algeria,1948,...
4,Libya,1948,...
5,Morocco,1948,...
6,Tunisia,1948,...
8,Angola,1948,...
...,...,...,...
14663,Syria,2023,...
14664,Türkiye,2023,15827.853255
14665,United Arab Emirates,2023,...
14666,"Yemen, North",2023,xxx


In [13]:
data_cleaned["Country"].unique()

array(['Algeria', 'Libya', 'Morocco', 'Tunisia', 'Angola', 'Benin',
       'Botswana', 'Burkina Faso', 'Burundi', 'Cameroon', 'Cape Verde',
       'Central African Republic', 'Chad', 'Congo, DR', 'Congo, Republic',
       "Cote d'Ivoire", 'Djibouti', 'Equatorial Guinea', 'Eritrea',
       'Ethiopia', 'Gabon', 'Gambia, The', 'Ghana', 'Guinea',
       'Guinea-Bissau', 'Kenya', 'Lesotho', 'Liberia', 'Madagascar',
       'Malawi', 'Mali', 'Mauritania', 'Mauritius', 'Mozambique',
       'Namibia', 'Niger', 'Nigeria', 'Rwanda', 'Senegal', 'Seychelles',
       'Sierra Leone', 'Somalia', 'South Africa', 'South Sudan', 'Sudan',
       'Eswatini', 'Tanzania', 'Togo', 'Uganda', 'Zambia', 'Zimbabwe',
       'Belize', 'Costa Rica', 'Cuba', 'Dominican Republic',
       'El Salvador', 'Guatemala', 'Haiti', 'Honduras', 'Jamaica',
       'Mexico', 'Nicaragua', 'Panama', 'Trinidad and Tobago', 'Canada',
       'United States of America', 'Argentina', 'Bolivia', 'Brazil',
       'Chile', 'Colombia', 'Ecu

In [14]:
# Get years from 1990 to 2024
years = data_cleaned['Year'].astype(int)
years = years[(years >= 1990) & (years <= 2024)]
years = years.unique()
data_cleaned = data_cleaned[data_cleaned['Year'].isin(years)]

In [15]:
data_cleaned.loc[data_cleaned['Country'] == 'France']

Unnamed: 0,Country,Year,military_expenditures_usd
8266,France,1990,35774.430325
8459,France,1991,35869.120915
8652,France,1992,37902.269821
8845,France,1993,35775.273293
9038,France,1994,37288.60606
9231,France,1995,40124.024138
9424,France,1996,38977.734096
9617,France,1997,34697.904099
9810,France,1998,33633.561283
10003,France,1999,32672.714285


## Data on % of GDP

In [16]:
# read the data
raw_data_pct_gdp = pd.read_excel(url, 6, skiprows=5)

In [17]:
raw_data_pct_gdp.head(10)

Unnamed: 0,Country,Notes,1948,1949,1950,1951,1952,1953,1954,1955,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,,,,,,,,,,,...,,,,,,,,,,
1,Africa,,,,,,,,,,...,,,,,,,,,,
2,North Africa,,,,,,,,,,...,,,,,,,,,,
3,Algeria,§,...,...,...,...,...,...,...,...,...,0.055485,0.062702,0.063849,0.059244,0.054836,0.059999,0.066587,0.055708,0.04695,0.081717
4,Libya,‡§¶,...,...,...,...,...,...,...,...,...,0.154796,...,...,...,...,...,...,...,...,...
5,Morocco,§,...,...,...,...,...,...,...,...,...,0.033985,0.0296,0.029821,0.029459,0.029029,0.028658,0.039797,0.037552,0.038146,0.036387
6,Tunisia,,...,...,...,...,...,...,...,...,...,0.018069,0.021396,0.02225,0.020318,0.019764,0.023833,0.027098,0.026367,0.024734,0.023579
7,sub-Saharan Africa,,,,,,,,,,...,,,,,,,,,,
8,Angola,§‖,...,...,...,...,...,...,...,...,...,0.046985,0.031054,0.027333,0.02508,0.019558,0.017391,0.01737,0.01301,0.013168,0.013325
9,Benin,§,...,...,...,...,...,...,...,...,...,0.007009,0.007985,0.006736,0.009206,0.006329,0.004733,0.004588,0.005509,0.006321,0.007112


In [18]:
# Drop Notes column
columns_to_drop = raw_data_pct_gdp.columns[1]
data_pct_gdp = raw_data_pct_gdp.drop(columns=columns_to_drop)

# Melt the data
columns_to_pivot = data_pct_gdp.columns[1:]
data_pct_gdp_pivoted = data_pct_gdp.melt(id_vars='Country', value_vars=columns_to_pivot, var_name='Year', value_name='military_expenditures_pct_gdp')

# Drop rows with missing values
data_pct_gdp_cleaned = data_pct_gdp_pivoted.dropna(subset=['military_expenditures_pct_gdp'])

# Get years from 1990 to 2024
years = data_pct_gdp_cleaned['Year'].astype(int)
years = years[(years >= 1990) & (years <= 2024)]
years = years.unique()
data_pct_gdp_cleaned = data_pct_gdp_cleaned[data_pct_gdp_cleaned['Year'].isin(years)]
data_pct_gdp_cleaned


Unnamed: 0,Country,Year,military_expenditures_pct_gdp
8109,Algeria,1990,0.014611
8110,Libya,1990,...
8111,Morocco,1990,0.041488
8112,Tunisia,1990,0.018395
8114,Angola,1990,0.174637
...,...,...,...
14663,Syria,2023,...
14664,Türkiye,2023,0.015027
14665,United Arab Emirates,2023,...
14666,"Yemen, North",2023,xxx


In [19]:
data_pct_gdp_cleaned.loc[data_pct_gdp_cleaned['Country'] == 'France']

Unnamed: 0,Country,Year,military_expenditures_pct_gdp
8266,France,1990,0.028053
8459,France,1991,0.028121
8652,France,1992,0.026907
8845,France,1993,0.026895
9038,France,1994,0.026604
9231,France,1995,0.024925
9424,France,1996,0.024145
9617,France,1997,0.023754
9810,France,1998,0.022262
10003,France,1999,0.021778


## Get % of govt spending

In [20]:
# read the data
raw_data_pct_govt = pd.read_excel(url, 8, skiprows=7)

In [21]:
raw_data_pct_govt.head(10)

Unnamed: 0,Country,Notes,Reporting year,1988,1989,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,,,,,,,,,,,...,,,,,,,,,,
1,Africa,,,,,,,,,,...,,,,,,,,,,
2,North Africa,,,,,,,,,,...,,,,,,,,,,
3,Algeria,§,,...,...,0.057841,0.040441,0.069904,0.070861,0.093684,...,0.134377,0.135656,0.152039,0.146409,0.135999,0.143587,0.15694,0.15025,0.126723,0.193357
4,Libya,‡§¶,,...,...,...,...,...,...,...,...,0.109068,...,...,...,...,...,...,...,...,...
5,Morocco,§,,...,...,0.145002,0.158037,0.15113,0.158109,0.162983,...,0.110783,0.104176,0.104448,0.105925,0.104896,0.104736,0.116552,0.120053,0.118359,0.11122
6,Tunisia,,,...,...,...,0.063269,0.063291,0.061916,0.064018,...,0.065263,0.07796,0.08176,0.070733,0.068902,0.080702,0.078555,0.079197,0.070461,0.070458
7,sub-Saharan Africa,,,,,,,,,,...,,,,,,,,,,
8,Angola,§‖,,...,...,...,...,...,...,...,...,0.128877,0.114798,0.124,0.104141,0.095039,0.085236,0.074624,0.066798,0.058413,0.055282
9,Benin,§,,...,0.104652,0.092393,...,...,...,...,...,0.049276,0.043987,0.043661,0.051763,0.038229,0.032416,0.024055,0.027745,0.031821,0.03754


In [22]:
# Drop Columns
columns_to_drop = raw_data_pct_govt.columns[1:3]
data_pct_govt = raw_data_pct_govt.drop(columns=columns_to_drop)

# Melt the data
columns_to_pivot = data_pct_govt.columns[1:]
data_pct_govt_pivoted = data_pct_govt.melt(id_vars='Country', value_vars=columns_to_pivot, var_name='Year', value_name='military_expenditures_pct_govt')

# Drop rows with missing values
data_pct_govt_cleaned = data_pct_govt_pivoted.dropna(subset=['military_expenditures_pct_govt'])

# Get years from 1990 to 2024
years = data_pct_govt_cleaned['Year'].astype(int)
years = years[(years >= 1990) & (years <= 2024)]
years = years.unique()
data_pct_govt_cleaned = data_pct_govt_cleaned[data_pct_govt_cleaned['Year'].isin(years)]
data_pct_govt_cleaned



Unnamed: 0,Country,Year,military_expenditures_pct_govt
389,Algeria,1990,0.057841
390,Libya,1990,...
391,Morocco,1990,0.145002
392,Tunisia,1990,...
394,Angola,1990,...
...,...,...,...
6943,Syria,2023,...
6944,Türkiye,2023,...
6945,United Arab Emirates,2023,...
6946,"Yemen, North",2023,xxx


In [23]:
data_pct_govt_cleaned.loc[data_pct_govt_cleaned['Country'] == 'Ukraine']

Unnamed: 0,Country,Year,military_expenditures_pct_govt
538,Ukraine,1990,...
731,Ukraine,1991,...
924,Ukraine,1992,...
1117,Ukraine,1993,...
1310,Ukraine,1994,...
1503,Ukraine,1995,0.066237
1696,Ukraine,1996,0.08616
1889,Ukraine,1997,0.097709
2082,Ukraine,1998,0.087474
2275,Ukraine,1999,0.111554


In [24]:
data_pct_govt.head(10)

Unnamed: 0,Country,1988,1989,1990,1991,1992,1993,1994,1995,1996,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,,,,,,,,,,,...,,,,,,,,,,
1,Africa,,,,,,,,,,...,,,,,,,,,,
2,North Africa,,,,,,,,,,...,,,,,,,,,,
3,Algeria,...,...,0.057841,0.040441,0.069904,0.070861,0.093684,0.093543,0.106039,...,0.134377,0.135656,0.152039,0.146409,0.135999,0.143587,0.15694,0.15025,0.126723,0.193357
4,Libya,...,...,...,...,...,...,...,...,...,...,0.109068,...,...,...,...,...,...,...,...,...
5,Morocco,...,...,0.145002,0.158037,0.15113,0.158109,0.162983,0.16004,0.17614,...,0.110783,0.104176,0.104448,0.105925,0.104896,0.104736,0.116552,0.120053,0.118359,0.11122
6,Tunisia,...,...,...,0.063269,0.063291,0.061916,0.064018,0.062582,0.066741,...,0.065263,0.07796,0.08176,0.070733,0.068902,0.080702,0.078555,0.079197,0.070461,0.070458
7,sub-Saharan Africa,,,,,,,,,,...,,,,,,,,,,
8,Angola,...,...,...,...,...,...,...,...,0.060329,...,0.128877,0.114798,0.124,0.104141,0.095039,0.085236,0.074624,0.066798,0.058413,0.055282
9,Benin,...,0.104652,0.092393,...,...,...,...,...,...,...,0.049276,0.043987,0.043661,0.051763,0.038229,0.032416,0.024055,0.027745,0.031821,0.03754


In [25]:
# Join the three datasets
data_final = data_cleaned.merge(data_pct_gdp_cleaned, on=['Country', 'Year'])
data_final = data_final.merge(data_pct_govt_cleaned, on=['Country', 'Year'])
data_final

Unnamed: 0,Country,Year,military_expenditures_usd,military_expenditures_pct_gdp,military_expenditures_pct_govt
0,Algeria,1990,904.269155,0.014611,0.057841
1,Libya,1990,...,...,...
2,Morocco,1990,1069.720492,0.041488,0.145002
3,Tunisia,1990,247.855882,0.018395,...
4,Angola,1990,1751.153152,0.174637,...
...,...,...,...,...,...
5911,Syria,2023,...,...,...
5912,Türkiye,2023,15827.853255,0.015027,...
5913,United Arab Emirates,2023,...,...,...
5914,"Yemen, North",2023,xxx,xxx,xxx


In [26]:
# Replace all non numeric values with NaN
data_final = data_final.replace(['xxx', '...'], pd.NA)


  data_final = data_final.replace(['xxx', '...'], pd.NA)


In [27]:
data_final

Unnamed: 0,Country,Year,military_expenditures_usd,military_expenditures_pct_gdp,military_expenditures_pct_govt
0,Algeria,1990,904.269155,0.014611,0.057841
1,Libya,1990,,,
2,Morocco,1990,1069.720492,0.041488,0.145002
3,Tunisia,1990,247.855882,0.018395,
4,Angola,1990,1751.153152,0.174637,
...,...,...,...,...,...
5911,Syria,2023,,,
5912,Türkiye,2023,15827.853255,0.015027,
5913,United Arab Emirates,2023,,,
5914,"Yemen, North",2023,,,


In [28]:
data_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5916 entries, 0 to 5915
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Country                         5916 non-null   object
 1   Year                            5916 non-null   int64 
 2   military_expenditures_usd       5140 non-null   object
 3   military_expenditures_pct_gdp   4963 non-null   object
 4   military_expenditures_pct_govt  4472 non-null   object
dtypes: int64(1), object(4)
memory usage: 231.2+ KB


In [29]:
# Convert columns to numeric
data_final['military_expenditures_usd'] = pd.to_numeric(data_final['military_expenditures_usd'])
data_final['military_expenditures_pct_gdp'] = pd.to_numeric(data_final['military_expenditures_pct_gdp'])
data_final['military_expenditures_pct_govt'] = pd.to_numeric(data_final['military_expenditures_pct_govt'])
data_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5916 entries, 0 to 5915
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Country                         5916 non-null   object 
 1   Year                            5916 non-null   int64  
 2   military_expenditures_usd       5140 non-null   float64
 3   military_expenditures_pct_gdp   4963 non-null   float64
 4   military_expenditures_pct_govt  4472 non-null   float64
dtypes: float64(3), int64(1), object(1)
memory usage: 231.2+ KB


In [38]:
# Find if data contains any country that starts with T
data_final[data_final['Country'].str.startswith('T')]

Unnamed: 0,Country,Year,military_expenditures_usd,military_expenditures_pct_gdp,military_expenditures_pct_govt
3,Tunisia,1990,247.855882,0.018395,
46,Tanzania,1990,87.154458,0.020465,
47,Togo,1990,50.748352,0.031225,0.108662
63,Trinidad and Tobago,1990,,,
92,Taiwan,1990,8701.148998,0.052229,0.184107
...,...,...,...,...,...
5843,Thailand,2023,5765.771787,0.011656,0.049164
5844,Timor Leste,2023,55.111000,0.012619,0.032116
5848,Tajikistan,2023,139.525232,0.012240,0.037797
5849,Turkmenistan,2023,,,


In [39]:
# Get only countries from europe
europe = ['Albania', 'Andorra', 'Austria', 'Belarus', 'Belgium', 'Bosnia and Herzegovina', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Kosovo', 'Latvia', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Malta', 'Moldova', 'Monaco', 'Montenegro', 'Netherlands', 'North Macedonia', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russia', 'San Marino', 'Serbia', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Türkiye', 'Ukraine', 'United Kingdom', 'Vatican City']
data_final_europe = data_final[data_final['Country'].isin(europe)]
data_final_europe


Unnamed: 0,Country,Year,military_expenditures_usd,military_expenditures_pct_gdp,military_expenditures_pct_govt
109,Albania,1990,0.000000,0.058883,
110,Bosnia and Herzegovina,1990,,,
111,Bulgaria,1990,794.520548,0.038334,
112,Croatia,1990,,,
115,Estonia,1990,,,
...,...,...,...,...,...
5895,Spain,2023,23699.130514,0.015090,0.032065
5896,Sweden,2023,8754.872951,0.014734,0.030613
5897,Switzerland,2023,6293.390648,0.007027,0.022025
5898,United Kingdom,2023,74942.843460,0.022649,0.051524


In [53]:
# Read list of capitals from url
url = "https://gist.githubusercontent.com/ofou/df09a6834a8421b4f376c875194915c9/raw/355eb56e164ddc3cd1a9467c524422cb674e71a9/country-capital-lat-long-population.csv"

capitals = pd.read_csv(url)
capitals.head(10)

Unnamed: 0,Country,Capital City,Latitude,Longitude,Population,Capital Type
0,Afghanistan,Kabul,34.5289,69.1725,4011770,Capital
1,Albania,Tiranë (Tirana),41.3275,19.8189,475577,Capital
2,Algeria,El Djazaïr (Algiers),36.7525,3.042,2693542,Capital
3,American Samoa,Pago Pago,-14.2781,-170.7025,48526,Capital
4,Andorra,Andorra la Vella,42.5078,1.5211,22614,Capital
5,Angola,Luanda,-8.8368,13.2343,7774200,Capital
6,Anguilla,The Valley,18.217,-63.0578,1402,Capital
7,Antigua and Barbuda,St. John's,17.1172,-61.8457,20764,Capital
8,Argentina,Buenos Aires,-34.6051,-58.4004,14966530,Capital
9,Armenia,Yerevan,40.182,44.5146,1080324,Capital


In [54]:
capitals["Country"].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra',
       'Angola', 'Anguilla', 'Antigua and Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan',
       'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
       'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan',
       'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'British Virgin Islands',
       'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Caribbean Netherlands', 'Cayman Islands',
       'Central African Republic', 'Chad', 'Channel Islands', 'Chile',
       'China', 'China, Hong Kong SAR', 'China, Macao SAR',
       'China, Taiwan Province of China', 'Colombia', 'Comoros', 'Congo',
       'Cook Islands', 'Costa Rica', "Côte d'Ivoire", 'Croatia', 'Cuba',
       'Curaçao', 'Cyprus', 'Czechia', "Dem. People's Republic of Korea",
       'Democratic Republ

In [55]:
# Replace country names to match the data_final_europe
capitals = capitals.replace('Czechia', 'Czech Republic')
capitals = capitals.replace('Russian Federation', 'Russia')
capitals = capitals.replace('Turkey', 'Türkiye')


In [56]:
# Get only the four first columns
capitals = capitals.iloc[:, :4]
capitals

Unnamed: 0,Country,Capital City,Latitude,Longitude
0,Afghanistan,Kabul,34.5289,69.1725
1,Albania,Tiranë (Tirana),41.3275,19.8189
2,Algeria,El Djazaïr (Algiers),36.7525,3.0420
3,American Samoa,Pago Pago,-14.2781,-170.7025
4,Andorra,Andorra la Vella,42.5078,1.5211
...,...,...,...,...
229,Wallis and Futuna Islands,Matu-Utu,-13.2816,-176.1745
230,Western Sahara,El Aaiún,27.1532,-13.2014
231,Yemen,Sana'a',15.3531,44.2078
232,Zambia,Lusaka,-15.4134,28.2771


In [57]:
# Merge the data
data_final_europe = data_final_europe.merge(capitals, left_on='Country', right_on='Country', how='left')


MergeError: Passing 'suffixes' which cause duplicate columns {'Longitude_x', 'Capital City_x', 'Latitude_x'} is not allowed.

In [58]:
data_final_europe

Unnamed: 0,Country,Year,military_expenditures_usd,military_expenditures_pct_gdp,military_expenditures_pct_govt,Capital City_x,Latitude_x,Longitude_x,Capital City_y,Latitude_y,Longitude_y,Capital City,Latitude,Longitude
0,Albania,1990,0.000000,0.058883,,Tiranë (Tirana),41.3275,19.8189,Tiranë (Tirana),41.3275,19.8189,Tiranë (Tirana),41.3275,19.8189
1,Bosnia and Herzegovina,1990,,,,Sarajevo,43.8486,18.3564,Sarajevo,43.8486,18.3564,Sarajevo,43.8486,18.3564
2,Bulgaria,1990,794.520548,0.038334,,Sofia,42.6975,23.3242,Sofia,42.6975,23.3242,Sofia,42.6975,23.3242
3,Croatia,1990,,,,Zagreb,45.8144,15.9780,Zagreb,45.8144,15.9780,Zagreb,45.8144,15.9780
4,Estonia,1990,,,,Tallinn,59.4370,24.7535,Tallinn,59.4370,24.7535,Tallinn,59.4370,24.7535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1389,Spain,2023,23699.130514,0.015090,0.032065,Madrid,40.4165,-3.7026,Madrid,40.4165,-3.7026,Madrid,40.4165,-3.7026
1390,Sweden,2023,8754.872951,0.014734,0.030613,Stockholm,59.3326,18.0649,Stockholm,59.3326,18.0649,Stockholm,59.3326,18.0649
1391,Switzerland,2023,6293.390648,0.007027,0.022025,Bern,46.9481,7.4474,Bern,46.9481,7.4474,Bern,46.9481,7.4474
1392,United Kingdom,2023,74942.843460,0.022649,0.051524,London,51.5085,-0.1257,London,51.5085,-0.1257,London,51.5085,-0.1257


In [59]:
data_final_europe.loc[data_final_europe['Country'] == 'Russia']

Unnamed: 0,Country,Year,military_expenditures_usd,military_expenditures_pct_gdp,military_expenditures_pct_govt,Capital City_x,Latitude_x,Longitude_x,Capital City_y,Latitude_y,Longitude_y,Capital City,Latitude,Longitude
18,Russia,1990,0.0,,,,,,,,,Moskva (Moscow),55.755,37.6218
59,Russia,1991,,,,,,,,,,Moskva (Moscow),55.755,37.6218
100,Russia,1992,0.0,0.04427,,,,,,,,Moskva (Moscow),55.755,37.6218
141,Russia,1993,7766.720078,0.041813,,,,,,,,Moskva (Moscow),55.755,37.6218
182,Russia,1994,13547.871733,0.045237,,,,,,,,Moskva (Moscow),55.755,37.6218
223,Russia,1995,12741.62947,0.037844,,,,,,,,Moskva (Moscow),55.755,37.6218
264,Russia,1996,15826.340652,0.037566,,,,,,,,Moskva (Moscow),55.755,37.6218
305,Russia,1997,17577.353181,0.040398,,,,,,,,Moskva (Moscow),55.755,37.6218
346,Russia,1998,7955.730401,0.027326,0.069055,,,,,,,Moskva (Moscow),55.755,37.6218
387,Russia,1999,6469.035211,0.030733,0.089886,,,,,,,Moskva (Moscow),55.755,37.6218


In [None]:
# create function that calculates the distance in km between the coordinates of one city to the city of moscow
from geopy.distance import geodesic

In [35]:
data_final_europe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1360 entries, 0 to 1359
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Country                         1360 non-null   object 
 1   Year                            1360 non-null   int64  
 2   military_expenditures_usd       1279 non-null   float64
 3   military_expenditures_pct_gdp   1243 non-null   float64
 4   military_expenditures_pct_govt  1161 non-null   float64
 5   Capital City                    1224 non-null   object 
 6   Latitude                        1224 non-null   float64
 7   Longitude                       1224 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 85.1+ KB


In [36]:
# Export the data
data_final_europe.to_csv('military_expenditures_europe.csv', index=False)