In [119]:
import pandas as pd
import re

def extract_numeric_year(column_name):
  """Reduce a ``"1960 [YR1960]"`` style column header to its leading year.

  Parameters
  ----------
  column_name : object
    A column label. Strings that start (after optional leading whitespace)
    with ``<digits> [YR<digits>]`` are treated as year headers.

  Returns
  -------
  object
    The first run of digits as a string (e.g. ``"1960"``) when the label is
    a year header; otherwise ``column_name`` is returned unchanged.
  """
  # Non-string labels (ints, None, ...) cannot be year headers and would make
  # re.match raise TypeError, so hand them back untouched.
  if not isinstance(column_name, str):
    return column_name
  # re.match anchors at the start of the string; allow leading whitespace so
  # padded headers are still recognised before any later strip step runs.
  match = re.match(r'\s*(\d+) \[YR(\d+)\]', column_name)
  if match:
    return match.group(1)
  return column_name
      
# Read the two source CSV files into DataFrames.
fertility_csv_path = "Resources/Population and Ferility Data by Country.csv"
gdp_csv_path = "Resources/GDP by Country_Data.csv"

fertility_df = pd.read_csv(fertility_csv_path)
gdp_df = pd.read_csv(gdp_csv_path)

In [121]:
# Rewrite every column label through extract_numeric_year so that headers of
# the form "1960 [YR1960]" become "1960"; other labels are left as they are.
for frame in (fertility_df, gdp_df):
    frame.columns = frame.columns.map(extract_numeric_year)

In [123]:
# Count missing values per column in each dataset and print both summaries.
fertility_missing = fertility_df.isna().sum()
gdp_missing = gdp_df.isna().sum()

print("Fertility Data Missing Values:", fertility_missing, sep="\n")
print("\nGDP Data Missing Values:", gdp_missing, sep="\n")

Fertility Data Missing Values:
Series Name     3
Series Code     5
Country Name    5
Country Code    5
1960            5
               ..
2019            5
2020            5
2021            5
2022            5
2023            5
Length: 68, dtype: int64

GDP Data Missing Values:
Country Name    3
Country Code    5
Series Name     5
Series Code     5
1960            5
               ..
2019            5
2020            5
2021            5
2022            5
2023            5
Length: 68, dtype: int64


In [125]:
# Trim leading/trailing whitespace from every column label.
# (This only strips text; it does not convert the year labels to integers.)
for frame in (fertility_df, gdp_df):
    frame.columns = [label.strip() for label in frame.columns]

In [127]:
# Reshape from wide (one column per year) to long (one row per year):
# the id columns are kept and every remaining column becomes a
# (Year, value) pair.
# NOTE(review): "Series Code" is not an id column for the fertility frame, so
# it is melted as if it were a year; the numeric coercion and NaN drop on
# "Year" in later cells remove those rows again - confirm this is intended.
fertility_id_cols = ["Series Name", "Country Name", "Country Code"]
gdp_id_cols = ["Country Name", "Country Code", "Series Name", "Series Code"]

fertility_long = fertility_df.melt(
    id_vars=fertility_id_cols,
    var_name="Year",
    value_name="Fertility Rate",
)
gdp_long = gdp_df.melt(
    id_vars=gdp_id_cols,
    var_name="Year",
    value_name="GDP",
)

In [131]:
# Convert the "Year" labels to numbers; anything that is not numeric is
# turned into NaN (errors='coerce') instead of raising.
for long_frame in (fertility_long, gdp_long):
    long_frame['Year'] = pd.to_numeric(long_frame['Year'], errors='coerce')

In [147]:
# Discard rows whose "Year" could not be parsed, then store the remaining
# years as integers instead of floats.
fertility_long = fertility_long.dropna(subset=['Year']).astype({'Year': int})
gdp_long = gdp_long.dropna(subset=['Year']).astype({'Year': int})

In [151]:
# Inner-join the two long tables on country and year; columns that exist in
# both inputs (e.g. "Series Name") receive pandas' default _x/_y suffixes.
# NOTE(review): the join keys do not include the series, and the displayed
# result pairs e.g. "Urban population growth" rows with "GDP per capita
# growth" rows under the "Fertility Rate" / "GDP" columns. It looks like every
# series combination per country/year is produced - confirm that is intended.
merge_keys = ["Country Name", "Country Code", "Year"]
merged_df = fertility_long.merge(gdp_long, how="inner", on=merge_keys)

In [155]:
# Normalise dtypes on the merged table: integer years, numeric measurements.
# Values that cannot be parsed as numbers become NaN.
merged_df['Year'] = merged_df['Year'].astype(int)
for value_col in ('Fertility Rate', 'GDP'):
    merged_df[value_col] = pd.to_numeric(merged_df[value_col], errors='coerce')

In [161]:
# Keep only the rows in which both measurement columns hold a value.
has_both_values = merged_df[['Fertility Rate', 'GDP']].notna().all(axis=1)
merged_df = merged_df[has_both_values]

In [163]:
# Display the merged table (the notebook renders a truncated preview).
merged_df

Unnamed: 0,Series Name_x,Country Name,Country Code,Year,Fertility Rate,Series Name_y,Series Code,GDP
2,"Fertility rate, total (births per woman)",Africa Eastern and Southern,AFE,1960,6.724125,GDP per capita (current US$),NY.GDP.PCAP.CD,162.342517
4,"Fertility rate, total (births per woman)",Africa Western and Central,AFW,1960,6.458448,GDP per capita (current US$),NY.GDP.PCAP.CD,122.193931
8,"Fertility rate, total (births per woman)",Algeria,DZA,1960,7.503000,GDP per capita (current US$),NY.GDP.PCAP.CD,239.033006
26,"Fertility rate, total (births per woman)",Australia,AUS,1960,3.453000,GDP per capita (current US$),NY.GDP.PCAP.CD,1810.597443
28,"Fertility rate, total (births per woman)",Austria,AUT,1960,2.690000,GDP per capita (current US$),NY.GDP.PCAP.CD,943.610800
...,...,...,...,...,...,...,...,...
511659,Urban population growth (annual %),"Yemen, Rep.",YEM,2023,3.838151,GDP per capita (current US$),NY.GDP.PCAP.CD,533.367123
511661,Urban population growth (annual %),Zambia,ZMB,2023,3.967077,GDP per capita (current US$),NY.GDP.PCAP.CD,1369.129365
511662,Urban population growth (annual %),Zambia,ZMB,2023,3.967077,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,2.992084
511663,Urban population growth (annual %),Zimbabwe,ZWE,2023,2.466992,GDP per capita (current US$),NY.GDP.PCAP.CD,1592.416574


In [165]:
# Write the cleaned, merged table to disk without the index column.
cleaned_csv_path = 'Resources/cleaned_population_fertility_gdp_data_1960_2024.csv'
merged_df.to_csv(cleaned_csv_path, index=False)

In [167]:
# Preview the first five rows of the cleaned dataset.
merged_df.head(n=5)

Unnamed: 0,Series Name_x,Country Name,Country Code,Year,Fertility Rate,Series Name_y,Series Code,GDP
2,"Fertility rate, total (births per woman)",Africa Eastern and Southern,AFE,1960,6.724125,GDP per capita (current US$),NY.GDP.PCAP.CD,162.342517
4,"Fertility rate, total (births per woman)",Africa Western and Central,AFW,1960,6.458448,GDP per capita (current US$),NY.GDP.PCAP.CD,122.193931
8,"Fertility rate, total (births per woman)",Algeria,DZA,1960,7.503,GDP per capita (current US$),NY.GDP.PCAP.CD,239.033006
26,"Fertility rate, total (births per woman)",Australia,AUS,1960,3.453,GDP per capita (current US$),NY.GDP.PCAP.CD,1810.597443
28,"Fertility rate, total (births per woman)",Austria,AUT,1960,2.69,GDP per capita (current US$),NY.GDP.PCAP.CD,943.6108


In [169]:
# Keep only the rows whose year lies in 2014..2024 (both ends inclusive).
in_year_window = (merged_df['Year'] >= 2014) & (merged_df['Year'] <= 2024)
filtered_df = merged_df[in_year_window]

In [175]:
# Save the filtered data to a CSV file and a JSON file.
# With orient='records' and lines=True the JSON output holds one record per
# line (newline-delimited JSON) rather than a single JSON array.
filtered_csv_path = 'Resources/filtered_population_fertility_gdp_2014_2024.csv'
filtered_json_path = 'Resources/filtered_population_fertility_gdp_2014_2024.json'

filtered_df.to_csv(filtered_csv_path, index=False)
filtered_df.to_json(filtered_json_path, orient='records', lines=True)

In [173]:
# Show the last five rows of the filtered data to verify the result.
filtered_df.tail(n=5)

Unnamed: 0,Series Name_x,Country Name,Country Code,Year,Fertility Rate,Series Name_y,Series Code,GDP
511659,Urban population growth (annual %),"Yemen, Rep.",YEM,2023,3.838151,GDP per capita (current US$),NY.GDP.PCAP.CD,533.367123
511661,Urban population growth (annual %),Zambia,ZMB,2023,3.967077,GDP per capita (current US$),NY.GDP.PCAP.CD,1369.129365
511662,Urban population growth (annual %),Zambia,ZMB,2023,3.967077,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,2.992084
511663,Urban population growth (annual %),Zimbabwe,ZWE,2023,2.466992,GDP per capita (current US$),NY.GDP.PCAP.CD,1592.416574
511664,Urban population growth (annual %),Zimbabwe,ZWE,2023,2.466992,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,2.783839
