In [17]:
import pandas as pd

# Load the datasets
fertility_df = pd.read_csv("Resources/Population and Ferility Data by Country.csv")
gdp_df = pd.read_csv("Resources/GDP by Country_Data.csv")

In [18]:
#Check for missing values in both datasets
fertility_missing = fertility_df.isnull().sum()
gdp_missing = gdp_df.isnull().sum()

print("Fertility Data Missing Values:")
print(fertility_missing)
print("\nGDP Data Missing Values:")
print(gdp_missing)

Fertility Data Missing Values:
Series Name      3
Series Code      5
Country Name     5
Country Code     5
1960 [YR1960]    5
                ..
2019 [YR2019]    5
2020 [YR2020]    5
2021 [YR2021]    5
2022 [YR2022]    5
2023 [YR2023]    5
Length: 68, dtype: int64

GDP Data Missing Values:
Country Name     3
Country Code     5
Series Name      5
Series Code      5
1960 [YR1960]    5
                ..
2019 [YR2019]    5
2020 [YR2020]    5
2021 [YR2021]    5
2022 [YR2022]    5
2023 [YR2023]    5
Length: 68, dtype: int64


In [19]:
# Handle missing values (for simplicity, we can fill them with a placeholder or use interpolation)
# For fertility data, we can forward-fill
fertility_df = fertility_df.ffill(axis=1)  # Use ffill for forward-filling

# For GDP, we can fill missing values with a placeholder like 0
gdp_df = gdp_df.fillna(0)

In [20]:
# Ensure year columns are integers (Some columns might have extra spaces or different formats)
fertility_df.columns = [col.strip() for col in fertility_df.columns]  # Remove any extra spaces
gdp_df.columns = [col.strip() for col in gdp_df.columns]  # Remove any extra spaces

In [21]:
# Reshape the data (pivot the data so we can have years as columns)
fertility_long = pd.melt(fertility_df, id_vars=["Series Name", "Series Code", "Country Name", "Country Code"], 
                         var_name="Year", value_name="Fertility Rate")
gdp_long = pd.melt(gdp_df, id_vars=["Country Name", "Country Code", "Series Name", "Series Code"], 
                   var_name="Year", value_name="GDP")

In [22]:
# Clean the Year column (remove non-integer values)
fertility_long['Year'] = pd.to_numeric(fertility_long['Year'], errors='coerce')
gdp_long['Year'] = pd.to_numeric(gdp_long['Year'], errors='coerce')

In [23]:
# Merge the datasets by Country Name, Country Code, and Year
merged_df = pd.merge(fertility_long, gdp_long, on=["Country Name", "Country Code", "Year"], how="inner")

MemoryError: Unable to allocate 1.22 GiB for an array with shape (5, 32686080) and data type object

In [9]:
# Ensure the data types are correct (e.g., convert Year to integer, GDP to numeric)
merged_df['Year'] = merged_df['Year'].astype(int)
merged_df['Fertility Rate'] = pd.to_numeric(merged_df['Fertility Rate'], errors='coerce')
merged_df['GDP'] = pd.to_numeric(merged_df['GDP'], errors='coerce')

In [10]:
# Handle any remaining missing values in the merged dataset (if needed)
merged_df = merged_df.dropna(subset=['Fertility Rate', 'GDP'])

In [12]:
# Save the cleaned and merged dataset to CSV
merged_df.to_csv('Resources/cleaned_population_fertility_gdp_data_2014_2024.csv', index=False)

In [11]:
# Show the first few rows of the cleaned dataset
merged_df.head()

Unnamed: 0,Series Name_x,Series Code_x,Country Name,Country Code,Year,Fertility Rate,Series Name_y,Series Code_y,GDP
2,"Fertility rate, total (births per woman)",SP.DYN.TFRT.IN,Africa Eastern and Southern,AFE,1960,6.724125,GDP per capita (current US$),NY.GDP.PCAP.CD,162.342517
4,"Fertility rate, total (births per woman)",SP.DYN.TFRT.IN,Africa Western and Central,AFW,1960,6.458448,GDP per capita (current US$),NY.GDP.PCAP.CD,122.193931
8,"Fertility rate, total (births per woman)",SP.DYN.TFRT.IN,Algeria,DZA,1960,7.503,GDP per capita (current US$),NY.GDP.PCAP.CD,239.033006
26,"Fertility rate, total (births per woman)",SP.DYN.TFRT.IN,Australia,AUS,1960,3.453,GDP per capita (current US$),NY.GDP.PCAP.CD,1810.597443
28,"Fertility rate, total (births per woman)",SP.DYN.TFRT.IN,Austria,AUT,1960,2.69,GDP per capita (current US$),NY.GDP.PCAP.CD,943.6108


In [13]:
# Filter the dataset for the years 2014 to 2024
filtered_df = merged_df[merged_df['Year'].between(2014, 2024)]

In [14]:
# Save the filtered data to a CSV and JSONfile
filtered_df.to_csv('Resources/filtered_population_fertility_gdp_2014_2024.csv', index=False)
filtered_df.to_json('Resources/filtered_population_fertility_gdp_2014_2024.json', orient='records', lines=True)

In [16]:
# Show the first few rows of the filtered data to verify
filtered_df.head()

Unnamed: 0,Series Name_x,Series Code_x,Country Name,Country Code,Year,Fertility Rate,Series Name_y,Series Code_y,GDP
430920,"Fertility rate, total (births per woman)",SP.DYN.TFRT.IN,Afghanistan,AFG,2014,5.56,GDP per capita (current US$),NY.GDP.PCAP.CD,626.512929
430921,"Fertility rate, total (births per woman)",SP.DYN.TFRT.IN,Afghanistan,AFG,2014,5.56,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,-0.964803
430922,"Fertility rate, total (births per woman)",SP.DYN.TFRT.IN,Africa Eastern and Southern,AFE,2014,4.739861,GDP per capita (current US$),NY.GDP.PCAP.CD,1678.55361
430923,"Fertility rate, total (births per woman)",SP.DYN.TFRT.IN,Africa Eastern and Southern,AFE,2014,4.739861,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,1.17557
430924,"Fertility rate, total (births per woman)",SP.DYN.TFRT.IN,Africa Western and Central,AFW,2014,5.437493,GDP per capita (current US$),NY.GDP.PCAP.CD,2248.518426
