In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# import data and sort it by country in alphabetical order then year (ascending) 
df = pd.read_csv('futurePopData.csv')
df['YEARS'] = df['YEARS'].astype(int)

# Sort by Country first, then by Year
countries = df.sort_values(by=['COUNTRY', 'YEARS'])
#countries = countries[['COUNTRY', 'YEARS', 'HDI', 'FERTILITY RATE', 'FEMALE LITERACY', 'GDP PER CAPITA', 'CHILD MORTALITY', 'MEAN AGE AT FIRST BIRTH']]

print(countries)

         COUNTRY    HDI  FERTILITY RATE  FEMALE LITERACY  \
579  Afghanistan  0.501             NaN            0.266   
386  Afghanistan  0.486            4.90            0.266   
193  Afghanistan  0.495            4.62            0.266   
0    Afghanistan  0.496            4.80            0.266   
580      Albania  0.794             NaN            0.983   
..           ...    ...             ...              ...   
191       Zambia  0.595            4.10            0.843   
771     Zimbabwe  0.582             NaN            0.912   
578     Zimbabwe  0.581            3.80            0.912   
385     Zimbabwe  0.594            3.89            0.912   
192     Zimbabwe  0.598            3.70            0.912   

     FEMALE LABOR PARTICIPATION  GDP PER CAPITA  URBANIZATION RATE  \
579                         NaN      510.787063                NaN   
386                         NaN      356.496214                NaN   
193                         NaN      510.787063                NaN   

In [None]:
#linear comparison between fertility rate and other columns
countries.select_dtypes(include='number').corr()['FERTILITY RATE']

HDI                          -0.835419
FERTILITY RATE                1.000000
FEMALE LITERACY              -0.784020
FEMALE LABOR PARTICIPATION   -0.003257
GDP PER CAPITA               -0.434678
URBANIZATION RATE            -0.501123
CHILD MORTALITY               0.763165
MEAN AGE AT FIRST BIRTH      -0.651693
YEARS                        -0.017840
Name: FERTILITY RATE, dtype: float64

In [13]:
#Random Forest
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

df = countries.copy()  
df['FertilityRate_LastYear'] = df.groupby('COUNTRY')['FERTILITY RATE'].shift(1)

features = ['HDI', 'FertilityRate_LastYear', 'FEMALE LITERACY',
            'GDP PER CAPITA', 'CHILD MORTALITY', 'MEAN AGE AT FIRST BIRTH']

train_df = df[df['YEARS'] < 2023].copy()
test_df = df[df['YEARS'] == 2023].copy()

train_df.dropna(subset=features + ['FERTILITY RATE'], inplace=True)
test_df.dropna(subset=features + ['FERTILITY RATE'], inplace=True)

X_train = train_df[features]
y_train = train_df['FERTILITY RATE']
X_test = test_df[features]
y_test = test_df['FERTILITY RATE']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
print("MSE on 2023 data:", mean_squared_error(y_test, y_pred))

predict_2024 = df[df['YEARS'] == 2023].copy()
predict_2024['FertilityRate_LastYear'] = predict_2024['FERTILITY RATE']
X_2024 = predict_2024[features].dropna()
X_2024_scaled = scaler.transform(X_2024)
fertility_2024_preds = model.predict(X_2024_scaled)

predict_2024 = predict_2024.loc[X_2024.index]
predict_2024['Predicted Fertility Rate 2024'] = fertility_2024_preds
pd.set_option('display.max_rows', None)

predict_2024['COUNTRY'] = predict_2024['COUNTRY'].str.strip()
print(predict_2024[['COUNTRY', 'Predicted Fertility Rate 2024']])
predict_2024[['COUNTRY', 'Predicted Fertility Rate 2024']].to_csv('random_forest.csv', index=False)

MSE on 2023 data: 0.28699301279567396
                            COUNTRY  Predicted Fertility Rate 2024
0                       Afghanistan                       4.655600
1                           Albania                       1.675300
2                           Algeria                       2.488700
4                            Angola                       5.087100
6                         Argentina                       1.969600
7                           Armenia                       2.118800
8                         Australia                       1.806800
9                           Austria                       1.605800
10                       Azerbaijan                       1.805200
11                          Bahamas                       1.871700
12                          Bahrain                       1.887200
13                       Bangladesh                       2.175100
15                          Belarus                       1.504200
16                      

In [15]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

df = countries.copy()
df['FertilityRate_LastYear'] = df.groupby('COUNTRY')['FERTILITY RATE'].shift(1)

features = ['HDI', 'FertilityRate_LastYear', 'FEMALE LITERACY',
            'GDP PER CAPITA', 'CHILD MORTALITY', 'MEAN AGE AT FIRST BIRTH']

train_df = df[df['YEARS'] < 2023].copy()
test_df = df[df['YEARS'] == 2023].copy()

train_df.dropna(subset=features + ['FERTILITY RATE'], inplace=True)
test_df.dropna(subset=features + ['FERTILITY RATE'], inplace=True)

X_train = train_df[features]
y_train = train_df['FERTILITY RATE']
X_test = test_df[features]
y_test = test_df['FERTILITY RATE']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = Sequential([
    Dense(32, input_dim=len(features), activation='relu'),  # One hidden layer with 32 neurons
    Dense(1)  # Output layer for regression
])

model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(X_train_scaled, y_train, epochs=100, batch_size=16, verbose=1)

y_pred = model.predict(X_test_scaled).flatten()
print("MSE on 2023 data:", mean_squared_error(y_test, y_pred))

predict_2024 = df[df['YEARS'] == 2023].copy()
predict_2024['FertilityRate_LastYear'] = predict_2024['FERTILITY RATE']
X_2024 = predict_2024[features].dropna()

X_2024_scaled = scaler.transform(X_2024)
fertility_2024_preds = model.predict(X_2024_scaled).flatten()

predict_2024 = predict_2024.loc[X_2024.index]
predict_2024['Predicted Fertility Rate 2024'] = fertility_2024_preds

print(predict_2024[['COUNTRY', 'Predicted Fertility Rate 2024']])
predict_2024[['COUNTRY', 'Predicted Fertility Rate 2024']].to_csv('neural_network.csv', index=False)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 9.0531
Epoch 2/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 8.6795
Epoch 3/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 8.7812 
Epoch 4/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 8.2585 
Epoch 5/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 7.3065 
Epoch 6/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 5.4518
Epoch 7/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 4.8918 
Epoch 8/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 4.3280
Epoch 9/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 4.0710 
Epoch 10/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 3.3948 
Epoch 11/100
[1m9/9

In [33]:
actual_df = pd.read_csv('perdictionData.csv')
predicted_ran_df = pd.read_csv('randomForest.csv')
predicted_nn_df = pd.read_csv('neuralNetwork.csv')

merged_ran = pd.merge(predicted_ran_df, actual_df, how='inner', on='COUNTRY', suffixes=('_predicted','_actual'))
merged_nn = pd.merge(predicted_nn_df, actual_df, how='inner', on='COUNTRY', suffixes=('_predicted','_actual'))

print(merged_ran)

                      COUNTRY  Predicted Fertility Rate 2024  \
0                 Afghanistan                       4.655600   
1                     Albania                       1.675300   
2                     Algeria                       2.488700   
3                      Angola                       5.087100   
4                   Argentina                       1.969600   
5                     Armenia                       2.118800   
6                   Australia                       1.806800   
7                     Austria                       1.605800   
8                  Azerbaijan                       1.805200   
9                     Bahamas                       1.871700   
10                    Bahrain                       1.887200   
11                 Bangladesh                       2.175100   
12                    Belarus                       1.504200   
13                    Belgium                       1.780500   
14                     Belize           

In [43]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

actual = merged_ran['TotalFertilityRate_EstFertilityRate_num_2024']
predicted = merged_ran['Predicted Fertility Rate 2024']

mae = mean_absolute_error(actual, predicted)
mse = mean_squared_error(actual, predicted)
r2 = r2_score(actual, predicted)

print("Random Forest")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

Random Forest
Mean Absolute Error (MAE): 0.22452953586430371
Mean Squared Error (MSE): 0.1315412205064121
R-squared (R²): 0.8955831783322845


In [45]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

actual = merged_nn['TotalFertilityRate_EstFertilityRate_num_2024']
predicted = merged_nn['Predicted Fertility Rate 2024']

mae = mean_absolute_error(actual, predicted)
mse = mean_squared_error(actual, predicted)
r2 = r2_score(actual, predicted)

print("Neural Network")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

Neural Network
Mean Absolute Error (MAE): 0.31142658670886075
Mean Squared Error (MSE): 0.18272574025589247
R-squared (R²): 0.8549531397006404
