In [2]:
%pip install scikit-learn statsmodels prophet pandas numpy matplotlib seaborn wbdata -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import wbdata
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings('ignore')

# Optional dependency: Prophet.
# PROPHET_AVAILABLE records whether the import worked so that the Prophet cell
# can be skipped (instead of raising NameError) when the package is missing.
try:
    from prophet import Prophet
    PROPHET_AVAILABLE = True
except ImportError:
    PROPHET_AVAILABLE = False
    print("Prophet not installed. Install with: pip install prophet")

# Optional dependency: statsmodels' ARIMA, guarded the same way through
# ARIMA_AVAILABLE.
try:
    from statsmodels.tsa.arima.model import ARIMA
    ARIMA_AVAILABLE = True
except ImportError:
    ARIMA_AVAILABLE = False
    print("Statsmodels not installed. Install with: pip install statsmodels")

# Status banner. Note that the first line is printed unconditionally, even when
# one of the optional imports above failed; the two flags below are the
# authoritative indication of what is actually usable.
print("Libraries imported successfully!")
print(f"Prophet available: {PROPHET_AVAILABLE}")
print(f"ARIMA available: {ARIMA_AVAILABLE}")

Note: you may need to restart the kernel to use updated packages.
Libraries imported successfully!
Prophet available: True
ARIMA available: True



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Fetch BRICS FLFP data
import time  # only needed to pace the per-country fallback requests

# ISO3 country code -> display name used in tables and plot titles.
BRICS_COUNTRIES = {
    'BRA': 'Brazil',
    'RUS': 'Russia',
    'IND': 'India',
    'CHN': 'China',
    'ZAF': 'South Africa'
}

# Female labor force participation (% of female population ages 15+).
# NOTE(review): confirm this code exists in the World Bank indicator catalogue;
# the series usually published under that description is 'SL.TLF.CACT.FE.ZS'.
# An unknown indicator code would presumably make both the batch request and
# the per-country fallback below fail.
FLFP_INDICATOR = 'SP.FEM.LABS.ZS'
START_YEAR = 2000
END_YEAR = 2023
REQUEST_DELAY_SECONDS = 0.5  # pause between fallback requests to avoid rate limiting

print("Fetching Female Labor Force Participation data for BRICS countries...")

date_range = (str(START_YEAR), str(END_YEAR))

# Try one batch request first; if the World Bank API rejects it (or returns
# nothing), fall back to one request per country.
try:
    brics_data = wbdata.get_dataframe(
        {FLFP_INDICATOR: 'FLFP'},
        country=list(BRICS_COUNTRIES.keys()),
        date=date_range
    ).reset_index()
    if brics_data.empty:
        # Treat an empty answer like a failure so the fallback gets a chance.
        raise ValueError("batch request returned no rows")
except Exception as e:
    print(f"Batch request failed: {e}")
    print("Falling back to per-country requests...")
    frames = []
    for code, cname in BRICS_COUNTRIES.items():
        try:
            time.sleep(REQUEST_DELAY_SECONDS)
            # get_series is expected to return a Series indexed by date for a
            # single-country call; anything else is reported as "no data".
            s = wbdata.get_series(FLFP_INDICATOR, country=code, date=date_range)
            if isinstance(s, pd.Series) and len(s) > 0:
                dfc = s.reset_index()
                dfc.columns = ['date', 'FLFP']
                dfc['country'] = code  # fallback rows carry the ISO3 code
                frames.append(dfc)
                print(f"  ✓ Successfully fetched {cname} ({code}): {len(dfc)} records")
            else:
                print(f"  Warning: no data returned for {code} ({cname})")
        except Exception as e2:
            # Best effort: one failing country must not abort the others.
            print(f"  ✗ Error fetching {cname} ({code}): {e2}")

    if not frames:
        # Chain the batch error so the root cause stays visible in the traceback.
        raise RuntimeError("No BRICS data could be fetched") from e
    brics_data = pd.concat(frames, ignore_index=True)

# Normalise the country identifier to the display names in BRICS_COUNTRIES.
# The per-country fallback stores ISO3 codes in 'country'; a batch result may
# instead carry World Bank country names (presumably e.g. 'Russian Federation'
# -- verify against the wbdata version in use). A plain code->name map would
# turn such names into NaN and the dropna below would then discard every row.
WB_NAME_ALIASES = {'Russian Federation': 'Russia'}
country_raw = brics_data['country'].astype(str).str.strip()
name_from_code = country_raw.map(BRICS_COUNTRIES)
name_as_given = country_raw.replace(WB_NAME_ALIASES)
name_as_given = name_as_given.where(name_as_given.isin(list(BRICS_COUNTRIES.values())))
brics_data['country_name'] = name_from_code.fillna(name_as_given)

# Convert date to year
if 'date' in brics_data.columns:
    year = pd.to_datetime(brics_data['date'], errors='coerce').dt.year
    unparsed = year.isna()
    if unparsed.any():
        # Row-wise fallback: pull a 4-digit year out of values that could not
        # be parsed as dates. Rows that still fail stay NaN and are dropped below.
        extracted = (
            brics_data.loc[unparsed, 'date']
            .astype(str)
            .str.extract(r'(\d{4})', expand=False)
        )
        year = year.astype(float).fillna(pd.to_numeric(extracted, errors='coerce'))
    brics_data['year'] = year

# Ensure numeric FLFP, drop incomplete rows, and only then cast year to int
# (casting before the dropna would fail on missing values).
brics_data['FLFP'] = pd.to_numeric(brics_data['FLFP'], errors='coerce')
brics_data = (
    brics_data
    .dropna(subset=['FLFP', 'year', 'country_name'])
    .astype({'year': int})
    .sort_values(['country_name', 'year'])
)

if brics_data.empty:
    # Fail here with a clear message rather than later inside a model fit.
    raise ValueError("No usable FLFP rows remain after cleaning; check the indicator code and country mapping")

print("Data loaded successfully!")
print(f"Years available: {brics_data['year'].min()} - {brics_data['year'].max()}")
print(f"Countries: {brics_data['country_name'].unique()}")
print(f"\nData shape: {brics_data.shape}")
print(brics_data.head(10))

In [None]:
# Chronological hold-out windows, inclusive on both ends (nominal 80/10/10 split)
TRAIN_START, TRAIN_END = 2000, 2019
VAL_START, VAL_END = 2020, 2021
TEST_START, TEST_END = 2022, 2023

# Country whose series is modelled in detail (Brazil)
country_to_model = 'Brazil'
country_data = (
    brics_data.loc[brics_data['country_name'] == country_to_model]
    .copy()
    .sort_values('year')
)

print(f"Modeling {country_to_model}")
print(f"Total data points: {len(country_data)}")

# Slice the series into the three windows; between() is inclusive by default,
# matching the >= / <= bounds of each window.
year_col = country_data['year']
train_data = country_data[year_col.between(TRAIN_START, TRAIN_END)]
val_data = country_data[year_col.between(VAL_START, VAL_END)]
test_data = country_data[year_col.between(TEST_START, TEST_END)]

print(f"\nTrain set: {len(train_data)} samples ({TRAIN_START}-{TRAIN_END})")
print(f"Validation set: {len(val_data)} samples ({VAL_START}-{VAL_END})")
print(f"Test set: {len(test_data)} samples ({TEST_START}-{TEST_END})")

# Regression inputs: year as a single-column 2-D feature matrix, FLFP as target.
X_train, y_train = train_data[['year']].values, train_data['FLFP'].values
X_val, y_val = val_data[['year']].values, val_data['FLFP'].values
X_test, y_test = test_data[['year']].values, test_data['FLFP'].values

In [None]:
# Linear trend baseline: FLFP regressed on calendar year
banner = "=" * 60
print("\n" + banner)
print("LINEAR REGRESSION MODEL")
print(banner)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predictions for each window
y_train_pred, y_val_pred, y_test_pred = (
    lr_model.predict(features) for features in (X_train, X_val, X_test)
)

# (actual, predicted) pairs in train / validation / test order
lr_pairs = (
    (y_train, y_train_pred),
    (y_val, y_val_pred),
    (y_test, y_test_pred),
)

# Error metrics, one value per window
train_rmse, val_rmse, test_rmse = (
    np.sqrt(mean_squared_error(actual, predicted)) for actual, predicted in lr_pairs
)
train_mae, val_mae, test_mae = (
    mean_absolute_error(actual, predicted) for actual, predicted in lr_pairs
)
train_r2, val_r2, test_r2 = (
    r2_score(actual, predicted) for actual, predicted in lr_pairs
)

print("\nModel Parameters:")
print(f"  Slope: {lr_model.coef_[0]:.4f}")
print(f"  Intercept: {lr_model.intercept_:.4f}")

# Same three-line report for every window
for window, rmse, mae, r2 in (
    ("Train", train_rmse, train_mae, train_r2),
    ("Validation", val_rmse, val_mae, val_r2),
    ("Test", test_rmse, test_mae, test_r2),
):
    print(f"\n{window} Metrics:")
    print(f"  RMSE: {rmse:.4f}%")
    print(f"  MAE:  {mae:.4f}%")
    print(f"  R²:   {r2:.4f}")

In [None]:
# Linear regression fit against the observed train / validation / test points
fig, ax = plt.subplots(figsize=(14, 8))

# Observations, colour-coded by window
for features, target, colour, legend_label in (
    (X_train, y_train, 'blue', 'Train Data'),
    (X_val, y_val, 'green', 'Validation Data'),
    (X_test, y_test, 'red', 'Test Data'),
):
    ax.scatter(features, target, color=colour, s=80, alpha=0.7, label=legend_label)

# Fitted line evaluated on every year from the first train year to the last test year
all_years = np.arange(TRAIN_START, TEST_END + 1).reshape(-1, 1)
all_pred = lr_model.predict(all_years)
ax.plot(all_years, all_pred, color='black', linewidth=2.5, label='Linear Regression Fit')

# Observed series drawn as a dashed line for reference
ax.plot(country_data['year'], country_data['FLFP'],
        color='orange', linewidth=2, linestyle='--', alpha=0.7, label='Actual Data')

ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Female Labor Force Participation (%)', fontsize=12)
ax.set_title(f'Linear Regression: {country_to_model} FLFP Forecast', fontsize=14, fontweight='bold')
ax.legend(fontsize=10, loc='upper left')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

print("Linear Regression visualization complete!")

In [None]:
# Quadratic trend: linear regression on degree-2 polynomial features of the year
banner = "=" * 60
print("\n" + banner)
print("POLYNOMIAL REGRESSION MODEL (Degree 2)")
print(banner)

# The feature expansion is fitted on the training years and reused for the other windows
poly_features = PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train)
X_val_poly, X_test_poly = (poly_features.transform(years) for years in (X_val, X_test))

poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

# Predictions for each window
y_train_poly_pred, y_val_poly_pred, y_test_poly_pred = (
    poly_model.predict(design) for design in (X_train_poly, X_val_poly, X_test_poly)
)

# (actual, predicted) pairs in train / validation / test order
poly_pairs = (
    (y_train, y_train_poly_pred),
    (y_val, y_val_poly_pred),
    (y_test, y_test_poly_pred),
)

train_poly_rmse, val_poly_rmse, test_poly_rmse = (
    np.sqrt(mean_squared_error(actual, predicted)) for actual, predicted in poly_pairs
)
train_poly_r2, val_poly_r2, test_poly_r2 = (
    r2_score(actual, predicted) for actual, predicted in poly_pairs
)

# Same two-line report for every window
for window, rmse, r2 in (
    ("Train", train_poly_rmse, train_poly_r2),
    ("Validation", val_poly_rmse, val_poly_r2),
    ("Test", test_poly_rmse, test_poly_r2),
):
    print(f"\n{window} Metrics:")
    print(f"  RMSE: {rmse:.4f}%")
    print(f"  R²:   {r2:.4f}")

# Overlay of the quadratic and linear fits on the observed points.
# Relies on all_years / all_pred computed in the linear-regression plot cell.
fig, ax = plt.subplots(figsize=(14, 8))

for features, target, colour, legend_label in (
    (X_train, y_train, 'blue', 'Train Data'),
    (X_val, y_val, 'green', 'Validation Data'),
    (X_test, y_test, 'red', 'Test Data'),
):
    ax.scatter(features, target, color=colour, s=80, alpha=0.7, label=legend_label)

# Evaluate the quadratic model on the same year grid as the linear one
all_years_poly = poly_features.transform(all_years)
all_pred_poly = poly_model.predict(all_years_poly)
ax.plot(all_years, all_pred_poly, color='purple', linewidth=2.5, label='Polynomial (degree 2) Fit')
ax.plot(all_years, all_pred, color='black', linewidth=2, linestyle='--', alpha=0.7, label='Linear Fit')

ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Female Labor Force Participation (%)', fontsize=12)
ax.set_title(f'Linear vs Polynomial Regression: {country_to_model} FLFP', fontsize=14, fontweight='bold')
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# ARIMA Model
# Fits ARIMA on the training window only, forecasts across the validation and
# test windows, reports RMSE / R² for both, and plots fit and forecast.
if ARIMA_AVAILABLE:
    print("\n" + "="*60)
    print("ARIMA MODEL")
    print("="*60)

    # (p, d, q); the label below is derived from it so prints and the plot
    # title cannot drift from the order actually fitted.
    arima_order = (1, 1, 1)
    arima_label = "ARIMA({},{},{})".format(*arima_order)

    train_timeseries = train_data['FLFP'].values

    try:
        arima_model = ARIMA(train_timeseries, order=arima_order)
        arima_fitted = arima_model.fit()

        print(f"\n{arima_label} Summary:")
        print(arima_fitted.summary())

        # One forecast step per held-out observation.
        # NOTE(review): this assumes the validation/test rows are consecutive
        # years directly following the training window -- confirm there are no
        # gaps in the series, otherwise steps and years are misaligned.
        total_forecast_steps = len(val_data) + len(test_data)
        arima_forecast = arima_fitted.get_forecast(steps=total_forecast_steps)
        arima_pred = np.asarray(arima_forecast.predicted_mean)

        y_val_arima_pred = arima_pred[:len(val_data)]
        y_test_arima_pred = arima_pred[len(val_data):]

        val_arima_rmse = np.sqrt(mean_squared_error(y_val, y_val_arima_pred))
        test_arima_rmse = np.sqrt(mean_squared_error(y_test, y_test_arima_pred))

        val_arima_r2 = r2_score(y_val, y_val_arima_pred)
        test_arima_r2 = r2_score(y_test, y_test_arima_pred)

        print(f"\nValidation Metrics:")
        print(f"  RMSE: {val_arima_rmse:.4f}%")
        print(f"  R²:   {val_arima_r2:.4f}")

        print(f"\nTest Metrics:")
        print(f"  RMSE: {test_arima_rmse:.4f}%")
        print(f"  R²:   {test_arima_r2:.4f}")

        # Visualize ARIMA results
        plt.figure(figsize=(14, 8))

        plt.plot(country_data['year'], country_data['FLFP'],
                color='blue', linewidth=2.5, marker='o', label='Actual Data', markersize=6)

        # With d differences the first d in-sample "fitted" values are not real
        # one-step predictions (the first is ~0), so drawing them produces a
        # spurious spike that squashes the rest of the chart. Skip them.
        n_diff = arima_order[1]
        fitted_values = np.asarray(arima_fitted.fittedvalues)
        plt.plot(train_data['year'].values[n_diff:], fitted_values[n_diff:],
                color='green', linewidth=2, label='ARIMA Fitted (Train)', linestyle='--')

        forecast_years = np.concatenate([val_data['year'].values, test_data['year'].values])
        plt.plot(forecast_years, arima_pred,
                color='red', linewidth=2.5, marker='s', label='ARIMA Forecast', markersize=6)

        plt.axvspan(TRAIN_START, TRAIN_END, alpha=0.1, color='blue', label='Train Period')
        plt.axvspan(VAL_START, VAL_END, alpha=0.1, color='green', label='Validation Period')
        plt.axvspan(TEST_START, TEST_END, alpha=0.1, color='red', label='Test Period')

        plt.xlabel('Year', fontsize=12)
        plt.ylabel('Female Labor Force Participation (%)', fontsize=12)
        plt.title(f'{arima_label}: {country_to_model} FLFP Forecast', fontsize=14, fontweight='bold')
        plt.legend(fontsize=10)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Best effort: report the failure and let the remaining cells run.
        print(f"Error fitting ARIMA: {e}")
        print("ARIMA model failed.")
else:
    print("ARIMA not available.")

In [None]:
# Prophet Model
# Fits Prophet on the training window only, forecasts yearly from TRAIN_END to
# TEST_END, reports RMSE / R² on the validation and test windows, and plots the
# forecast with its uncertainty band.
if PROPHET_AVAILABLE:
    print("\n" + "="*60)
    print("PROPHET MODEL")
    print("="*60)

    try:
        # Fit on train_data, not country_data: fitting on the full series would
        # let the model see the validation/test years it is scored on (data
        # leakage) and make its metrics incomparable with the other models,
        # which are all trained on the training window only.
        prophet_data = train_data[['year', 'FLFP']].rename(columns={'year': 'ds', 'y': 'y', 'FLFP': 'y'})
        # Go through int -> str so float years such as 2000.0 still match '%Y'.
        prophet_data['ds'] = pd.to_datetime(prophet_data['ds'].astype(int).astype(str), format='%Y')

        prophet_model = Prophet(interval_width=0.95, yearly_seasonality=False)

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            prophet_model.fit(prophet_data)

        # Yearly grid (1 January of each year) from the last training year to
        # the last test year; the first point overlaps the training data so the
        # plotted forecast connects to the observed series.
        future_years = pd.date_range(
            start=pd.Timestamp(year=TRAIN_END, month=1, day=1),
            end=pd.Timestamp(year=TEST_END, month=12, day=31),
            freq='YS'
        )
        future_df = pd.DataFrame({'ds': future_years})

        forecast = prophet_model.predict(future_df)
        forecast['year'] = forecast['ds'].dt.year

        # Select forecast rows by the years actually observed in each window, so
        # predictions stay aligned with y_val / y_test even if a year is missing
        # from the data (both sides are sorted by year).
        val_forecast = forecast[forecast['year'].isin(val_data['year'])]
        test_forecast = forecast[forecast['year'].isin(test_data['year'])]

        y_val_prophet_pred = val_forecast['yhat'].values
        y_test_prophet_pred = test_forecast['yhat'].values

        val_prophet_rmse = np.sqrt(mean_squared_error(y_val, y_val_prophet_pred))
        test_prophet_rmse = np.sqrt(mean_squared_error(y_test, y_test_prophet_pred))

        val_prophet_r2 = r2_score(y_val, y_val_prophet_pred)
        test_prophet_r2 = r2_score(y_test, y_test_prophet_pred)

        print(f"\nValidation Metrics:")
        print(f"  RMSE: {val_prophet_rmse:.4f}%")
        print(f"  R²:   {val_prophet_r2:.4f}")

        print(f"\nTest Metrics:")
        print(f"  RMSE: {test_prophet_rmse:.4f}%")
        print(f"  R²:   {test_prophet_r2:.4f}")

        fig, ax = plt.subplots(figsize=(14, 8))

        ax.plot(country_data['year'], country_data['FLFP'],
               color='blue', linewidth=2.5, marker='o', label='Actual Data', markersize=6)

        ax.plot(forecast['year'], forecast['yhat'],
               color='red', linewidth=2.5, marker='s', label='Prophet Forecast', markersize=6)

        # Band between yhat_lower and yhat_upper at the interval_width set above
        ax.fill_between(forecast['year'],
                        forecast['yhat_lower'],
                        forecast['yhat_upper'],
                        color='red', alpha=0.2, label='95% Confidence Interval')

        ax.axvspan(TRAIN_START, TRAIN_END, alpha=0.1, color='blue', label='Train Period')
        ax.axvspan(VAL_START, VAL_END, alpha=0.1, color='green', label='Validation Period')
        ax.axvspan(TEST_START, TEST_END, alpha=0.1, color='red', label='Test Period')

        ax.set_xlabel('Year', fontsize=12)
        ax.set_ylabel('Female Labor Force Participation (%)', fontsize=12)
        ax.set_title(f'Prophet: {country_to_model} FLFP Forecast', fontsize=14, fontweight='bold')
        ax.legend(fontsize=10, loc='upper left')
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Best effort: report the failure and let the remaining cells run.
        print(f"Error fitting Prophet: {e}")
else:
    print("Prophet not available.")

In [None]:
# Model Comparison Summary
# Collects the test-set RMSE and R² of every model that produced metrics,
# prints them as a table, and draws one bar chart per metric.
print("\n" + "="*80)
print("MODEL PERFORMANCE COMPARISON - TEST SET")
print("="*80)

# (model label, test RMSE, test R²) kept as numbers; the formatted strings for
# the table are derived from these instead of being parsed back for plotting.
model_results = [
    ('Linear Regression', test_rmse, test_r2),
    ('Polynomial (degree 2)', test_poly_rmse, test_poly_r2),
]

# Add ARIMA if available (its metrics only exist when the ARIMA cell succeeded)
if ARIMA_AVAILABLE and 'test_arima_rmse' in globals():
    model_results.append(('ARIMA(1,1,1)', test_arima_rmse, test_arima_r2))

# Add Prophet if available (its metrics only exist when the Prophet cell succeeded)
if PROPHET_AVAILABLE and 'test_prophet_rmse' in globals():
    model_results.append(('Prophet', test_prophet_rmse, test_prophet_r2))

comparison_data = {
    'Model': [label for label, _, _ in model_results],
    'Test RMSE': [f"{rmse:.4f}%" for _, rmse, _ in model_results],
    'Test R²': [f"{r2:.4f}" for _, _, r2 in model_results],
}

comparison_df = pd.DataFrame(comparison_data)
print("\n" + comparison_df.to_string(index=False))

# Visual comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

models = comparison_data['Model']
rmse_values = [float(rmse) for _, rmse, _ in model_results]
r2_values = [float(r2) for _, _, r2 in model_results]
bar_colors = ['blue', 'purple', 'orange', 'green'][:len(models)]

# RMSE comparison
axes[0].bar(models, rmse_values, color=bar_colors)
axes[0].set_ylabel('RMSE (%)', fontsize=11)
axes[0].set_title('Test RMSE Comparison', fontsize=12, fontweight='bold')
axes[0].tick_params(axis='x', rotation=45)
for i, v in enumerate(rmse_values):
    axes[0].text(i, v + 0.1, f'{v:.4f}%', ha='center', va='bottom', fontweight='bold')
axes[0].grid(axis='y', alpha=0.3)

# R² comparison
axes[1].bar(models, r2_values, color=bar_colors)
axes[1].set_ylabel('R² Score', fontsize=11)
axes[1].set_title('Test R² Comparison', fontsize=12, fontweight='bold')
axes[1].tick_params(axis='x', rotation=45)
# R² is unbounded below and is frequently negative on a tiny hold-out set, so a
# fixed [0, 1] axis would clip those bars out of view. Extend the lower limit
# to the smallest finite value (with some headroom for the label).
finite_r2 = [v for v in r2_values if np.isfinite(v)]
r2_floor = min(finite_r2 + [0.0])
r2_span = 1.0 - r2_floor
axes[1].set_ylim([r2_floor - (0.12 * r2_span if r2_floor < 0 else 0.0), 1])
if r2_floor < 0:
    axes[1].axhline(0, color='black', linewidth=0.8)
label_pad = 0.02 * r2_span
for i, v in enumerate(r2_values):
    if not np.isfinite(v):
        continue  # nothing sensible to annotate
    if v >= 0:
        axes[1].text(i, v + label_pad, f'{v:.4f}', ha='center', va='bottom', fontweight='bold')
    else:
        # Negative bars grow downwards: put the label beneath the bar end.
        axes[1].text(i, v - label_pad, f'{v:.4f}', ha='center', va='top', fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Static guidance text; it is not derived from the metrics computed above.
print("\n" + "="*80)
print("RECOMMENDATIONS:")
print("="*80)
print("• Linear Regression: Simple baseline, fast, but may underfit")
print("• Polynomial Regression: Better for capturing non-linear trends in FLFP")
print("• ARIMA: Good for univariate time series with clear patterns")
print("• Prophet: Best for trends + seasonality, very interpretable")
print("\nFor FLFP forecasting, Polynomial or Prophet models typically perform best.")
print("="*80)