In [1]:
pip install pandas matplotlib scikit-learn pycountry-convert seaborn

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
import pycountry_convert as pc
import joblib
import os
import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# short, but it also hides any deprecation or numerical warnings the libraries
# imported above may raise.
warnings.filterwarnings('ignore')

# Create the directory the analysis saves its figures into
# (no error if it already exists).
os.makedirs('photos', exist_ok=True)

def load_data(path="dataset/entire-world-economic-outlook-database.xlsx",
              first_year=1980, last_year=2025):
    """Load the economic-outlook workbook and clean its year columns.

    Args:
        path: Location of the Excel workbook passed to ``pd.read_excel``.
            Defaults to the dataset path used throughout this notebook.
        first_year: Smallest column label (as an integer) treated as a year.
        last_year: Largest column label (as an integer) treated as a year.

    Returns:
        A ``(df, year_cols)`` tuple. ``df`` is the loaded frame with all
        column labels converted to stripped strings and every year column
        converted to numeric. ``year_cols`` lists the year column labels
        (strings) in the order they appear in the workbook.

    Note:
        Cells that cannot be parsed as numbers (and empty cells) become 0 in
        the year columns, so downstream code cannot distinguish "missing"
        from a genuine zero.
    """
    df = pd.read_excel(path)

    # Header labels may be ints or padded strings; normalise them to clean strings
    # so year columns can be addressed uniformly as e.g. '1980'.
    df.columns = [str(col).strip() for col in df.columns]

    def _is_year(label):
        # A label is a year column when it parses as an int inside the window.
        try:
            return first_year <= int(label) <= last_year
        except ValueError:
            return False

    year_cols = [col for col in df.columns if _is_year(col)]

    for col in year_cols:
        # Drop thousands separators before parsing; unparseable cells -> NaN -> 0.
        as_text = df[col].astype(str).str.replace(',', '', regex=False)
        df[col] = pd.to_numeric(as_text, errors='coerce').fillna(0)

    return df, year_cols

def country_to_continent(country_name):
    """Map a country name to its continent name via ``pycountry_convert``.

    Args:
        country_name: Country name to look up.

    Returns:
        The continent name produced by ``pycountry_convert``, or the string
        ``"Unknown"`` when any step of the lookup raises an ordinary exception
        (for example, an unrecognised name).
    """
    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        return pc.convert_continent_code_to_continent_name(country_continent_code)
    except Exception:
        # Best-effort lookup: fall back to a placeholder on lookup failures, but
        # let KeyboardInterrupt/SystemExit propagate (a bare `except:` would
        # swallow those too).
        return "Unknown"

def _select_rows(df, subject_codes, countries):
    """Return rows of ``df`` whose subject code and country are both in the given collections."""
    return df[
        df['WEO Subject Code'].isin(subject_codes) &
        df['Country'].isin(countries)
    ]


def _plot_growth_comparison(df, analysis_years, top_economies, african_countries):
    """Plot the NGDPRPPPPC series over ``analysis_years`` for both country groups and save it."""
    start_year, end_year = analysis_years[0], analysis_years[-1]
    gdp_rows = df[df['WEO Subject Code'] == "NGDPRPPPPC"]

    fig, ax = plt.subplots(figsize=(18, 10))
    # Solid lines for the top economies, dashed lines for the African group.
    groups = [(top_economies, '-', 'Global'), (african_countries, '--', 'Africa')]
    for country_list, style, group_label in groups:
        for country in country_list:
            country_data = gdp_rows[gdp_rows['Country'] == country]
            if country_data.empty:
                continue
            valid_years = [yr for yr in analysis_years if yr in country_data]
            ax.plot(
                [int(yr) for yr in valid_years],
                country_data[valid_years].values[0],  # first matching row only
                label=f"{country} ({group_label})",
                linestyle=style,
                linewidth=2
            )

    ax.set_title(f'GDP Per Capita Growth ({start_year}-{end_year})')
    ax.set_xlabel('Year')
    ax.set_ylabel('GDP Per Capita (PPP)')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)
    fig.tight_layout()
    fig.savefig('photos/growth_comparison.png')
    plt.close(fig)


def _plot_population_heatmap(df, countries, end_year):
    """Save a heatmap of the LP and LUR values in ``end_year`` for ``countries``."""
    population_data = _select_rows(df, ['LP', 'LUR'], countries)
    if population_data.empty:
        return

    pivot_df = population_data.pivot_table(
        index=['Country', 'WEO Subject Code'],
        values=end_year,
        aggfunc='mean'
    ).unstack()

    fig, ax = plt.subplots(figsize=(20, 12))
    sns.heatmap(pivot_df, annot=True, fmt=".1f", cmap="YlGnBu", ax=ax)
    ax.set_title('Population and Unemployment Indicators')
    fig.tight_layout()
    fig.savefig('photos/population_analysis.png')
    plt.close(fig)


def _plot_economic_clusters(df, countries, economic_metrics, end_year):
    """Cluster countries on the first two metrics in ``end_year`` and save a scatter plot."""
    cluster_data = _select_rows(df, economic_metrics, countries)
    if cluster_data.empty:
        return

    # reindex guarantees one column per metric; a metric with no rows becomes an
    # all-NaN column, so dropna() empties the frame and the step is skipped
    # instead of failing later on a missing column.
    pivot_df = cluster_data.pivot_table(
        index='Country',
        columns='WEO Subject Code',
        values=end_year
    ).reindex(columns=economic_metrics).dropna()
    if pivot_df.empty:
        return

    pivot_df['Continent'] = pivot_df.index.map(country_to_continent)

    # KMeans cannot fit more clusters than samples, so cap the count.
    n_clusters = min(4, len(pivot_df))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    # NOTE(review): the two features are clustered on their raw scales, so the
    # larger-magnitude column dominates the distance; consider standardising.
    pivot_df['Cluster'] = kmeans.fit_predict(pivot_df[economic_metrics[:2]])

    fig, ax = plt.subplots(figsize=(16, 10))
    sns.scatterplot(
        data=pivot_df,
        x='NGDPRPPPPC',
        y='PCPIPCH',
        hue='Cluster',
        size='LUR',
        style='Continent',
        sizes=(50, 300),
        palette='viridis',
        ax=ax
    )
    ax.set_title('Economic Profile Clustering')
    # Log axis: rows whose x value is 0 (e.g. missing data filled with 0 at load
    # time) cannot be placed on it.
    ax.set_xscale('log')
    ax.set_xlabel('GDP per Capita (PPP)')
    ax.set_ylabel('Inflation Rate (%)')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    fig.savefig('photos/economic_clusters.png')
    plt.close(fig)


def _train_forecast_model(df, countries, year_cols, end_year):
    """Fit a random forest predicting the ``end_year`` value from the 1980-2016 columns and save it."""
    model_data = _select_rows(df, ['NGDPRPPPPC'], countries)

    # Feature columns in ascending year order so the saved list fixes the
    # column order the model expects.
    required_years = sorted(
        (col for col in year_cols if 1980 <= int(col) <= 2016),
        key=int
    )
    if model_data.empty or not required_years:
        # Nothing to train on (no matching rows or no feature columns).
        return

    X = model_data[required_years]
    y = model_data[end_year]

    model = RandomForestRegressor(
        n_estimators=300,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X, y)

    os.makedirs('model', exist_ok=True)
    joblib.dump(model, 'model/gdp_forecast_model.pkl')
    # The feature list holds the 1980-2016 year columns (37 when all are present).
    joblib.dump(required_years, 'model/top_features.pkl')


def _plot_country_report(df, country, economic_metrics, year_cols):
    """Save a line chart of every available metric for ``country`` across all year columns."""
    country_df = df[
        (df['Country'] == country) &
        (df['WEO Subject Code'].isin(economic_metrics))
    ]
    if country_df.empty:
        return

    plot_years = [int(yr) for yr in year_cols]
    fig, ax = plt.subplots(figsize=(14, 7))
    for metric in economic_metrics:
        data = country_df[country_df['WEO Subject Code'] == metric]
        if not data.empty:
            ax.plot(
                plot_years,
                data[year_cols].values[0],
                label=metric,
                marker='o'
            )
    ax.set_title(f'{country} Economic Trends')
    ax.set_xlabel('Year')
    ax.set_ylabel('Value')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(f'photos/{country}_report.png')
    plt.close(fig)


def perform_analysis(df, year_cols):
    """Run every analysis stage on the loaded frame and write the outputs to disk.

    Stages, in order: growth comparison plot, population/unemployment heatmap,
    economic clustering scatter plot, forecasting model, per-country reports.

    Args:
        df: Frame with 'Country' and 'WEO Subject Code' columns plus one
            numeric column per entry of ``year_cols``.
        year_cols: Year column labels; each must be convertible with ``int``.

    Side effects:
        Writes PNG figures under ``photos/`` and, when model data is
        available, ``model/gdp_forecast_model.pkl`` and
        ``model/top_features.pkl``.
    """
    # Make sure the figure directory exists even when this function is called
    # on its own.
    os.makedirs('photos', exist_ok=True)

    # Configuration: the analysis window is the last 11 available years.
    analysis_years = sorted(year_cols, key=int)[-11:]
    end_year = analysis_years[-1]

    top_economies = [
        'United States', 'China', 'Japan', 'Germany', 'India',
        'United Kingdom', 'France', 'Italy', 'Canada', 'South Korea',
        'Russia', 'Brazil', 'Australia', 'Spain', 'Mexico'
    ]
    african_countries = [
        'Kenya', 'Nigeria', 'South Africa', 'Egypt', 'Ethiopia',
        'Ghana', 'Angola', 'Tanzania', "Côte d'Ivoire", 'Morocco'
    ]
    focus_countries = top_economies + african_countries
    economic_metrics = ['NGDPRPPPPC', 'PCPIPCH', 'LUR']

    _plot_growth_comparison(df, analysis_years, top_economies, african_countries)
    _plot_population_heatmap(df, focus_countries, end_year)
    _plot_economic_clusters(df, focus_countries, economic_metrics, end_year)

    # Only countries actually present in the data get a model row and a report.
    # The set is built once rather than once per candidate country.
    known_countries = set(df['Country'].unique())
    model_countries = [c for c in focus_countries if c in known_countries]

    _train_forecast_model(df, model_countries, year_cols, end_year)

    for country in model_countries:
        _plot_country_report(df, country, economic_metrics, year_cols)

if __name__ == "__main__":
    # Driver: load the workbook, report which year columns were detected,
    # then run every analysis stage.
    df, year_cols = load_data()
    print("Available analysis years: " + str(year_cols))

    perform_analysis(df, year_cols)
    print("Analysis complete! Check photos/ directory for outputs.")

Available analysis years: ['1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024', '2025']
Analysis complete! Check photos/ directory for outputs.


In [3]:
# Reload the feature list that perform_analysis saved next to the model, to
# check which year columns the forecasting model was trained on.
print("Required years:", joblib.load('model/top_features.pkl'))

Required years: ['1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
