# Economic Indicators Dataset Analysis

This notebook loads, cleans, and merges economic indicators from World Bank data sources for comprehensive analysis.

In [None]:
import pandas as pd
import numpy as np
from io import StringIO
import html
import re
import os
import warnings
warnings.filterwarnings("ignore")

## 1. Import Required Libraries

Import necessary libraries for data processing and analysis.

In [None]:
# Folder holding the raw World Bank downloads, relative to the working directory.
data_folder = os.path.join('..', 'data/row')

# Short dataset key -> file name inside ``data_folder``.
_file_names = {
    "size_economy": "WV.1_Size_of_the_economy.xls",
    "growth_gdp": "4.1_Growth_of_Gross_Domestic_Product.xls",
    "structure_value": "4.2_Structure_of_value_added.xls",
    "structure_manufacturing": "4.3_Structure_of_manufacturing.xls",
    "structure_exports": "4.4_Structure_of_merchandise_exports.xls",
    "unemployment": "2.5_Unemployment.xls",
    "poverty1": "1.2_Poverty_rates_at_international_poverty_lines.xls",
    "poverty2": "1.2.2_Poverty_rates_at_international_poverty_lines_Part_2.xls",
}

# Short dataset key -> full path, in the same key order as the table above.
files = {
    dataset_key: os.path.join(data_folder, file_name)
    for dataset_key, file_name in _file_names.items()
}

## 2. Define File Paths

Set up the data folder and define file paths for all economic indicator datasets.

In [128]:
def _unique_names(names):
    """Return ``names`` with repeats made unique by appending ``.1``, ``.2``, ..."""
    taken = set()
    counters = {}
    unique = []
    for name in names:
        candidate = name
        # Keep bumping the suffix until the candidate clashes with nothing so far.
        while candidate in taken:
            counters[name] = counters.get(name, 0) + 1
            candidate = f"{name}.{counters[name]}"
        taken.add(candidate)
        unique.append(candidate)
    return unique


def load_and_clean_table(path, skip_cols=0, table_index=0):
    """Load one table and return it with flat column names and numeric values.

    The file is first parsed as HTML (an ".xls" download may be an HTML table
    in disguise). If that fails for any reason, the file is read as a real
    Excel workbook with four header rows instead.

    Parameters
    ----------
    path : str
        Location of the file to load.
    skip_cols : int, default 0
        Number of columns directly after the first (country) column to drop.
    table_index : int, default 0
        Position of the table to use when HTML parsing yields several tables.
        The default keeps the original behaviour (first table). Ignored by the
        Excel fallback.

    Returns
    -------
    pandas.DataFrame
        The first column is named ``'Country'``. Every other column is passed
        through ``pd.to_numeric(errors='coerce')``, so the ``'..'`` marker and
        any other non-numeric text become NaN.

    Raises
    ------
    IndexError
        If ``table_index`` is outside the list of parsed HTML tables.
    """
    try:
        # First try reading as HTML disguised as XLS.
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            html_content = f.read()
        tables = pd.read_html(StringIO(html_content))
    except Exception:
        # Deliberate best-effort: anything that stops HTML parsing sends us to
        # the Excel reader below.
        tables = None

    if tables is None:
        df = pd.read_excel(path, header=[0, 1, 2, 3])
    else:
        # Indexing happens outside the try so a bad table_index is reported
        # as such instead of being masked by the Excel fallback.
        df = tables[table_index]

    # ---- Flatten multi-row headers, dropping the 'Unnamed' placeholder levels ----
    if isinstance(df.columns, pd.MultiIndex):
        names = [
            '_'.join(str(level) for level in col if 'Unnamed' not in str(level))
            for col in df.columns
        ]
    else:
        names = [str(col) for col in df.columns]

    # First column -> Country. Set by position so that a later column sharing
    # the first column's label is not renamed as well.
    names[0] = 'Country'

    # Flattening can make two headers collide; duplicate labels would make
    # df[col] return a DataFrame and break the numeric conversion below.
    df.columns = _unique_names(names)

    # Skip the first N data columns (the Country column is always kept).
    if skip_cols > 0:
        keep = ['Country'] + list(df.columns[1 + skip_cols:])
        df = df[keep]

    # Work on an independent copy rather than a slice of the parsed table.
    df = df.copy()

    # Replace the '..' missing-value marker (matters for the Country column;
    # the numeric conversion below would coerce it anyway).
    df = df.replace('..', pd.NA)

    # Convert every non-Country column to numbers.
    for col in df.columns:
        if col != 'Country':
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df


## 3. Define Data Cleaning Function

Create a function to load and clean Excel/HTML tables with proper header handling and data type conversion.

In [129]:
print("\nüì• 1. Loading 'Size of Economy' file (skipping first 3 columns)...")
size_df = load_and_clean_table(files["size_economy"], skip_cols=3)
print(f"   ‚úÖ Loaded: {size_df.shape}")
print(f"   Columns: {list(size_df.columns)}")
size_df.head()


üì• 1. Loading 'Size of Economy' file (skipping first 3 columns)...
   ‚úÖ Loaded: (227, 7)
   Columns: ['Country', 'Gross national income, Atlas method_$ billions_2024', 'Gross national income per capita, Atlas method_$_2024', 'Purchasing power parity gross national income_$ billions_2024', 'Purchasing power parity gross national income_per capita_$_2024', 'Gross domestic product_% growth_2024', 'Gross domestic product_per capita_% growth_2024']


Unnamed: 0,Country,"Gross national income, Atlas method_$ billions_2024","Gross national income per capita, Atlas method_$_2024",Purchasing power parity gross national income_$ billions_2024,Purchasing power parity gross national income_per capita_$_2024,Gross domestic product_% growth_2024,Gross domestic product_per capita_% growth_2024
0,Afghanistan,15.5448,370.0,91.71,2210.0,2.266944,0.106093
1,Albania,23.5944,8690.0,63.2762,23310.0,3.961719,5.162522
2,Algeria,249.052,5320.0,806.013,17220.0,3.3,1.86552
3,American Samoa,,,,,1.735016,3.593276
4,Andorra,4.00449,48870.0,6.40494,78170.0,3.37182,2.006784


## 4. Load Size of Economy Data

Load the first dataset, the size-of-the-economy indicators, skipping the first three data columns after the country name.

In [130]:
print("\nüì• 2. Loading economy files...")

# 2.1 Growth of GDP (4.1) - This is "growth of output"
print("   ‚Ä¢ Loading 4.1 Growth of Gross Domestic Product...")
growth_gdp_df = load_and_clean_table(files["growth_gdp"], skip_cols=0)
print(f"     ‚úÖ Shape: {growth_gdp_df.shape}")

# 2.2 Structure of value added (4.2) - This is "structure of output"
print("   ‚Ä¢ Loading 4.2 Structure of value added...")
structure_value_df = load_and_clean_table(files["structure_value"], skip_cols=0)
print(f"     ‚úÖ Shape: {structure_value_df.shape}")

# 2.3 Structure of manufacturing (4.3)
print("   ‚Ä¢ Loading 4.3 Structure of manufacturing...")
structure_manuf_df = load_and_clean_table(files["structure_manufacturing"], skip_cols=0)
print(f"     ‚úÖ Shape: {structure_manuf_df.shape}")

# 2.4 Structure of merchandise exports (4.4)
print("   ‚Ä¢ Loading 4.4 Structure of merchandise exports...")
structure_exports_df = load_and_clean_table(files["structure_exports"], skip_cols=0)
print(f"     ‚úÖ Shape: {structure_exports_df.shape}")


üì• 2. Loading economy files...
   ‚Ä¢ Loading 4.1 Growth of Gross Domestic Product...
     ‚úÖ Shape: (226, 11)
   ‚Ä¢ Loading 4.2 Structure of value added...
     ‚úÖ Shape: (226, 11)
   ‚Ä¢ Loading 4.3 Structure of manufacturing...
     ‚úÖ Shape: (226, 13)
   ‚Ä¢ Loading 4.4 Structure of merchandise exports...
     ‚úÖ Shape: (226, 13)


## 5. Load Economic Structure Data

Load datasets related to GDP growth, value added structure, manufacturing, and export structures.

In [131]:
# Preview the leading rows of the GDP growth table (4.1) to check the flattened headers.
growth_gdp_df.head()

Unnamed: 0,Country,Gross domestic product_average annual real growth (%)_2010-2020,Gross domestic product_average annual real growth (%)_2020-2024,Agriculture_average annual real growth (%)_2010-2020,Agriculture_average annual real growth (%)_2020-2024,Industry_average annual real growth (%)_2010-2020,Industry_average annual real growth (%)_2020-2024,Manufacturing_average annual real growth (%)_2010-2020,Manufacturing_average annual real growth (%)_2020-2024,Services_average annual real growth (%)_2010-2020,Services_average annual real growth (%)_2020-2024
0,Afghanistan,3.2,-8.5,2.9,-5.0,4.8,-5.8,5.7,-7.0,3.2,-12.2
1,Albania,2.3,5.2,2.1,-2.6,1.9,5.3,5.7,0.2,3.0,7.8
2,Algeria,2.3,3.7,4.2,2.2,1.0,5.1,4.6,7.7,3.5,3.7
3,American Samoa,-0.7,0.5,,,,,,,,
4,Andorra,0.0,5.9,0.7,1.0,-0.2,6.1,0.0,2.7,0.2,5.2


## 6. Preview GDP Growth Data

In [132]:
# Preview the leading rows of the structure-of-value-added table (4.2).
structure_value_df.head()

Unnamed: 0,Country,Gross domestic product_$ billions_2015,Gross domestic product_$ billions_2024,Agriculture_% of GDP_2015,Agriculture_% of GDP_2024,Industry_% of GDP_2015,Industry_% of GDP_2024,Manufacturing_% of GDP_2015,Manufacturing_% of GDP_2024,Services_% of GDP_2015,Services_% of GDP_2024
0,Afghanistan,19.1,17.2,20.6,34.7,22.1,13.4,11.4,7.5,53.2,46.4
1,Albania,11.5,27.2,18.7,15.5,24.5,22.4,6.3,6.2,42.9,48.9
2,Algeria,187.5,263.6,10.5,13.1,32.8,37.8,7.1,9.3,52.1,45.6
3,American Samoa,0.7,0.9,,,,,,,,
4,Andorra,2.8,4.0,0.5,0.5,10.0,12.8,3.7,3.4,78.7,77.6


## 7. Preview Structure of Value Added Data

In [133]:
# Preview the leading rows of the structure-of-manufacturing table (4.3).
structure_manuf_df.head()

Unnamed: 0,Country,Manufacturing value added_$ billions_2010,Manufacturing value added_$ billions_2022,"Food, beverages and tobacco_%of total_2010","Food, beverages and tobacco_%of total_2022",Textiles and clothing_%of total_2010,Textiles and clothing_%of total_2022,Machinery and transport equipment_%of total_2010,Machinery and transport equipment_%of total_2022,Chemicals_%of total_2010,Chemicals_%of total_2022,Other manufacturing_%of total_2010,Other manufacturing_%of total_2022
0,Afghanistan,2.0,1.5,,,,,,,,,,
1,Albania,0.7,1.4,17.7,19.9,26.4,30.7,2.8,36.3,2.1,1.8,50.9,11.2
2,Algeria,16.7,23.0,,,,,,,,,,
3,American Samoa,,,,,,,,,,,,
4,Andorra,0.1,0.1,,,,,,,,,,


## 8. Preview Manufacturing Structure Data

In [134]:
# Preview the leading rows of the merchandise-exports table (4.4).
structure_exports_df.head()

Unnamed: 0,Country,Merchandise exports_$ millions_2015,Merchandise exports_$ millions_2023,Food_% of total_2015,Food_% of total_2023,Agricultural raw materials_% of total_2015,Agricultural raw materials_% of total_2023,Fuels_% of total_2015,Fuels_% of total_2023,Ores and metals_% of total_2015,Ores and metals_% of total_2023,Manufactures_% of total_2015,Manufactures_% of total_2023
0,Afghanistan,571.0,847.0,48.5,,14.8,,3.5,,1.1,,15.9,
1,Albania,1917.0,4374.0,5.6,12.0,2.3,0.4,8.8,6.3,7.5,5.3,52.7,56.6
2,Algeria,34668.0,55555.0,0.7,,0.0,,95.8,,0.3,,3.1,
3,American Samoa,379.0,395.0,,,,,,,,,,
4,Andorra,90.0,252.0,1.1,1.1,1.6,0.4,0.0,0.0,4.5,1.8,88.8,95.0


## 9. Preview Export Structure Data

In [135]:
print("\nüì• 3. Loading unemployment data...")
unemployment_df = load_and_clean_table(files["unemployment"], skip_cols=0)
print(f"   ‚úÖ Shape: {unemployment_df.shape}")
unemployment_df.head()


üì• 3. Loading unemployment data...
   ‚úÖ Shape: (264, 12)


Unnamed: 0,Country,"Unemployment, male (% of male labor force) (modeled ILO estimate)_2015","Unemployment, male (% of male labor force) (modeled ILO estimate)_2021","Unemployment, female (% of female labor force) (modeled ILO estimate)_2015","Unemployment, female (% of female labor force) (modeled ILO estimate)_2021","Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)_2015","Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)_2021","Unemployment, youth female (% of female labor force ages 15-24) (modeled ILO estimate)_2015","Unemployment, youth female (% of female labor force ages 15-24) (modeled ILO estimate)_2021",Unemployment with basic education (% of total labor force with basic education)_2015-21,Unemployment with intermediate education (% of total labor force with intermediate education)_2015-21,Unemployment with advanced education (% of total labor force with advanced education)_2015-21
0,Afghanistan,8.401,10.911,11.611,16.945,11.537,15.033,16.05,21.372,6.034,9.459,9.797
1,Albania,17.248,11.307,17.122,11.678,39.3,25.414,40.195,30.076,10.109,13.279,10.379
2,Algeria,10.028,11.638,16.741,23.351,27.396,30.659,45.216,51.12,,,
3,American Samoa,,,,,,,,,,,
4,Andorra,,,,,,,,,,,


## 10. Load Unemployment Data

In [136]:
print("\nüì• 4. Loading poverty data...")

# 4.1 First poverty file (1.2)
print("   ‚Ä¢ Loading 1.2 Poverty rates at international poverty lines...")
poverty1_df = load_and_clean_table(files["poverty1"], skip_cols=0)
print(f"     ‚úÖ Shape: {poverty1_df.shape}")

# 4.2 Second poverty file (1.2.2)
print("   ‚Ä¢ Loading 1.2.2 Poverty rates at international poverty lines Part 2...")
poverty2_df = load_and_clean_table(files["poverty2"], skip_cols=0)
print(f"     ‚úÖ Shape: {poverty2_df.shape}")


üì• 4. Loading poverty data...
   ‚Ä¢ Loading 1.2 Poverty rates at international poverty lines...
     ‚úÖ Shape: (217, 5)
   ‚Ä¢ Loading 1.2.2 Poverty rates at international poverty lines Part 2...
     ‚úÖ Shape: (2, 19)


## 11. Load Poverty Data

Load poverty rates at international poverty lines (two datasets).

In [137]:
# Show the (rows, columns) shape, then preview the leading rows of the first poverty table (1.2).
print(poverty1_df.shape)
poverty1_df.head()

(217, 5)


Unnamed: 0,Country,International poverty lines_Population below $3.00 a day_%,International poverty lines_Population below $3.00 a day_%.1,International poverty lines_Population below $4.20 a day_%,International poverty lines_Population below $8.30 a day_%
0,,,,,
1,Afghanistan,,,,
2,Albania,2020.0,0.3,2.1,19.9
3,Algeria,2011.0,0.0,4.7,41.8
4,American Samoa,,,,


## 12. Preview First Poverty Dataset

In [138]:
# Preview the second poverty table (1.2.2).
# NOTE(review): the recorded output shows only two rows with no country names and
# columns labelled 1..18 - this looks like a header fragment rather than data; verify.
poverty2_df.head()

Unnamed: 0,Country,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,,,,,,,,,,,,,,,,,,,
1,,1990.0,1993.0,1996.0,1999.0,2002.0,2005.0,2008.0,2010.0,2011.0,2012.0,2013.0,2015.0,2018.0,2019.0,2020.0,2021.0,2022.0,2023.0


## 13. Preview Second Poverty Dataset

In [139]:
print("\nüîç 5. Checking for duplicate column names...")

# Collect all column names (except 'Country') from all dataframes
all_columns = {}

# Add columns from each dataframe
dataframes = {
    "Size": size_df,
    "GDP_Growth": growth_gdp_df,
    "Structure_Value": structure_value_df,
    "Structure_Manufacturing": structure_manuf_df,
    "Structure_Exports": structure_exports_df,
    "Unemployment": unemployment_df,
    "Poverty1": poverty1_df,
    "Poverty2": poverty2_df
}

for df_name, df in dataframes.items():
    print(f"\n   Checking {df_name}...")
    cols = [col for col in df.columns if col != 'Country']
    
    # Check for duplicates within this dataframe
    duplicates = [col for col in cols if cols.count(col) > 1]
    if duplicates:
        print(f"     ‚ö†Ô∏è  Internal duplicates found: {duplicates}")
    
    # Track columns across dataframes
    for col in cols:
        if col in all_columns:
            all_columns[col].append(df_name)
        else:
            all_columns[col] = [df_name]

# Find columns that appear in multiple dataframes
print("\n   Columns appearing in multiple dataframes:")
duplicate_found = False
for col, sources in all_columns.items():
    if len(sources) > 1:
        print(f"     ‚ö†Ô∏è  '{col}' appears in: {', '.join(sources)}")
        duplicate_found = True

if not duplicate_found:
    print("     ‚úÖ No duplicate column names across dataframes")

# Show column counts
print(f"\n   Total unique column names (excluding 'Country'): {len(all_columns)}")
print(f"   Column distribution:")
for df_name, df in dataframes.items():
    cols = [col for col in df.columns if col != 'Country']
    print(f"     ‚Ä¢ {df_name}: {len(cols)} columns")


üîç 5. Checking for duplicate column names...

   Checking Size...

   Checking GDP_Growth...

   Checking Structure_Value...

   Checking Structure_Manufacturing...

   Checking Structure_Exports...

   Checking Unemployment...

   Checking Poverty1...

   Checking Poverty2...

   Columns appearing in multiple dataframes:
     ‚úÖ No duplicate column names across dataframes

   Total unique column names (excluding 'Country'): 83
   Column distribution:
     ‚Ä¢ Size: 6 columns
     ‚Ä¢ GDP_Growth: 10 columns
     ‚Ä¢ Structure_Value: 10 columns
     ‚Ä¢ Structure_Manufacturing: 12 columns
     ‚Ä¢ Structure_Exports: 12 columns
     ‚Ä¢ Unemployment: 11 columns
     ‚Ä¢ Poverty1: 4 columns
     ‚Ä¢ Poverty2: 18 columns


## 14. Check for Duplicate Column Names

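Verify that no indicator column name is repeated, either within one dataframe or across dataframes, so that the later merge on `Country` does not have to disambiguate clashing names with `_x`/`_y` suffixes.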

In [140]:
print("\nüè∑Ô∏è 6. Adding prefixes for clarity (optional)...")

print("   Note: No duplicates found, but adding prefixes for better identification")
print("   Applying prefixes:")

# Add prefixes to each dataframe
growth_gdp_df = growth_gdp_df.rename(columns={
    col: f"GDP_Growth_{col}" if col != 'Country' else col 
    for col in growth_gdp_df.columns[1:]
})

structure_value_df = structure_value_df.rename(columns={
    col: f"Structure_Value_{col}" if col != 'Country' else col 
    for col in structure_value_df.columns[1:]
})

structure_manuf_df = structure_manuf_df.rename(columns={
    col: f"Structure_Manufacturing_{col}" if col != 'Country' else col 
    for col in structure_manuf_df.columns[1:]
})

structure_exports_df = structure_exports_df.rename(columns={
    col: f"Structure_Exports_{col}" if col != 'Country' else col 
    for col in structure_exports_df.columns[1:]
})

unemployment_df = unemployment_df.rename(columns={
    col: f"Unemployment_{col}" if col != 'Country' else col 
    for col in unemployment_df.columns[1:]
})

poverty1_df = poverty1_df.rename(columns={
    col: f"Poverty_IntlLine1_{col}" if col != 'Country' else col 
    for col in poverty1_df.columns[1:]
})

poverty2_df = poverty2_df.rename(columns={
    col: f"Poverty_IntlLine2_{col}" if col != 'Country' else col 
    for col in poverty2_df.columns[1:]
})

print("   ‚úÖ All prefixes added")


üè∑Ô∏è 6. Adding prefixes for clarity (optional)...
   Note: No duplicates found, but adding prefixes for better identification
   Applying prefixes:
   ‚úÖ All prefixes added


## 15. Add Prefixes to Columns

Add descriptive prefixes to all columns for better clarity and identification of data sources.

In [141]:
print("\nüîó 7. Merging all data for ALL countries...")

# First, ensure all 'Country' columns are strings
print("   Converting 'Country' columns to strings...")

def ensure_country_string(df):
    """Ensure Country column is string type"""
    if 'Country' in df.columns:
        df['Country'] = df['Country'].astype(str).str.strip()
    return df

# Apply to all dataframes
size_df = ensure_country_string(size_df)
growth_gdp_df = ensure_country_string(growth_gdp_df)
structure_value_df = ensure_country_string(structure_value_df)
structure_manuf_df = ensure_country_string(structure_manuf_df)
structure_exports_df = ensure_country_string(structure_exports_df)
unemployment_df = ensure_country_string(unemployment_df)
poverty1_df = ensure_country_string(poverty1_df)
poverty2_df = ensure_country_string(poverty2_df)

print("   ‚úÖ All 'Country' columns converted to strings")

# Start with size of economy
economic_indicators_dataset = size_df.copy()
print(f"   Starting with Size: {size_df.shape[0]} countries, {size_df.shape[1]-1} indicators")

# List of all dataframes to merge (in order)
dataframes_to_merge = [
    ("GDP Growth", growth_gdp_df),
    ("Structure Value", structure_value_df),
    ("Structure Manufacturing", structure_manuf_df),
    ("Structure Exports", structure_exports_df),
    ("Unemployment", unemployment_df),
    ("Poverty Intl Line 1", poverty1_df),
    ("Poverty Intl Line 2", poverty2_df)
]

# Merge one by one with error handling
total_indicators = size_df.shape[1] - 1
merge_count = 0

for name, df in dataframes_to_merge:
    if not df.empty and 'Country' in df.columns:
        try:
            before_countries = economic_indicators_dataset.shape[0]
            before_indicators = economic_indicators_dataset.shape[1] - 1
            
            # Check for common countries before merge
            common_countries = set(economic_indicators_dataset['Country']).intersection(set(df['Country']))
            print(f"\n   Merging {name}...")
            print(f"     Common countries: {len(common_countries)}")
            
            # Merge using outer join
            economic_indicators_dataset = pd.merge(economic_indicators_dataset, df, on='Country', how='outer')
            merge_count += 1
            
            after_countries = economic_indicators_dataset.shape[0]
            after_indicators = economic_indicators_dataset.shape[1] - 1
            new_indicators = after_indicators - before_indicators
            
            print(f"     ‚úÖ Successfully merged!")
            print(f"       Countries: {before_countries} ‚Üí {after_countries}")
            print(f"       Indicators: +{new_indicators} (total: {after_indicators})")
            
        except Exception as e:
            print(f"     ‚ùå Error merging {name}: {str(e)}")
            print(f"     Trying alternative merge method...")
            
            # Alternative method: Use concat with alignment
            try:
                # Reset index and merge manually
                temp1 = economic_indicators_dataset.set_index('Country')
                temp2 = df.set_index('Country')
                combined = pd.concat([temp1, temp2], axis=1)
                economic_indicators_dataset = combined.reset_index().rename(columns={'index': 'Country'})
                print(f"     ‚úÖ Merged using alternative method")
            except Exception as e2:
                print(f"     ‚ùå Alternative method also failed: {e2}")
    else:
        print(f"\n   ‚ö†Ô∏è  Skipping {name}: DataFrame is empty or missing 'Country' column")

print(f"\n   üìä Merge completed: {merge_count}/{len(dataframes_to_merge)} dataframes merged")
print(f"   Final dataset shape: {economic_indicators_dataset.shape}")
print(f"   üåç Total countries/regions: {economic_indicators_dataset.shape[0]}")
print(f"   üìà Total economic indicators: {economic_indicators_dataset.shape[1] - 1}")

# Check data types
print(f"\n   üîç Data type check:")
print(f"     'Country' column type: {economic_indicators_dataset['Country'].dtype}")
print(f"     Sample country names: {economic_indicators_dataset['Country'].head(5).tolist()}")

# Check for Algeria
print(f"\n   üîç Looking for Algeria...")
algeria_matches = economic_indicators_dataset[economic_indicators_dataset['Country'].str.contains('Algeria', case=False, na=False)]
if not algeria_matches.empty:
    print(f"     ‚úÖ Algeria found! {len(algeria_matches)} match(es)")
    for idx, row in algeria_matches.iterrows():
        print(f"       Row {idx}: '{row['Country']}'")
else:
    print("     ‚ùå Algeria not found in dataset")
    # Try to find similar names
    all_countries = economic_indicators_dataset['Country'].astype(str).tolist()
    similar = [c for c in all_countries if 'alg' in c.lower()]
    if similar:
        print(f"     Similar names found: {similar}")


üîó 7. Merging all data for ALL countries...
   Converting 'Country' columns to strings...
   ‚úÖ All 'Country' columns converted to strings
   Starting with Size: 227 countries, 6 indicators

   Merging GDP Growth...
     Common countries: 226
     ‚úÖ Successfully merged!
       Countries: 227 ‚Üí 227
       Indicators: +10 (total: 16)

   Merging Structure Value...
     Common countries: 226
     ‚úÖ Successfully merged!
       Countries: 227 ‚Üí 227
       Indicators: +10 (total: 26)

   Merging Structure Manufacturing...
     Common countries: 226
     ‚úÖ Successfully merged!
       Countries: 227 ‚Üí 227
       Indicators: +12 (total: 38)

   Merging Structure Exports...
     Common countries: 226
     ‚úÖ Successfully merged!
       Countries: 227 ‚Üí 227
       Indicators: +12 (total: 50)

   Merging Unemployment...
     Common countries: 227
     ‚úÖ Successfully merged!
       Countries: 227 ‚Üí 264
       Indicators: +11 (total: 61)

   Merging Poverty Intl Line 1...
    

## 16. Merge All Datasets

Merge all economic indicator dataframes into a single comprehensive dataset using outer joins.

In [142]:
print("\nüßπ 8. Cleaning merged dataset...")

# Remove any duplicate country rows
before_dedup = economic_indicators_dataset.shape[0]
economic_indicators_dataset = economic_indicators_dataset.drop_duplicates(subset='Country', keep='first')
after_dedup = economic_indicators_dataset.shape[0]
if before_dedup != after_dedup:
    print(f"   Removed {before_dedup - after_dedup} duplicate country rows")

# Clean country names (remove extra spaces, fix capitalization)
economic_indicators_dataset['Country'] = economic_indicators_dataset['Country'].str.strip()

# Replace any remaining '..' with NaN
economic_indicators_dataset = economic_indicators_dataset.replace('..', pd.NA)

# Sort by country name
economic_indicators_dataset = economic_indicators_dataset.sort_values('Country').reset_index(drop=True)

print(f"   ‚úÖ Dataset cleaned and sorted")
print(f"   Final shape: {economic_indicators_dataset.shape}")

# ==============================================
# 9. SAVE DATASET 1
# ==============================================
print("\nüíæ 9. Saving Dataset 1...")

output_filename = "../data/preprocessed/economic_indicators_dataset.csv"
economic_indicators_dataset.to_csv(output_filename, index=False)

print(f"   ‚úÖ Saved as: {output_filename}")

# Verify the file
if os.path.exists(output_filename):
    verify_df = pd.read_csv(output_filename)
    print(f"   üìä Verification: {verify_df.shape[0]} rows, {verify_df.shape[1]} columns")
    print(f"   üåç Countries count: {verify_df.shape[0]}")
    print(f"   üìà Indicators count: {verify_df.shape[1] - 1}")
    
    # Check for Algeria
    algeria_in_file = verify_df[verify_df['Country'].str.contains('Algeria', case=False, na=False)]
    if not algeria_in_file.empty:
        print(f"   üá©üáø Algeria in saved file: YES")
    else:
        print(f"   üá©üáø Algeria in saved file: NO")
else:
    print("   ‚ùå ERROR: File was not created")

print("\n" + "="*60)
print("‚úÖ DATASET 1 CREATION COMPLETED!")
print("="*60)


üßπ 8. Cleaning merged dataset...
   Removed 1 duplicate country rows
   ‚úÖ Dataset cleaned and sorted
   Final shape: (266, 84)

üíæ 9. Saving Dataset 1...
   ‚úÖ Saved as: ../data/preprocessed/economic_indicators_dataset.csv
   üìä Verification: 266 rows, 84 columns
   üåç Countries count: 266
   üìà Indicators count: 83
   üá©üáø Algeria in saved file: YES

‚úÖ DATASET 1 CREATION COMPLETED!


## 17. Final Dataset Cleaning and Export

Clean the merged dataset, remove duplicates, sort by country, and save to CSV file.

In [143]:
# List every column of the merged dataset: 'Country' first, then the indicator columns.
print(economic_indicators_dataset.columns)

Index(['Country', 'Gross national income, Atlas method_$ billions_2024',
       'Gross national income per capita, Atlas method_$_2024',
       'Purchasing power parity gross national income_$ billions_2024',
       'Purchasing power parity gross national income_per capita_$_2024',
       'Gross domestic product_% growth_2024',
       'Gross domestic product_per capita_% growth_2024',
       'GDP_Growth_Gross domestic product_average annual real growth (%)_2010-2020',
       'GDP_Growth_Gross domestic product_average annual real growth (%)_2020-2024',
       'GDP_Growth_Agriculture_average annual real growth (%)_2010-2020',
       'GDP_Growth_Agriculture_average annual real growth (%)_2020-2024',
       'GDP_Growth_Industry_average annual real growth (%)_2010-2020',
       'GDP_Growth_Industry_average annual real growth (%)_2020-2024',
       'GDP_Growth_Manufacturing_average annual real growth (%)_2010-2020',
       'GDP_Growth_Manufacturing_average annual real growth (%)_2020-2024',
