In [2]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set Seaborn style
sns.set_style('whitegrid')

In [6]:
# Step 2: Load Dataset
data_path = "/Users/deepakdeokar/Desktop/GDP_Forecasting_Project/data/worldbank_gdp_1960_2023.csv"

# Load the dataset
df = pd.read_csv(data_path)

In [10]:
# Step 3: Initial Data Overview
print("\nFirst 5 rows of the dataset:")
print(df.head())

print("\nDataset shape:", df.shape)
print("\nDataset columns:", df.columns.tolist())


First 5 rows of the dataset:
       country  Year  GDP Growth Rate (%)  GDP (current US$)  \
0  Afghanistan  1960                  NaN                NaN   
1  Afghanistan  1961                  NaN                NaN   
2  Afghanistan  1962                  NaN                NaN   
3  Afghanistan  1963                  NaN                NaN   
4  Afghanistan  1964                  NaN                NaN   

   GNI per capita (current US$)  Exports (% of GDP)  Imports (% of GDP)  
0                           NaN                 NaN                 NaN  
1                           NaN                 NaN                 NaN  
2                           NaN                 NaN                 NaN  
3                           NaN                 NaN                 NaN  
4                           NaN                 NaN                 NaN  

Dataset shape: (17024, 7)

Dataset columns: ['country', 'Year', 'GDP Growth Rate (%)', 'GDP (current US$)', 'GNI per capita (current US$)', 

In [9]:
# Step 4: Basic Info
print("\nData Types and Missing Values:")
df.info()

print("\nSummary Statistics:")
print(df.describe())


Data Types and Missing Values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17024 entries, 0 to 17023
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       17024 non-null  object 
 1   Year                          17024 non-null  int64  
 2   GDP Growth Rate (%)           13883 non-null  float64
 3   GDP (current US$)             14307 non-null  float64
 4   GNI per capita (current US$)  12702 non-null  float64
 5   Exports (% of GDP)            11047 non-null  float64
 6   Imports (% of GDP)            11069 non-null  float64
dtypes: float64(5), int64(1), object(1)
memory usage: 931.1+ KB

Summary Statistics:
               Year  GDP Growth Rate (%)  GDP (current US$)  \
count  17024.000000         13883.000000       1.430700e+04   
mean    1991.500000             3.673056       1.184720e+12   
std       18.473496             6.169865       5.489324

In [11]:
# Step 5: Check Missing Values
print("\nMissing Value Count Per Column:")
print(df.isnull().sum())


Missing Value Count Per Column:
country                            0
Year                               0
GDP Growth Rate (%)             3141
GDP (current US$)               2717
GNI per capita (current US$)    4322
Exports (% of GDP)              5977
Imports (% of GDP)              5955
dtype: int64


In [12]:
# Step 6: Unique Countries and Year Range
print("\nNumber of unique countries:", df['country'].nunique())
print("Year range:", df['Year'].min(), "to", df['Year'].max())


Number of unique countries: 266
Year range: 1960 to 2023


In [15]:
# Step 7: Check Sample Countries with Missing Data
missing_by_country = df.groupby('country').apply(lambda x: x.isnull().sum())
print("\nSample missing values by country:")
missing_by_country.head()


Sample missing values by country:


  missing_by_country = df.groupby('country').apply(lambda x: x.isnull().sum())


Unnamed: 0_level_0,country,Year,GDP Growth Rate (%),GDP (current US$),GNI per capita (current US$),Exports (% of GDP),Imports (% of GDP)
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Afghanistan,0,0,41,40,42,60,60
Africa Eastern and Southern,0,0,1,0,2,20,30
Africa Western and Central,0,0,1,0,48,64,64
Albania,0,0,21,20,24,20,20
Algeria,0,0,1,0,2,0,0


# Preprocessing

In [16]:
# Step 9: Deep Cleaning

## 9.1 Strip extra spaces in country names
df['country'] = df['country'].str.strip()

## 9.2 Ensure Year is integer
df['Year'] = df['Year'].astype(int)

## 9.3 Sort dataset properly before interpolation
df = df.sort_values(by=['country', 'Year'])

In [18]:
## 9.4 Interpolate missing GDP Growth Rate (%) country-wise
df['GDP Growth Rate (%)'] = df.groupby('country')['GDP Growth Rate (%)'].transform(lambda x: x.interpolate(method='linear'))

## 9.5 Forward Fill and Backward Fill missing GDP Growth Rate (%)
df['GDP Growth Rate (%)'] = df.groupby('country')['GDP Growth Rate (%)'].transform(lambda x: x.ffill().bfill())

## 9.6 Fill remaining missing GDP Growth Rate (%) with Global Median
gdp_median = df['GDP Growth Rate (%)'].median()
df['GDP Growth Rate (%)'] = df['GDP Growth Rate (%)'].fillna(gdp_median)


In [19]:
## 9.7 Repeat the same for Exports, Imports, GNI, GDP
columns_to_clean = ['Exports (% of GDP)', 'Imports (% of GDP)', 'GNI per capita (current US$)', 'GDP (current US$)']

for col in columns_to_clean:
    df[col] = df.groupby('country')[col].transform(lambda x: x.interpolate(method='linear'))
    df[col] = df.groupby('country')[col].transform(lambda x: x.ffill().bfill())
    col_median = df[col].median()
    df[col] = df[col].fillna(col_median)


In [20]:
## 9.8 Final Check
print("\nAfter Cleaning - Missing Value Count:")
print(df.isnull().sum())



After Cleaning - Missing Value Count:
country                         0
Year                            0
GDP Growth Rate (%)             0
GDP (current US$)               0
GNI per capita (current US$)    0
Exports (% of GDP)              0
Imports (% of GDP)              0
dtype: int64


In [21]:
# Step 10: Save Cleaned Data (Optional)
clean_save_path = "../data/worldbank_gdp_cleaned.csv"
df.to_csv(clean_save_path, index=False)
print(f"\n✅ Cleaned dataset saved to: {clean_save_path}")


✅ Cleaned dataset saved to: ../data/worldbank_gdp_cleaned.csv
