In [1]:
import io
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Exploratory data analysis of the cleaned GDP dataset: loads the CSV, then
# writes a text overview, two CSV summaries and five PNG figures into
# `results_folder`.

# Load dataset
file_path = r"D:\Projects\GDP_Prediction_Project\data\processed\cleaned_data.csv"
df = pd.read_csv(file_path)

# Define results folder
results_folder = r"D:\Projects\GDP_Prediction_Project\results"
os.makedirs(results_folder, exist_ok=True)  # Create folder if not exists


def _save_and_close(fig, filename):
    """Save ``fig`` into ``results_folder`` under ``filename``, then close it.

    ``bbox_inches="tight"`` is used so that long or rotated tick labels are
    not clipped at the image border. Closing the figure explicitly releases
    its memory instead of relying on pyplot's "current figure".
    """
    fig.savefig(os.path.join(results_folder, filename), bbox_inches="tight")
    plt.close(fig)


# Save dataset overview.
# DataFrame.info() only prints and returns None, so render the summary once
# into a buffer: `dataset_info` then holds the actual text, which is both
# echoed to the cell output and written to disk.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
dataset_info = info_buffer.getvalue()
print(dataset_info, end="")
with open(os.path.join(results_folder, "dataset_info.txt"), "w", encoding="utf-8") as f:
    f.write(dataset_info)

# Save missing values summary (count of nulls per column)
missing_values = df.isnull().sum()
missing_values.to_csv(os.path.join(results_folder, "missing_values.csv"))

# Save summary statistics
summary_stats = df.describe()
summary_stats.to_csv(os.path.join(results_folder, "summary_statistics.csv"))

# Correlation matrix.
# Restrict to numeric columns explicitly: calling .corr() on a frame that
# contains any non-numeric column raises on recent pandas versions.
numeric_df = df.select_dtypes(include=[np.number])
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    numeric_df.corr(),
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix of Economic Indicators")
_save_and_close(fig, "correlation_matrix.png")

# Check distribution of GDP Growth
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["GDP Growth (%)"], kde=True, bins=20, ax=ax)
ax.set_title("Distribution of GDP Growth (%)")
ax.set_xlabel("GDP Growth (%)")
ax.set_ylabel("Frequency")
_save_and_close(fig, "gdp_growth_distribution.png")

# Time series trend of GDP Growth
if "Year" in df.columns:
    # NOTE: this converts `Year` in place to datetime, so from here on it is
    # no longer a numeric column and is not picked up by the numeric-column
    # selection used for the pairplot and boxplot below.
    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(x="Year", y="GDP Growth (%)", data=df, marker="o", ax=ax)
    ax.set_title("GDP Growth Trend Over the Years")
    ax.set_xlabel("Year")
    ax.set_ylabel("GDP Growth (%)")
    ax.tick_params(axis="x", labelrotation=45)
    ax.grid(True)  # explicit on, rather than toggling whatever the style set
    _save_and_close(fig, "gdp_growth_trend.png")

# Pairplot for key economic indicators (selecting first 6 numeric columns for readability)
num_cols = df.select_dtypes(include=[np.number]).columns[:6]
pair_grid = sns.pairplot(df[num_cols])
# pairplot builds its own figure; save and close that figure explicitly.
_save_and_close(pair_grid.figure, "pairplot.png")

# Check for outliers using boxplots (same columns as the pairplot)
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df[num_cols], ax=ax)
ax.tick_params(axis="x", labelrotation=45)
ax.set_title("Boxplot for Outlier Detection")
_save_and_close(fig, "gdp_growth_boxplot.png")

print("✅ EDA completed. Results saved in the 'results/' folder.")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Year                                44 non-null     int64  
 1   GDP Growth (%)                      44 non-null     float64
 2   Inflation Rate (%)                  44 non-null     float64
 3   Interest Rate (%)                   44 non-null     float64
 4   Exchange Rate (USD/INR)             44 non-null     float64
 5   Fiscal Deficit (% of GDP)           44 non-null     float64
 6   Exports (Billion USD)               44 non-null     float64
 7   Imports (Billion USD)               44 non-null     float64
 8   FDI (Billion USD)                   44 non-null     float64
 9   Money Supply (M3) Growth (%)        44 non-null     float64
 10  Bank Credit Growth (%)              44 non-null     float64
 11  Unemployment Rate (%)               44 non-null