# Population and GDP Analysis (2001-2021)

## Task A: Correlation Analysis
## Task B: Linear Regression

### Import required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

### Load the datasets

In [None]:
# Read the two source tables (GDP and population) from the local data/ folder.
gdp_df = pd.read_csv('data/Global_GDP.csv')
pop_df = pd.read_csv('data/Global_Population.csv')

# Quick sanity check: report (rows, columns) for each table.
print(f"GDP Dataset Shape: {gdp_df.shape}")
print(f"Population Dataset Shape: {pop_df.shape}")

### Explore the data structure

In [None]:
# Preview the GDP table: its first ten column labels, then its first three rows.
gdp_column_preview = gdp_df.columns[:10].tolist()
print("GDP Dataset Columns:")
print(gdp_column_preview, "... and year columns")
print("\nFirst 3 rows of GDP data:")
# Last expression of the cell, so the notebook renders it as a table.
gdp_df.head(3)

In [None]:
# Preview the population table: its first ten column labels, then its first three rows.
pop_column_preview = pop_df.columns[:10].tolist()
print("Population Dataset Columns:")
print(pop_column_preview, "... and year columns")
print("\nFirst 3 rows of Population data:")
# Last expression of the cell, so the notebook renders it as a table.
pop_df.head(3)

### Data Preprocessing - Filter years 2001-2021

In [None]:
# Identify year columns (2001-2021).
# The labels are built as strings because they are compared against the
# DataFrame column labels below.
# Note: GDP data might only go up to 2020
years_range = [str(year) for year in range(2001, 2022)]

# Year columns actually present in each dataset, in column order.
gdp_year_cols = [col for col in gdp_df.columns if col in years_range]
pop_year_cols = [col for col in pop_df.columns if col in years_range]

# Fail early with an explicit message instead of an IndexError further down
# when a dataset exposes no matching year column (e.g. non-string labels).
if not gdp_year_cols:
    raise ValueError("GDP dataset has no year columns in the 2001-2021 range.")
if not pop_year_cols:
    raise ValueError("Population dataset has no year columns in the 2001-2021 range.")

# Use only years that exist in both datasets, in chronological order
# (four-digit year strings sort lexicographically in the same order).
available_years = sorted(set(gdp_year_cols) & set(pop_year_cols))
if not available_years:
    raise ValueError("GDP and Population datasets share no year columns in the 2001-2021 range.")

# min/max rather than first/last so the printed range is right even when the
# year columns are not stored in chronological order.
print(f"Years available in GDP data: {min(gdp_year_cols)} to {max(gdp_year_cols)}")
print(f"Years available in Population data: {min(pop_year_cols)} to {max(pop_year_cols)}")
print(f"\nCommon years for analysis ({len(available_years)} years): {available_years[0]} to {available_years[-1]}")

In [None]:
# Keep just the country label plus the years shared by both datasets.
gdp_cols = ['Country Name', *available_years]
pop_cols = ['Country Name', *available_years]

# Work on copies so later column assignments do not touch the raw tables.
gdp_filtered = gdp_df.loc[:, gdp_cols].copy()
pop_filtered = pop_df.loc[:, pop_cols].copy()

print(f"GDP filtered shape: {gdp_filtered.shape}")
print(f"Population filtered shape: {pop_filtered.shape}")

### Calculate mean values with missing value handling

In [None]:
# Coerce every year column to a numeric dtype; values that cannot be parsed
# become NaN (errors='coerce') and are skipped by the means below.
for frame in (gdp_filtered, pop_filtered):
    for year in available_years:
        frame[year] = pd.to_numeric(frame[year], errors='coerce')

# Per-country average over the available years, ignoring missing entries.
gdp_filtered['Mean_GDP'] = gdp_filtered.loc[:, available_years].mean(axis='columns', skipna=True)
pop_filtered['Mean_Population'] = pop_filtered.loc[:, available_years].mean(axis='columns', skipna=True)

print("Missing values handled. Mean values calculated.")

In [None]:
# Join the per-country means; an inner join keeps only countries that appear
# in both tables under the same 'Country Name'.
summary_df = gdp_filtered[['Country Name', 'Mean_GDP']].merge(
    pop_filtered[['Country Name', 'Mean_Population']],
    on='Country Name',
    how='inner',
)

# Per capita GDP as mean GDP divided by mean population.
# NOTE(review): this is the ratio of the period means, not the mean of the
# yearly per-capita values - confirm this is the intended definition.
summary_df['Per_Capita_GDP'] = summary_df['Mean_GDP'].div(summary_df['Mean_Population'])

# Turn +/-inf (division by zero) into NaN, then drop every incomplete row.
summary_df = summary_df.replace([np.inf, -np.inf], np.nan).dropna()

# Keep only rows whose mean population and mean GDP are strictly positive.
summary_df = summary_df.loc[
    summary_df['Mean_Population'].gt(0) & summary_df['Mean_GDP'].gt(0)
]

print(f"Final dataset shape: {summary_df.shape}")
print(f"Number of countries with complete data: {len(summary_df)}")
print(f"\nSample of processed data:")
summary_df.head()

In [None]:
# Descriptive statistics for the three analysis columns.
print("Summary statistics:")
summary_df.loc[:, ['Mean_Population', 'Mean_GDP', 'Per_Capita_GDP']].describe()

## Task A: Correlation Analysis

In [None]:
# Scatter plots of mean population against mean per capita GDP, on linear
# and on log-log axes.
# A 1x2 grid is used because there are exactly two panels; a 2x2 grid would
# leave the bottom half of the figure empty.
fig, (ax_linear, ax_log) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: linear axes.
ax_linear.scatter(summary_df['Mean_Population'], summary_df['Per_Capita_GDP'], alpha=0.6, s=50)
ax_linear.set_xlabel('Mean Population')
ax_linear.set_ylabel('Mean Per Capita GDP (USD)')
ax_linear.set_title('Population vs Per Capita GDP (Linear Scale)')
ax_linear.grid(True, alpha=0.3)

# Right panel: the same points on log-log axes.
ax_log.scatter(summary_df['Mean_Population'], summary_df['Per_Capita_GDP'], alpha=0.6, s=50)
ax_log.set_xlabel('Mean Population (log scale)')
ax_log.set_ylabel('Mean Per Capita GDP (log scale)')
ax_log.set_title('Population vs Per Capita GDP (Log Scale)')
ax_log.set_xscale('log')
ax_log.set_yscale('log')
ax_log.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Pearson correlation between mean population and mean per capita GDP,
# together with the p-value returned by pearsonr.
correlation, p_value = pearsonr(summary_df['Mean_Population'], summary_df['Per_Capita_GDP'])

print("="*50)
print("CORRELATION ANALYSIS RESULTS")
print("="*50)
print(f"Pearson Correlation Coefficient: {correlation:.4f}")
print(f"P-value: {p_value:.6f}")

# A NaN coefficient (e.g. when one input is constant) fails every "<"
# comparison below, so without this flag it would be reported as a
# "STRONG NEGATIVE" relationship.
correlation_defined = not np.isnan(correlation)

# Statistical significance at the 5% level.
if not correlation_defined:
    print("\nStatistical Significance: UNDEFINED (correlation is NaN)")
elif p_value < 0.05:
    print("\nStatistical Significance: YES (p < 0.05)")
else:
    print("\nStatistical Significance: NO (p >= 0.05)")

# Strength buckets on |r|: < 0.3 weak, < 0.7 moderate, otherwise strong.
if not correlation_defined:
    strength = "UNDEFINED"
elif abs(correlation) < 0.3:
    strength = "WEAK"
elif abs(correlation) < 0.7:
    strength = "MODERATE"
else:
    strength = "STRONG"

# Direction from the sign; exactly zero is neither positive nor negative.
if not correlation_defined:
    direction = "UNDEFINED"
elif correlation > 0:
    direction = "POSITIVE"
elif correlation < 0:
    direction = "NEGATIVE"
else:
    direction = "NONE"

print(f"\nCorrelation Strength: {strength}")
print(f"Correlation Direction: {direction}")
print("\nInterpretation:")
if not correlation_defined:
    print("The correlation is undefined, so no linear relationship can be reported between ")
elif correlation == 0:
    print("There is no linear relationship between ")
else:
    print(f"There is a {strength.lower()} {direction.lower()} linear relationship between ")
print(f"mean population and mean per capita GDP across countries.")

In [None]:
# Pairwise correlation of the two analysis columns, rendered as an annotated heatmap.
correlation_matrix = summary_df[['Mean_Population', 'Per_Capita_GDP']].corr()

plt.figure(figsize=(8, 6))
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,  # centre the diverging colormap on zero correlation
    fmt='.4f',
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix: Population vs Per Capita GDP', fontsize=14)
plt.tight_layout()
plt.show()

## Task B: Linear Regression Analysis

In [None]:
# Predictor as an (n, 1) column and response as a flat (n,) vector.
X = summary_df['Mean_Population'].to_numpy()[:, np.newaxis]  # Independent variable
y = summary_df['Per_Capita_GDP'].to_numpy()  # Dependent variable

# Ordinary least squares fit of per capita GDP on population.
model = LinearRegression()
model.fit(X, y)

# In-sample predictions, reused by the metrics and plots in later cells.
y_pred = model.predict(X)

# Single predictor, so the coefficient array holds exactly one slope.
slope, intercept = model.coef_[0], model.intercept_

print("="*50)
print("LINEAR REGRESSION RESULTS")
print("="*50)
print(f"\nRegression Equation:")
print(f"Per Capita GDP = {slope:.6e} × Population + {intercept:.2f}")
print(f"\nModel Parameters:")
print(f"  - Slope (β₁): {slope:.6e}")
print(f"  - Intercept (β₀): {intercept:.2f}")

In [None]:
# Sample size and predictor count; both are reused by the log-model cell.
n = y.shape[0]
k = 1  # number of predictors

# Goodness of fit on the data the model was trained on.
r2 = r2_score(y, y_pred)
unexplained = 1 - r2
# Adjusted R² penalises R² for the number of predictors.
adj_r2 = 1 - unexplained * (n - 1) / (n - k - 1)

# Error magnitude; MSE is in squared units of y, RMSE in units of y.
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print("\nModel Performance Metrics:")
print(f"  - R-squared (R²): {r2:.4f}")
print(f"  - Adjusted R²: {adj_r2:.4f}")
print(f"  - Mean Squared Error (MSE): {mse:,.2f}")
print(f"  - Root Mean Squared Error (RMSE): {rmse:,.2f}")
print(f"\nInterpretation:")
print(f"The model explains {r2*100:.1f}% of the variance in Per Capita GDP.")

In [None]:
# Observed points with the fitted regression line drawn on top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X, y, alpha=0.6, s=50, label='Actual data')
ax.plot(X, y_pred, color='red', linewidth=2, label='Regression line')
ax.set_xlabel('Mean Population', fontsize=12)
ax.set_ylabel('Mean Per Capita GDP (USD)', fontsize=12)
ax.set_title('Linear Regression: Population vs Per Capita GDP', fontsize=14)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

### Residual Analysis

In [None]:
from scipy import stats

# Residual = observed minus fitted per capita GDP.
residuals = y - y_pred

# 2x2 diagnostic grid, unpacked row by row.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
ax_fitted, ax_hist, ax_qq, ax_pop = axes.ravel()

# Panels 1 and 4: residuals against fitted values and against the predictor,
# each with a dashed zero line for reference.
scatter_panels = (
    (ax_fitted, y_pred, 'Fitted Values (Predicted Per Capita GDP)', 'Residuals vs Fitted Values'),
    (ax_pop, X, 'Mean Population', 'Residuals vs Population'),
)
for panel, x_values, x_label, panel_title in scatter_panels:
    panel.scatter(x_values, residuals, alpha=0.6)
    panel.axhline(y=0, color='red', linestyle='--')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Residuals')
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3)

# Panel 2: histogram of the residuals.
ax_hist.hist(residuals, bins=30, edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Residuals')
ax_hist.grid(True, alpha=0.3, axis='y')

# Panel 3: Q-Q plot of the residuals against a normal distribution.
stats.probplot(residuals, dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot')
ax_qq.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### Alternative: Log-transformed Regression

In [None]:
# Natural-log transform of both variables; summary_df was filtered earlier to
# strictly positive mean population and mean GDP.
X_log = np.log(summary_df['Mean_Population'].to_numpy())[:, np.newaxis]
y_log = np.log(summary_df['Per_Capita_GDP'].to_numpy())

# Fit the same linear model on the log-log scale.
model_log = LinearRegression()
model_log.fit(X_log, y_log)
y_log_pred = model_log.predict(X_log)

# Fit quality on the log scale; n and k come from the linear-model metrics cell.
r2_log = r2_score(y_log, y_log_pred)
adj_r2_log = 1 - (1 - r2_log) * (n - 1) / (n - k - 1)

log_slope = model_log.coef_[0]
log_intercept = model_log.intercept_

print("="*50)
print("LOG-TRANSFORMED REGRESSION RESULTS")
print("="*50)
print(f"\nLog-transformed equation:")
print(f"log(Per Capita GDP) = {log_slope:.4f} × log(Population) + {log_intercept:.4f}")
print(f"\nModel Performance:")
print(f"  - R-squared (R²): {r2_log:.4f}")
print(f"  - Adjusted R²: {adj_r2_log:.4f}")
# NOTE(review): r2 is computed against y and r2_log against log(y), so the
# "Improvement" below compares R² values on different response scales.
print(f"\nComparison with linear model:")
print(f"  - Linear R²: {r2:.4f}")
print(f"  - Log-transformed R²: {r2_log:.4f}")
print(f"  - Improvement: {(r2_log - r2)*100:.1f} percentage points")

In [None]:
# Side-by-side view of the raw-scale fit and the log-log fit.
fig, (ax_raw, ax_logged) = plt.subplots(1, 2, figsize=(14, 6))

# One row per panel: axes, x data, y data, fitted values, then the labels.
comparison_panels = (
    (ax_raw, X, y, y_pred,
     'Data', f'Linear fit (R²={r2:.3f})',
     'Mean Population', 'Mean Per Capita GDP', 'Linear Model'),
    (ax_logged, X_log, y_log, y_log_pred,
     'Log-transformed data', f'Linear fit (R²={r2_log:.3f})',
     'log(Mean Population)', 'log(Mean Per Capita GDP)', 'Log-transformed Model'),
)
for (panel, x_values, y_values, fitted,
     data_label, fit_label, x_label, y_label, panel_title) in comparison_panels:
    panel.scatter(x_values, y_values, alpha=0.6, s=50, label=data_label)
    panel.plot(x_values, fitted, color='red', linewidth=2, label=fit_label)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Summary and Conclusions

### Task A - Correlation Analysis Summary:
- Calculated the Pearson correlation coefficient between mean population and mean per capita GDP
- Visualized the relationship using scatter plots (both linear and log scales)
- Interpreted the strength and direction of the correlation

### Task B - Linear Regression Summary:
- Performed linear regression with population as independent variable and per capita GDP as dependent variable
- Evaluated model performance using R², adjusted R², MSE, and RMSE
- Conducted residual analysis to check model assumptions
- Compared linear and log-transformed models

### Key Findings:
1. The Pearson coefficient and its p-value describe the strength, direction, and statistical significance of the linear relationship between population size and economic prosperity (per capita GDP)
2. The regression model quantifies this relationship and allows for prediction
3. Residual analysis helps identify any violations of regression assumptions
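4. Log transformation may provide a better fit if the relationship is non-linear; note that the two R² values are measured on different response scales (raw vs. log per capita GDP), so they are only a rough guide when comparing the models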