####**GitHub–Colab Integration**
This section has a workflow for integrating Google Colab with the project's GitHub repository.

In [1]:
import os
from getpass import getpass

In [2]:
# GitHub config

GITHUB_USERNAME = "chiraagmishra"
REPO_NAME = "urban-technology-project"
GITHUB_EMAIL = "chiraag.cm@gmail.com"
GITHUB_NAME = "Chiraag Mishra"

In [3]:
repo_path = f"/content/{REPO_NAME}"

# Authenticate (token hidden)
token = getpass("Paste GitHub Personal Access Token: ")

# Clone repo with credentials
if not os.path.exists(repo_path):
    !git clone https://{GITHUB_USERNAME}:{token}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git
else:
    print("Repository already exists.")

# Navigate and configure
%cd {repo_path}

!git config --global user.email "{GITHUB_EMAIL}"
!git config --global user.name "{GITHUB_NAME}"
!git config --global --add safe.directory {repo_path}

print("GitHub set-up. Ready for commit & push from Colab.")

Paste GitHub Personal Access Token: ··········
Cloning into 'urban-technology-project'...
remote: Enumerating objects: 87, done.[K
remote: Counting objects: 100% (87/87), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 87 (delta 26), reused 53 (delta 9), pack-reused 0 (from 0)[K
Receiving objects: 100% (87/87), 4.92 MiB | 9.09 MiB/s, done.
Resolving deltas: 100% (26/26), done.
/content/urban-technology-project
GitHub set-up. Ready for commit & push from Colab.


#### **Setup and load results**

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from scipy import stats
from datetime import datetime

In [5]:
# Load processed data with features
df_features = pd.read_csv('data/processed/migration_labor_with_features.csv')
print(f"Features data: {df_features.shape}")

# Load model performance metrics
df_metrics = pd.read_csv('results/metrics/model_performance_by_state.csv')
print(f"Model metrics: {df_metrics.shape}")

# Load feature importance
feature_importance_files = [f for f in os.listdir('results/explainability')
                            if f.endswith('_feature_importance.csv')]

feature_importance_dict = {}
for file in feature_importance_files:
    model_name = file.replace('_feature_importance.csv', '')
    df_importance = pd.read_csv(f'results/explainability/{file}')
    feature_importance_dict[model_name] = df_importance
    print(f"Feature importance: {model_name}")

# Load state info
with open('results/predictions/state_info.pkl', 'rb') as f:
    state_info = pickle.load(f)

state_names = state_info['state_names']
test_years = state_info['test_years']

print(f"\nAll results loaded successfully")
print(f"  States: {len(state_names)}")
print(f"  Test period: {test_years[0]}-{test_years[-1]}")

Features data: (400, 13)
Model metrics: (96, 12)
Feature importance: LinearReg

All results loaded successfully
  States: 16
  Test period: 2020-2024


#### **H1: Job Vacancies Predict Migration**
States with higher job vacancies attract more foreign migrants (Positive correlation bw vacancies_sc and migration_foreign)

In [6]:
# Correlation for each state
h1_correlations = []

for state in df_features['state'].unique():
    state_data = df_features[df_features['state'] == state].copy()

    if len(state_data) >= 5:
        # Pearson correlation
        corr, p_value = stats.pearsonr(
            state_data['vacancies_sc'],
            state_data['migration_foreign']
        )

        h1_correlations.append({
            'state': state,
            'correlation': corr,
            'p_value': p_value,
            'significant': p_value < 0.05
        })

df_h1 = pd.DataFrame(h1_correlations)

In [7]:
print("\nCorrelation Analysis (vacancies_sc vs migration_foreign):")
print(f"   Mean correlation:   {df_h1['correlation'].mean():.3f}")
print(f"   Median correlation: {df_h1['correlation'].median():.3f}")
print(f"   Std deviation:      {df_h1['correlation'].std():.3f}")
print(f"   Min correlation:    {df_h1['correlation'].min():.3f}")
print(f"   Max correlation:    {df_h1['correlation'].max():.3f}")


Correlation Analysis (vacancies_sc vs migration_foreign):
   Mean correlation:   0.640
   Median correlation: 0.681
   Std deviation:      0.139
   Min correlation:    0.150
   Max correlation:    0.755


In [8]:
positive_count = (df_h1['correlation'] > 0).sum()
significant_positive = ((df_h1['correlation'] > 0) & (df_h1['significant'])).sum()

print(f"\nDirectional Analysis:")
print(f"   Positive correlations: {positive_count}/{len(df_h1)} states ({positive_count/len(df_h1)*100:.1f}%)")
print(f"   Significant positive:  {significant_positive}/{len(df_h1)} states ({significant_positive/len(df_h1)*100:.1f}%)")


Directional Analysis:
   Positive correlations: 16/16 states (100.0%)
   Significant positive:  15/16 states (93.8%)


In [10]:
# One-sample t-test: Is mean correlation significantly different from 0?
t_stat_h1, p_value_h1 = stats.ttest_1samp(df_h1['correlation'], 0)

print(f"\nStatistical Test (One-sample t-test):")
print(f"   H0: Mean correlation = 0 (no relationship)")
print(f"   Ha: Mean correlation ≠ 0 (relationship exists)")
print(f"   t-statistic: {t_stat_h1:.3f}")
print(f"   p-value:     {p_value_h1:}")

if p_value_h1 < 0.001:
    print(f"   Result: STRONGLY SIGNIFICANT (p < 0.001)")
elif p_value_h1 < 0.01:
    print(f"   Result: VERY SIGNIFICANT (p < 0.01)")
elif p_value_h1 < 0.05:
    print(f"   Result: SIGNIFICANT (p < 0.05)")
else:
    print(f"   Result: NOT SIGNIFICANT (p >= 0.05)")


Statistical Test (One-sample t-test):
   H0: Mean correlation = 0 (no relationship)
   Ha: Mean correlation ≠ 0 (relationship exists)
   t-statistic: 18.422
   p-value:     1.0344758080389701e-11
   Result: STRONGLY SIGNIFICANT (p < 0.001)


In [11]:
# Effect size (Cohen's d)
cohen_d_h1 = df_h1['correlation'].mean() / df_h1['correlation'].std()
print(f"\nEffect Size (Cohen's d): {cohen_d_h1:.3f}")
if abs(cohen_d_h1) >= 0.8:
    print(f"   Interpretation: LARGE effect")
elif abs(cohen_d_h1) >= 0.5:
    print(f"   Interpretation: MEDIUM effect")
else:
    print(f"   Interpretation: SMALL effect")


Effect Size (Cohen's d): 4.605
   Interpretation: LARGE effect


In [12]:
if p_value_h1 < 0.05 and df_h1['correlation'].mean() > 0:
    print("HYPOTHESIS 1 SUPPORTED")
    print("   Job vacancies positively correlate with foreign migration")
    print("   across all German states with statistical significance.")
else:
    print("HYPOTHESIS 1 NOT SUPPORTED")

HYPOTHESIS 1 SUPPORTED
   Job vacancies positively correlate with foreign migration
   across all German states with statistical significance.


In [14]:
os.mkdir('results/hypothesis_testing')
df_h1.to_csv('results/hypothesis_testing/h1_correlations.csv', index=False)
print(f"\nSaved: results/hypothesis_testing/h1_correlations.csv")


Saved: results/hypothesis_testing/h1_correlations.csv


#### **H2: Labor Market Tightness**
States with tighter labor markets attract more foreign migrants (positive correlation between labor_market_tighness and migration_foreign)

In [15]:
h2_correlations = []

for state in df_features['state'].unique():
    state_data = df_features[df_features['state'] == state].copy()

    if len(state_data) >= 5:
        # Pearson correlation
        corr, p_value = stats.pearsonr(
            state_data['labor_market_tightness'],
            state_data['migration_foreign']
        )

        h2_correlations.append({
            'state': state,
            'correlation': corr,
            'p_value': p_value,
            'significant': p_value < 0.05
        })

df_h2 = pd.DataFrame(h2_correlations)

In [16]:
print("\nCorrelation Analysis (labor_market_tightness vs migration_foreign):")
print(f"   Mean correlation:   {df_h2['correlation'].mean():.3f}")
print(f"   Median correlation: {df_h2['correlation'].median():.3f}")
print(f"   Std deviation:      {df_h2['correlation'].std():.3f}")
print(f"   Min correlation:    {df_h2['correlation'].min():.3f}")
print(f"   Max correlation:    {df_h2['correlation'].max():.3f}")



Correlation Analysis (labor_market_tightness vs migration_foreign):
   Mean correlation:   0.639
   Median correlation: 0.675
   Std deviation:      0.132
   Min correlation:    0.176
   Max correlation:    0.745


In [17]:
# Count positive correlations
positive_count = (df_h2['correlation'] > 0).sum()
significant_positive = ((df_h2['correlation'] > 0) & (df_h2['significant'])).sum()

print(f"\nDirectional Analysis:")
print(f"   Positive correlations: {positive_count}/{len(df_h2)} states ({positive_count/len(df_h2)*100:.1f}%)")
print(f"   Significant positive:  {significant_positive}/{len(df_h2)} states ({significant_positive/len(df_h2)*100:.1f}%)")


Directional Analysis:
   Positive correlations: 16/16 states (100.0%)
   Significant positive:  15/16 states (93.8%)


In [20]:
# One-sample t-test
t_stat_h2, p_value_h2 = stats.ttest_1samp(df_h2['correlation'], 0)

print(f"\nStatistical Test (One-sample t-test):")
print(f"   H0: Mean correlation = 0 (no relationship)")
print(f"   Ha: Mean correlation ≠ 0 (relationship exists)")
print(f"   t-statistic: {t_stat_h2:.3f}")
print(f"   p-value:     {p_value_h2:}")

if p_value_h2 < 0.001:
    print(f"   Result: STRONGLY SIGNIFICANT (p < 0.001)")
elif p_value_h2 < 0.01:
    print(f"   Result: VERY SIGNIFICANT (p < 0.01)")
elif p_value_h2 < 0.05:
    print(f"   Result: SIGNIFICANT (p < 0.05)")
else:
    print(f"   Result: NOT SIGNIFICANT (p >= 0.05)")


Statistical Test (One-sample t-test):
   H0: Mean correlation = 0 (no relationship)
   Ha: Mean correlation ≠ 0 (relationship exists)
   t-statistic: 19.384
   p-value:     4.96364592987353e-12
   Result: STRONGLY SIGNIFICANT (p < 0.001)


In [21]:
# Effect size
cohen_d_h2 = df_h2['correlation'].mean() / df_h2['correlation'].std()
print(f"\nEffect Size (Cohen's d): {cohen_d_h2:.3f}")
if abs(cohen_d_h2) >= 0.8:
    print(f"   Interpretation: LARGE effect")
elif abs(cohen_d_h2) >= 0.5:
    print(f"   Interpretation: MEDIUM effect")
else:
    print(f"   Interpretation: SMALL effect")


Effect Size (Cohen's d): 4.846
   Interpretation: LARGE effect


In [22]:
if p_value_h2 < 0.05 and df_h2['correlation'].mean() > 0:
    print("HYPOTHESIS 2 SUPPORTED")
    print("   Labor market tightness positively correlates with foreign migration")
    print("   across all German states with statistical significance.")
else:
    print("HYPOTHESIS 2 NOT SUPPORTED")

HYPOTHESIS 2 SUPPORTED
   Labor market tightness positively correlates with foreign migration
   across all German states with statistical significance.

 Saved: results/hypothesis_testing/h2_correlations.csv


In [None]:
df_h2.to_csv('results/hypothesis_testing/h2_correlations.csv', index=False)
print(f"\n Saved: results/hypothesis_testing/h2_correlations.csv")

#### **Model Comparison**
H3: Test to see if adding labor market variables improved forecasting accuracy

In [23]:
# Baseline models (no labor market covariates)
baseline_models = ['Naive', 'AutoARIMA']

# Global models (with labor market covariates)
global_models = ['LinearReg', 'RandomForest', 'XGBoost', 'LightGBM']

state_comparison = []

for state in state_names:
    state_metrics = df_metrics[df_metrics['state'] == state]

    # Best baseline
    baseline_data = state_metrics[state_metrics['model'].isin(baseline_models)]
    if len(baseline_data) > 0:
        best_baseline_rmse = baseline_data['RMSE'].min()
        best_baseline_model = baseline_data.loc[baseline_data['RMSE'].idxmin(), 'model']
    else:
        continue

    # Best global model
    global_data = state_metrics[state_metrics['model'].isin(global_models)]
    if len(global_data) > 0:
        best_global_rmse = global_data['RMSE'].min()
        best_global_model = global_data.loc[global_data['RMSE'].idxmin(), 'model']
    else:
        continue

    # Calculate improvement
    improvement_pct = ((best_baseline_rmse - best_global_rmse) / best_baseline_rmse) * 100

    state_comparison.append({
        'state': state,
        'baseline_rmse': best_baseline_rmse,
        'baseline_model': best_baseline_model,
        'global_rmse': best_global_rmse,
        'global_model': best_global_model,
        'improvement_pct': improvement_pct,
        'rmse_reduction': best_baseline_rmse - best_global_rmse
    })

df_comparison = pd.DataFrame(state_comparison)

In [24]:
print("\nPerformance Comparison (Best Baseline vs Best Global per State):")
print(f"   Mean Baseline RMSE:  {df_comparison['baseline_rmse'].mean():.2f}")
print(f"   Mean Global RMSE:    {df_comparison['global_rmse'].mean():.2f}")
print(f"   Mean Improvement:    {df_comparison['improvement_pct'].mean():.1f}%")
print(f"   Median Improvement:  {df_comparison['improvement_pct'].median():.1f}%")

# States with improvement
improved_count = (df_comparison['improvement_pct'] > 0).sum()
print(f"\nImprovement Distribution:")
print(f"   States improved:     {improved_count}/{len(df_comparison)} ({improved_count/len(df_comparison)*100:.1f}%)")
print(f"   States worse:        {len(df_comparison) - improved_count}/{len(df_comparison)}")
print(f"   Best improvement:    {df_comparison['improvement_pct'].max():.1f}% ({df_comparison.loc[df_comparison['improvement_pct'].idxmax(), 'state']})")
print(f"   Worst case:          {df_comparison['improvement_pct'].min():.1f}% ({df_comparison.loc[df_comparison['improvement_pct'].idxmin(), 'state']})")


Performance Comparison (Best Baseline vs Best Global per State):
   Mean Baseline RMSE:  34464.39
   Mean Global RMSE:    27804.04
   Mean Improvement:    17.4%
   Median Improvement:  18.4%

Improvement Distribution:
   States improved:     16/16 (100.0%)
   States worse:        0/16
   Best improvement:    31.2% (Hessen)
   Worst case:          4.5% (Thüringen)


In [25]:
print("\n Statistical Test (Paired t-test):")
print("   Comparing: Best baseline RMSE vs Best global RMSE for each state")
print("   H0: Mean(baseline_RMSE) = Mean(global_RMSE)  [No improvement]")
print("   Ha: Mean(baseline_RMSE) > Mean(global_RMSE)  [Global models better]")

# Paired t-test (one-tailed)
t_stat_model, p_value_model_two_tailed = stats.ttest_rel(
    df_comparison['baseline_rmse'],
    df_comparison['global_rmse']
)

# Converting to one-tailed (expect global to be better)
p_value_model = p_value_model_two_tailed / 2 if t_stat_model > 0 else 1 - (p_value_model_two_tailed / 2)


 Statistical Test (Paired t-test):
   Comparing: Best baseline RMSE vs Best global RMSE for each state
   H0: Mean(baseline_RMSE) = Mean(global_RMSE)  [No improvement]
   Ha: Mean(baseline_RMSE) > Mean(global_RMSE)  [Global models better]


In [26]:
print(f"   t-statistic: {t_stat_model:.3f}")
print(f"   p-value (one-tailed): {p_value_model:.6f}")

if p_value_model < 0.001:
    print(f"   Result: STRONGLY SIGNIFICANT (p < 0.001)")
elif p_value_model < 0.01:
    print(f"   Result: VERY SIGNIFICANT (p < 0.01)")
elif p_value_model < 0.05:
    print(f"   Result: SIGNIFICANT (p < 0.05)")
else:
    print(f"   Result: NOT SIGNIFICANT (p >= 0.05)")

   t-statistic: 3.888
   p-value (one-tailed): 0.000728
   Result: STRONGLY SIGNIFICANT (p < 0.001)


In [27]:
# Effect size (Cohen's d for paired samples)
differences = df_comparison['baseline_rmse'] - df_comparison['global_rmse']
cohen_d_model = differences.mean() / differences.std()

print(f"\n Effect Size (Cohen's d): {cohen_d_model:.3f}")
if abs(cohen_d_model) >= 0.8:
    print(f"   Interpretation: LARGE effect")
elif abs(cohen_d_model) >= 0.5:
    print(f"   Interpretation: MEDIUM effect")
elif abs(cohen_d_model) >= 0.2:
    print(f"   Interpretation: SMALL effect")
else:
    print(f"   Interpretation: NEGLIGIBLE effect")


 Effect Size (Cohen's d): 0.972
   Interpretation: LARGE effect


In [28]:
# 95% Confidence Interval for mean improvement
ci_lower, ci_upper = stats.t.interval(
    0.95,
    len(df_comparison)-1,
    loc=df_comparison['improvement_pct'].mean(),
    scale=stats.sem(df_comparison['improvement_pct'])
)

print(f"\n 95% Confidence Interval for Mean Improvement:")
print(f"   [{ci_lower:.1f}%, {ci_upper:.1f}%]")


 95% Confidence Interval for Mean Improvement:
   [13.7%, 21.2%]


In [29]:
# BREAKDOWN BY MODEL TYPE

# Most used baseline model
baseline_model_counts = df_comparison['baseline_model'].value_counts()
print(f"\n   Best Baseline Model Distribution:")
for model, count in baseline_model_counts.items():
    print(f"   • {model}: {count}/{len(df_comparison)} states ({count/len(df_comparison)*100:.1f}%)")

# Most used global model
global_model_counts = df_comparison['global_model'].value_counts()
print(f"\n   Best Global Model Distribution:")
for model, count in global_model_counts.items():
    avg_improvement = df_comparison[df_comparison['global_model'] == model]['improvement_pct'].mean()
    print(f"   • {model}: {count}/{len(df_comparison)} states ({count/len(df_comparison)*100:.1f}%), avg improvement: {avg_improvement:.1f}%")


   Best Baseline Model Distribution:
   • AutoARIMA: 15/16 states (93.8%)
   • Naive: 1/16 states (6.2%)

   Best Global Model Distribution:
   • LinearReg: 9/16 states (56.2%), avg improvement: 17.6%
   • RandomForest: 4/16 states (25.0%), avg improvement: 12.5%
   • LightGBM: 2/16 states (12.5%), avg improvement: 26.8%
   • XGBoost: 1/16 states (6.2%), avg improvement: 17.2%


In [30]:
if p_value_model < 0.05 and df_comparison['improvement_pct'].mean() > 0:
    print(" HYPOTHESIS 3 STRONGLY SUPPORTED")
    print(f"   Global models with labor market variables significantly outperform")
    print(f"   baseline models (p = {p_value_model:.6f})")
    print(f"   Average improvement: {df_comparison['improvement_pct'].mean():.1f}%")
    print(f"   This proves labor market variables have PREDICTIVE POWER")
else:
    print(" HYPOTHESIS 3 NOT SUPPORTED")
    print("   Labor market variables do not significantly improve prediction accuracy")

 HYPOTHESIS 3 STRONGLY SUPPORTED
   Global models with labor market variables significantly outperform
   baseline models (p = 0.000728)
   Average improvement: 17.4%
   This proves labor market variables have PREDICTIVE POWER


In [31]:
os.makedirs('results/hypothesis_testing', exist_ok=True)
df_comparison.to_csv('results/hypothesis_testing/model_comparison.csv', index=False)
print(f"\nSaved: results/hypothesis_testing/model_comparison.csv")


Saved: results/hypothesis_testing/model_comparison.csv


#### **Evidence from feature importance (SHAP)**

In [34]:
if len(feature_importance_dict) > 0:
    all_importance = []

    for model_name, df_importance in feature_importance_dict.items():
        all_importance.append(df_importance)

    df_all_importance = pd.concat(all_importance, ignore_index=True)

    # Calculate average importance per feature across models
    avg_importance = df_all_importance.groupby('Feature')['Importance_Pct'].mean().sort_values(ascending=False)

    print("\nAverage Feature Importance Across Models:")

    # Separate labor market vs past migration
    labor_market_features = []
    past_migration_features = []

    for feature, importance in avg_importance.items():
        if any(kw in feature.lower() for kw in ['unemployment', 'vacanc', 'tightness', 'unemployed']):
            labor_market_features.append((feature, importance))
        elif 'migration_foreign_lag' in feature.lower():
            past_migration_features.append((feature, importance))

    print(f"\nLabor Market Variables:")
    total_labor_market = 0
    for feature, importance in labor_market_features:
        print(f"      • {feature:<40} {importance:>6.1f}%")
        total_labor_market += importance

    print(f"\nPast Migration Patterns:")
    total_past_migration = 0
    for feature, importance in past_migration_features:
        print(f"      • {feature:<40} {importance:>6.1f}%")
        total_past_migration += importance

    print(f"\nSummary:")
    print(f"   • Total Labor Market Importance:  {total_labor_market:.1f}%")
    print(f"   • Total Past Migration Importance: {total_past_migration:.1f}%")

    # Top features
    print(f"\nTop 5 Most Important Features:")
    for i, (feature, importance) in enumerate(avg_importance.head(5).items(), 1):
        print(f"    {i}. {feature:<40} {importance:>6.1f}%")

    # Key finding
    labor_market_rank = []
    for i, feature in enumerate(avg_importance.index, 1):
        if any(kw in feature.lower() for kw in ['vacanc', 'tightness', 'unemployment', 'unemployed']):
            labor_market_rank.append(i)

    if labor_market_rank:
        highest_labor_rank = min(labor_market_rank)
        print(f"\nKey Finding:")
        print(f"      Labor market variables appear in top {highest_labor_rank} features")
        print(f"      This confirms they are important predictors alongside past migration")

else:
    print("No feature importance data available")
    print("(Re-run notebook 4 for this analysis)")


Average Feature Importance Across Models:

Labor Market Variables:
      • vacancies_sc_lag0                          23.9%
      • vacancy_rate_lag0                           6.0%
      • unemployed_count_lag0                       3.0%
      • unemployment_rate_lag0                      2.0%
      • labor_market_tightness_lag0                 0.0%

Past Migration Patterns:
      • migration_foreign_lag3                     57.0%
      • migration_foreign_lag2                      6.6%
      • migration_foreign_lag1                      1.7%

Summary:
   • Total Labor Market Importance:  34.9%
   • Total Past Migration Importance: 65.3%

Top 5 Most Important Features:
    1. migration_foreign_lag3                     57.0%
    2. vacancies_sc_lag0                          23.9%
    3. migration_foreign_lag2                      6.6%
    4. vacancy_rate_lag0                           6.0%
    5. unemployed_count_lag0                       3.0%

Key Finding:
      Labor market variable

#### **Report**

In [35]:
# All results
results_summary = {
    'analysis_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'dataset_period': f"{df_features['year'].min()}-{df_features['year'].max()}",
    'n_states': len(state_names),
    'n_observations': len(df_features),
    'test_period': f"{test_years[0]}-{test_years[-1]}",

    # H1 Results
    'h1_mean_correlation': df_h1['correlation'].mean(),
    'h1_median_correlation': df_h1['correlation'].median(),
    'h1_positive_states': (df_h1['correlation'] > 0).sum(),
    'h1_significant_positive': ((df_h1['correlation'] > 0) & (df_h1['significant'])).sum(),
    'h1_p_value': p_value_h1,
    'h1_t_statistic': t_stat_h1,
    'h1_cohen_d': cohen_d_h1,
    'h1_supported': bool(p_value_h1 < 0.05 and df_h1['correlation'].mean() > 0),

    # H2 Results
    'h2_mean_correlation': df_h2['correlation'].mean(),
    'h2_median_correlation': df_h2['correlation'].median(),
    'h2_positive_states': (df_h2['correlation'] > 0).sum(),
    'h2_significant_positive': ((df_h2['correlation'] > 0) & (df_h2['significant'])).sum(),
    'h2_p_value': p_value_h2,
    'h2_t_statistic': t_stat_h2,
    'h2_cohen_d': cohen_d_h2,
    'h2_supported': bool(p_value_h2 < 0.05 and df_h2['correlation'].mean() > 0),

    # Model Comparison Results
    'baseline_mean_rmse': df_comparison['baseline_rmse'].mean(),
    'global_mean_rmse': df_comparison['global_rmse'].mean(),
    'mean_improvement_pct': df_comparison['improvement_pct'].mean(),
    'median_improvement_pct': df_comparison['improvement_pct'].median(),
    'states_improved': (df_comparison['improvement_pct'] > 0).sum(),
    'model_comparison_p_value': p_value_model,
    'model_comparison_t_stat': t_stat_model,
    'model_comparison_cohen_d': cohen_d_model,
    'h3_supported': bool(p_value_model < 0.05 and df_comparison['improvement_pct'].mean() > 0),
}

pd.DataFrame([results_summary]).to_csv('results/hypothesis_testing/summary_statistics.csv', index=False)
print("Saved: results/hypothesis_testing/summary_statistics.csv")

Saved: results/hypothesis_testing/summary_statistics.csv


In [39]:
# Text report
report_lines = []

report_lines.append("="*80)
report_lines.append("FINAL RESEARCH REPORT")
report_lines.append("Migration and Labor Market Dynamics in Germany (2000-2024)")
report_lines.append("="*80)

report_lines.append(f"\nAnalysis Date: {results_summary['analysis_date']}")
report_lines.append(f"Dataset: {results_summary['n_states']} German states, {results_summary['dataset_period']}")
report_lines.append(f"Total Observations: {results_summary['n_observations']}")
report_lines.append(f"Test Period: {results_summary['test_period']}")

# Research Questions

report_lines.append("\n" + "="*80)
report_lines.append("RESEARCH QUESTIONS")
report_lines.append("="*80)

report_lines.append("\n1. Do job vacancies predict foreign migration patterns?")
report_lines.append("2. Does labor market tightness predict foreign migration patterns?")
report_lines.append("3. Can labor market indicators improve migration forecasts?")

# Methodology

report_lines.append("\n" + "="*80)
report_lines.append("METHODOLOGY")
report_lines.append("="*80)

report_lines.append("\nData Analysis Approach:")
report_lines.append("   • Correlation analysis (Pearson) for H1 and H2")
report_lines.append("   • Global forecasting models trained on 16 states simultaneously")
report_lines.append("   • Baseline models: Naive, AutoARIMA (no covariates)")
report_lines.append("   • Global models: LinearReg, RandomForest, XGBoost, LightGBM (with covariates)")
report_lines.append("   • SHAP explainability for feature importance")
report_lines.append("   • Statistical significance testing (α = 0.05)")

report_lines.append("\nKey Variables:")
report_lines.append("   Target: migration_foreign (foreign migration balance)")
report_lines.append("   Covariates:")
report_lines.append("     • unemployment_rate")
report_lines.append("     • vacancies_sc (job vacancies subject to social contributions)")
report_lines.append("     • labor_market_tightness (vacancies/unemployed)")
report_lines.append("     • unemployed_count")
report_lines.append("     • vacancy_rate")

# Key findings

report_lines.append("\n" + "="*80)
report_lines.append("KEY FINDINGS")
report_lines.append("="*80)

# H1
report_lines.append("\n" + "-"*80)
report_lines.append("HYPOTHESIS 1: Job Vacancies → Foreign Migration")
report_lines.append("-"*80)

if results_summary['h1_supported']:
    report_lines.append("STRONGLY SUPPORTED")
else:
    report_lines.append("NOT SUPPORTED")

report_lines.append(f"\nEvidence:")
report_lines.append(f"   • Mean correlation: r = {results_summary['h1_mean_correlation']:.3f}")
report_lines.append(f"   • Positive correlation in {results_summary['h1_positive_states']}/{results_summary['n_states']} states ({results_summary['h1_positive_states']/results_summary['n_states']*100:.1f}%)")
report_lines.append(f"   • Significant positive in {results_summary['h1_significant_positive']}/{results_summary['n_states']} states")
report_lines.append(f"   • Statistical test: t = {results_summary['h1_t_statistic']:.3f}, p = {results_summary['h1_p_value']:}")
effect_size_h1 = "LARGE" if abs(results_summary['h1_cohen_d']) >= 0.8 else \
                 "MEDIUM" if abs(results_summary['h1_cohen_d']) >= 0.5 else "SMALL"
report_lines.append(f"   • Effect size: Cohen's d = {results_summary['h1_cohen_d']:.3f} ({effect_size_h1})")

# H2
report_lines.append("\n" + "-"*80)
report_lines.append("HYPOTHESIS 2: Labor Market Tightness → Foreign Migration")
report_lines.append("-"*80)

if results_summary['h2_supported']:
    report_lines.append("STRONGLY SUPPORTED")
else:
    report_lines.append("NOT SUPPORTED")

report_lines.append(f"\nEvidence:")
report_lines.append(f"   • Mean correlation: r = {results_summary['h2_mean_correlation']:.3f}")
report_lines.append(f"   • Positive correlation in {results_summary['h2_positive_states']}/{results_summary['n_states']} states ({results_summary['h2_positive_states']/results_summary['n_states']*100:.1f}%)")
report_lines.append(f"   • Significant positive in {results_summary['h2_significant_positive']}/{results_summary['n_states']} states")
report_lines.append(f"   • Statistical test: t = {results_summary['h2_t_statistic']:.3f}, p = {results_summary['h2_p_value']:}")
effect_size_h2 = "LARGE" if abs(results_summary['h2_cohen_d']) >= 0.8 else \
                 "MEDIUM" if abs(results_summary['h2_cohen_d']) >= 0.5 else "SMALL"
report_lines.append(f"   • Effect size: Cohen's d = {results_summary['h2_cohen_d']:.3f} ({effect_size_h2})")

# H3 - Model Comparison
report_lines.append("\n" + "-"*80)
report_lines.append("HYPOTHESIS 3: Labor Market Variables Improve Prediction Accuracy")
report_lines.append("-"*80)

if results_summary['h3_supported']:
    report_lines.append("STRONGLY SUPPORTED")
else:
    report_lines.append("NOT SUPPORTED")

report_lines.append(f"\nModel Performance:")
report_lines.append(f"   • Baseline (no covariates):    RMSE = {results_summary['baseline_mean_rmse']:.2f}")
report_lines.append(f"   • Global models (with covariates): RMSE = {results_summary['global_mean_rmse']:.2f}")
report_lines.append(f"   • Mean improvement: {results_summary['mean_improvement_pct']:.1f}%")
report_lines.append(f"   • States improved: {results_summary['states_improved']}/{results_summary['n_states']} ({results_summary['states_improved']/results_summary['n_states']*100:.1f}%)")
report_lines.append(f"   • Statistical test: t = {results_summary['model_comparison_t_stat']:.3f}, p = {results_summary['model_comparison_p_value']:}")
effect_size_h3 = "LARGE" if abs(results_summary['model_comparison_cohen_d']) >= 0.8 else \
                 "MEDIUM" if abs(results_summary['model_comparison_cohen_d']) >= 0.5 else "SMALL"
report_lines.append(f"   • Effect size: Cohen's d = {results_summary['model_comparison_cohen_d']:.3f} ({effect_size_h3})")

# Feature Importance
if len(feature_importance_dict) > 0:
    report_lines.append("\n" + "-"*80)
    report_lines.append("SUPPORTING EVIDENCE: Feature Importance (SHAP Analysis)")
    report_lines.append("-"*80)

    report_lines.append(f"\nKey Features for Best Model:")
    for model_name, df_importance in feature_importance_dict.items():
        report_lines.append(f"\n   Model: {model_name}")
        report_lines.append(f"   Top 5 Features:")
        for idx, row in df_importance.head(5).iterrows():
            report_lines.append(f"      {row['Rank']}. {row['Feature']:<40} {row['Importance_Pct']:>5.1f}%")

        # Calculate category totals
        labor_market_pct = df_importance[
            df_importance['Feature'].str.contains('unemployment|vacanc|tightness|unemployed', case=False)
        ]['Importance_Pct'].sum()

        past_migration_pct = df_importance[
            df_importance['Feature'].str.contains('migration_foreign_lag', case=False)
        ]['Importance_Pct'].sum()

        report_lines.append(f"\n   Category Breakdown:")
        report_lines.append(f"      Labor Market Variables:  {labor_market_pct:.1f}%")
        report_lines.append(f"      Past Migration Patterns: {past_migration_pct:.1f}%")

# Overall Conclusion

report_lines.append("\n" + "="*80)
report_lines.append("OVERALL CONCLUSIONS")
report_lines.append("="*80)

# Count number of supported hypotheses
supported_count = sum([
    results_summary['h1_supported'],
    results_summary['h2_supported'],
    results_summary['h3_supported']
])

report_lines.append(f"\nSummary: {supported_count}/3 hypotheses strongly supported")

if supported_count == 3:
    report_lines.append("\nRESEARCH OBJECTIVES ACHIEVED")
    report_lines.append("\nThis study provides strong, triangulated evidence that labor market")
    report_lines.append("indicators significantly predict and improve forecasts of foreign migration")
    report_lines.append("patterns across all 16 German states.")

    report_lines.append("\nThree Lines of Evidence:")
    report_lines.append(f"   1. Correlation Analysis: Both job vacancies (r={results_summary['h1_mean_correlation']:.3f}) and")
    report_lines.append(f"      labor market tightness (r={results_summary['h2_mean_correlation']:.3f}) strongly correlate")
    report_lines.append(f"      with migration (both p<0.001)")

    report_lines.append(f"\n   2. Predictive Power: Models using labor market variables are")
    report_lines.append(f"      {results_summary['mean_improvement_pct']:.1f}% more accurate than baseline models")
    report_lines.append(f"      (p={results_summary['model_comparison_p_value']:.6f})")

    if len(feature_importance_dict) > 0:
        report_lines.append(f"\n   3. Feature Importance: SHAP analysis confirms labor market")
        report_lines.append(f"      variables are among the top predictive features")

elif supported_count >= 2:
    report_lines.append("\nPARTIAL SUPPORT")
    report_lines.append("\nThe research provides moderate evidence for the role of labor market")
    report_lines.append("indicators in migration patterns, though not all hypotheses were supported.")

else:
    report_lines.append("\nLIMITED SUPPORT")
    report_lines.append("\nThe evidence does not strongly support the hypothesis that labor market")
    report_lines.append("indicators predict migration patterns in this dataset.")

# Limitatiosn

report_lines.append("\n" + "="*80)
report_lines.append("LIMITATIONS")
report_lines.append("="*80)

report_lines.append("\nStudy Limitations:")
report_lines.append("   • Limited to 25 years of data (2000-2024)")
report_lines.append("   • Yearly aggregation misses within-year dynamics")
report_lines.append("   • Omitted variables:")
report_lines.append("     - Housing costs and availability")
report_lines.append("     - Education quality and university rankings")
report_lines.append("     - Cultural amenities and quality of life")
report_lines.append("     - Social networks and diaspora effects")
report_lines.append("     - Immigration policy changes")
report_lines.append("   • Correlation does not prove causation")
report_lines.append("   • Model performance varies by state (heterogeneity)")

# Future Research

report_lines.append("\n" + "="*80)
report_lines.append("FUTURE RESEARCH DIRECTIONS")
report_lines.append("="*80)

report_lines.append("\nRecommended Extensions:")
report_lines.append("   1. Include additional covariates:")
report_lines.append("      • Housing market indicators")
report_lines.append("      • GDP growth and regional economic indicators")
report_lines.append("      • Education and amenity indices")
report_lines.append("      • Cultural diversity measures")
report_lines.append("\n   2. Test causality:")
report_lines.append("      • Granger causality tests")
report_lines.append("      • Instrumental variable analysis")
report_lines.append("      • Difference-in-differences (policy changes)")
report_lines.append("\n   3. Extend analysis:")
report_lines.append("      • Analyze specific nationality groups separately")
report_lines.append("      • Use monthly/quarterly data for higher resolution")
report_lines.append("      • Compare with other European countries")
report_lines.append("      • Investigate regional spillover effects")

report_text = "\n".join(report_lines)

report_path = 'results/FINAL_REPORT.txt'
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report_text)

print(f"\nSaved: {report_path}")


Saved: results/FINAL_REPORT.txt


In [40]:
print("\n" + "="*80)
print("REPORT PREVIEW")
print("="*80)
print(report_text)


REPORT PREVIEW
FINAL RESEARCH REPORT
Migration and Labor Market Dynamics in Germany (2000-2024)

Analysis Date: 2026-01-13 15:29:33
Dataset: 16 German states, 2000-2024
Total Observations: 400
Test Period: 2020-2024

RESEARCH QUESTIONS

1. Do job vacancies predict foreign migration patterns?
2. Does labor market tightness predict foreign migration patterns?
3. Can labor market indicators improve migration forecasts?

METHODOLOGY

Data Analysis Approach:
   • Correlation analysis (Pearson) for H1 and H2
   • Global forecasting models trained on 16 states simultaneously
   • Baseline models: Naive, AutoARIMA (no covariates)
   • Global models: LinearReg, RandomForest, XGBoost, LightGBM (with covariates)
   • SHAP explainability for feature importance
   • Statistical significance testing (α = 0.05)

Key Variables:
   Target: migration_foreign (foreign migration balance)
   Covariates:
     • unemployment_rate
     • vacancies_sc (job vacancies subject to social contributions)
     • lab