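#### **GitHub–Colab Integration**
This section sets up the workflow for working on the project's GitHub repository from Google Colab: it clones the repository into the Colab VM and configures git so that changes can be committed and pushed.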

In [None]:
import os
from getpass import getpass

In [None]:
# GitHub config
# Identity and repository settings consumed by the clone / git-config cell.

# Account that owns the repository; also used as the user part of the clone URL
GITHUB_USERNAME = "chiraagmishra"
# Repository name; the local checkout path is built from it (/content/<REPO_NAME>)
REPO_NAME = "urban-technology-project"
# Author identity written to the global git config (user.email / user.name)
GITHUB_EMAIL = "chiraag.cm@gmail.com"
GITHUB_NAME = "Chiraag Mishra"

In [None]:
# Expected location of the local checkout
repo_path = f"/content/{REPO_NAME}"

# Authenticate (token hidden)
# getpass keeps the token out of the notebook source.
# NOTE(review): the prompt also runs when the repository already exists, in which
# case this cell does not use the token.
token = getpass("Paste GitHub Personal Access Token: ")

# Clone repo with credentials
# NOTE(review): the token is embedded in the clone URL, so git presumably keeps it
# in the checkout's remote URL (.git/config) — confirm, and avoid sharing the
# checkout or printing `git remote -v` output.
# NOTE(review): `git clone` writes into the current working directory, while the
# existence check looks at /content/<REPO_NAME>; this assumes the cell is run
# from /content — confirm.
if not os.path.exists(repo_path):
    !git clone https://{GITHUB_USERNAME}:{token}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git
else:
    print("Repository already exists.")

# Navigate and configure
# Later cells use paths relative to the repository root (e.g. data/processed/...),
# so the working directory is switched to the checkout.
%cd {repo_path}

# Commit identity and safe.directory entry are written to the global git config
!git config --global user.email "{GITHUB_EMAIL}"
!git config --global user.name "{GITHUB_NAME}"
!git config --global --add safe.directory {repo_path}

print("GitHub set-up. Ready for commit & push from Colab.")

Paste GitHub Personal Access Token: ··········
Cloning into 'urban-technology-project'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (54/54), done.[K
remote: Total 64 (delta 23), reused 32 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (64/64), 4.73 MiB | 7.08 MiB/s, done.
Resolving deltas: 100% (23/23), done.
/content/urban-technology-project
GitHub set-up. Ready for commit & push from Colab.


#### **Imports and loads**

In [None]:
!pip install -q darts statsforecast

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.6/354.6 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.4/287.4 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.7/280.7 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.7/204.7 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import numpy as np
import pandas as pd
import pickle
import os
import traceback
from datetime import datetime

from darts import TimeSeries
from darts.models import (
    AutoARIMA,
    LinearRegressionModel,
    RandomForest,
    LightGBMModel,
    XGBModel
)

In [None]:
# Load the processed state-year panel (path is relative to the repository root)
df = pd.read_csv('data/processed/migration_labor_with_features.csv')

# Quick sanity summary: dimensions, covered years and number of distinct states
print(
    f"  Shape: {df.shape}",
    f"  Period: {df['year'].min()}-{df['year'].max()}",
    f"  States: {df['state'].nunique()}",
    sep="\n",
)

  Shape: (400, 13)
  Period: 2000-2024
  States: 16


In [None]:
# Per-column count of null entries
missing = df.isnull().sum()

# Counts are non-negative, so "any non-zero count" is the same as "total > 0"
if missing.any():
    print("\nMissing values detected:")
    print(missing.loc[missing > 0])
    print("\nDropping rows with missing values...")
    # Rows containing any null are removed; df is rebound to the reduced frame
    df = df.dropna()
    print(f"New shape: {df.shape}")

# Preview the first rows of the (possibly reduced) frame
df.head()

Unnamed: 0,state,year,migration_foreign,migration_foreign_male,migration_foreign_female,migration_german,migration_total,unemployment_rate,vacancies_total,vacancies_sc,unemployed_count,labor_market_tightness,vacancy_rate
0,Baden-Württemberg,2000,6418,-1142,7560,4702,11120,5.4,78669,75810,281500,0.269306,269.306326
1,Baden-Württemberg,2001,24903,10120,14783,-158,24745,4.9,73514,70418,264301,0.26643,266.430069
2,Baden-Württemberg,2002,18590,5913,12677,-1749,16841,5.4,57496,54136,295005,0.183508,183.508132
3,Baden-Württemberg,2003,8036,-11,8047,-3749,4287,6.2,37759,34494,336881,0.102392,102.391935
4,Baden-Württemberg,2004,3586,-2034,5620,-5576,-1990,6.2,29907,26861,340943,0.078784,78.784199


#### **Define covariates & prepare time series**

In [None]:
# Column to forecast
TARGET_COL = 'migration_foreign'

# Covariates (exogenous variables for hypothesis testing)
# These columns are bundled into one multi-component covariate series per state.
# NOTE(review): in the sample rows displayed after loading, vacancy_rate equals
# labor_market_tightness x 1000, so the two columns may be perfectly collinear —
# confirm whether both should be kept.
COVARIATE_COLS = [
    'unemployment_rate',        # H1, H2: Core labor market indicator
    'vacancies_sc',             # H1: Job demand signal
    'labor_market_tightness',   # H2: Key hypothesis variable
    'unemployed_count',         # H1, H2: Labor supply
    'vacancy_rate'              # Alternative tightness measure
]

# Number of trailing years of every state's series held out as the test set
TEST_SIZE = 5

In [None]:
# PREPARE TIME SERIES LISTS (One series per state)

def prepare_timeseries_lists(df, target_col, covariate_cols, min_years=10):
    """
    Create lists of TimeSeries objects for global model training

    One target series and one covariate series are built per state, visiting
    the states in sorted order, so the three returned lists are index-aligned.
    States with fewer than `min_years` rows are reported and skipped.

    Parameters:
    -----------
    df : pandas.DataFrame
        Long-format panel with a 'state' column, a 'year' column and the
        target / covariate columns
    target_col : str
        Name of the column to forecast
    covariate_cols : list of str
        Names of the covariate columns
    min_years : int, default 10
        Minimum number of rows (years) a state needs in order to be included

    Returns:
    --------
    target_list : list of TimeSeries
        One target series per state
    covariate_list : list of TimeSeries
        One covariate series per state (aligned with targets)
    state_names : list of str
        State names (for tracking)
    """
    target_list = []
    covariate_list = []
    state_names = []

    states = sorted(df['state'].unique())

    for state in states:
        # Copy so that adding the helper datetime column never touches df
        state_data = df[df['state'] == state].sort_values('year').copy()

        if len(state_data) < min_years:
            print(f"Skipping {state}: insufficient data ({len(state_data)} years)")
            continue

        # Years are turned into timestamps (1 January of each year) for the time axis
        state_data['year_dt'] = pd.to_datetime(state_data['year'], format='%Y')

        # Target TimeSeries
        target_series = TimeSeries.from_dataframe(
            state_data,
            time_col='year_dt',
            value_cols=target_col,
            freq='YS'
        )

        # Covariate TimeSeries
        cov_series = TimeSeries.from_dataframe(
            state_data,
            time_col='year_dt',
            value_cols=covariate_cols,
            freq='YS'
        )

        target_list.append(target_series)
        covariate_list.append(cov_series)
        state_names.append(state)

        print(f"{state:<25} : {len(target_series)} years")

    print(f"\nPrepared {len(target_list)} states for training")
    print(f"  Total data points: {sum(len(ts) for ts in target_list)}")

    return target_list, covariate_list, state_names

In [None]:
# Build the per-state target and covariate series from the loaded panel
target_list, covariate_list, state_names = prepare_timeseries_lists(
    df=df,
    target_col=TARGET_COL,
    covariate_cols=COVARIATE_COLS,
)

Baden-Württemberg         : 25 years
Bayern                    : 25 years
Berlin                    : 25 years
Brandenburg               : 25 years
Bremen                    : 25 years
Hamburg                   : 25 years
Hessen                    : 25 years
Mecklenburg-Vorpommern    : 25 years
Niedersachsen             : 25 years
Nordrhein-Westfalen       : 25 years
Rheinland-Pfalz           : 25 years
Saarland                  : 25 years
Sachsen                   : 25 years
Sachsen-Anhalt            : 25 years
Schleswig-Holstein        : 25 years
Thüringen                 : 25 years

Prepared 16 states for training
  Total data points: 400


#### **Train-test split**

In [None]:
def split_timeseries_lists(target_list, covariate_list, test_size=5):
    """
    Split lists of TimeSeries into train and test sets
    We split each state's series at the same point (global model)

    The last `test_size` steps of every series form the test part and
    everything before them forms the training part. A short summary of the
    split is printed.

    Parameters:
    -----------
    target_list : list of TimeSeries
    covariate_list : list of TimeSeries
        Must have the same number of entries as target_list
    test_size : int
        Number of years to reserve for testing (must be positive and smaller
        than the length of every series)

    Returns:
    --------
    train_targets : list of TimeSeries
    test_targets : list of TimeSeries
    train_covariates : list of TimeSeries
    test_covariates : list of TimeSeries

    Raises:
    -------
    ValueError
        If test_size is not positive, if the two lists differ in length, or
        if a series is not longer than test_size
    """
    # x[:-0] is empty and x[-0:] is the whole series, so a zero (or negative)
    # test_size would silently produce a wrong split
    if test_size <= 0:
        raise ValueError(f"test_size must be a positive integer, got {test_size}")

    # zip() would silently drop the surplus entries of the longer list
    if len(target_list) != len(covariate_list):
        raise ValueError(
            f"target_list and covariate_list differ in length "
            f"({len(target_list)} vs {len(covariate_list)})"
        )

    train_targets = []
    test_targets = []
    train_covariates = []
    test_covariates = []

    for position, (target, cov) in enumerate(zip(target_list, covariate_list)):
        # Every series must leave at least one step for training
        if len(target) <= test_size or len(cov) <= test_size:
            raise ValueError(
                f"Series at position {position} is too short for test_size={test_size} "
                f"(target length {len(target)}, covariate length {len(cov)})"
            )

        # Split each state's series
        train_targets.append(target[:-test_size])
        test_targets.append(target[-test_size:])
        train_covariates.append(cov[:-test_size])
        test_covariates.append(cov[-test_size:])

    # Verify split
    print(f"Split {len(target_list)} states")

    # Nothing to report for empty input; the summary indexes the first series
    if not train_targets:
        return train_targets, test_targets, train_covariates, test_covariates

    print("\nPer state:")
    print(f"  Training years: {len(train_targets[0])}")
    print(f"  Test years: {len(test_targets[0])}")

    test_years = test_targets[0].time_index.year.tolist()
    print(f"\nTest period: {test_years}")

    # Total data points
    total_train_points = sum(len(ts) for ts in train_targets)
    total_test_points = sum(len(ts) for ts in test_targets)

    print("\nTotal data points:")
    print(f"  Training: {total_train_points} ({len(train_targets)} states × {len(train_targets[0])} years)")
    print(f"  Test: {total_test_points} ({len(test_targets)} states × {len(test_targets[0])} years)")

    return train_targets, test_targets, train_covariates, test_covariates

In [None]:
# Hold out the last TEST_SIZE years of every state's series
train_targets, test_targets, train_covariates, test_covariates = split_timeseries_lists(
    target_list=target_list,
    covariate_list=covariate_list,
    test_size=TEST_SIZE,
)

Split 16 states

Per state:
  Training years: 20
  Test years: 5

Test period: [2020, 2021, 2022, 2023, 2024]

Total data points:
  Training: 320 (16 states × 20 years)
  Test: 80 (16 states × 5 years)


#### **Model Training Setup**

In [None]:
# Output directory for pickled models
os.makedirs('models', exist_ok=True)

# Registry of fitted models, keyed by model name
trained_models = dict()

# Forecast containers, keyed first by data split and then by model name:
#   'train' -> in-sample predictions, 'test' -> out-of-sample forecasts
predictions = dict(train={}, test={})

# Forecast horizon = length of the first state's held-out series
n_forecast = len(test_targets[0])

#### **Baseline Models (No Labor Market Covariates)**

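Both baselines use only each state's own migration history (no labor-market covariates): a naive persistence forecast, and one AutoARIMA model fitted per state. (Alternatives considered: training on a concatenated or state-averaged series, or averaging the per-state AutoARIMA predictions.)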

In [None]:
# BASELINE 1: Naive (Persistence Model)
# Assumption: every forecast year equals the last observed training year
# (y_T+h = y_T for each step h of the test window)
try:
    baseline_predictions_test_naive = []

    for train_target, test_target in zip(train_targets, test_targets):
        # Repeat last training value for all forecast steps
        last_value = train_target.last_value()

        # The forecast is sized from this state's own test window, so values and
        # time index always agree even if test windows differ between states
        naive_pred = TimeSeries.from_times_and_values(
            times=test_target.time_index,
            values=last_value * np.ones(len(test_target))
        )
        baseline_predictions_test_naive.append(naive_pred)

    # Store
    # There is no fitted object for this baseline; a descriptive string marks it
    trained_models['Naive'] = 'persistence_model'
    predictions['test']['Naive'] = baseline_predictions_test_naive

except Exception as e:
    # Failure is reported but not re-raised, so later cells run without 'Naive'
    print(f"Naive baseline failed: {e}")
    traceback.print_exc()


In [None]:
# BASELINE 2: AutoARIMA (Univariate Model)
# One model is fitted per state on that state's training target only; no
# covariates are passed. Forecasts are collected in the order of train_targets.
try:
    baseline_predictions_test_arima = []

    for i, (train_target, test_target) in enumerate(zip(train_targets, test_targets)):
        # Train AutoARIMA on this state
        # A fresh model is created on every iteration, so nothing is shared
        # between states. The keyword arguments are forwarded to AutoARIMA as-is.
        model_arima = AutoARIMA(
            start_p=1,
            max_p=3,
            start_q=1,
            max_q=3,
            max_d=2,
            seasonal=False,
            stepwise=True,
            trace=False,
            random_state=42
        )

        model_arima.fit(train_target)

        # Predict
        # n_forecast steps directly after the end of the training series
        pred = model_arima.predict(n=n_forecast)
        baseline_predictions_test_arima.append(pred)

        # Progress message after every fifth state
        if (i + 1) % 5 == 0:
            print(f"  Completed {i + 1}/{len(train_targets)} states")

    # Store
    # The per-state model objects are not kept; a descriptive string marks the entry
    trained_models['AutoARIMA'] = 'per_state_models'
    predictions['test']['AutoARIMA'] = baseline_predictions_test_arima

    print(f"AutoARIMA baseline complete")
    print(f"Trained {len(train_targets)} separate models (one per state)")

except Exception as e:
    # Failure is reported but not re-raised, so later cells run without 'AutoARIMA'
    print(f"AutoARIMA failed: {e}")
    traceback.print_exc()

  Completed 5/16 states
  Completed 10/16 states
  Completed 15/16 states
AutoARIMA baseline complete
Trained 16 separate models (one per state)


#### **Global models with covariates**

Each algorithm below is trained as ONE global model fitted on ALL states simultaneously, with the labor-market covariates supplied as future covariates.

In [None]:
# MODEL 3: Linear Regression (Global)
# A single model is fitted on the training series of all states together.
try:
    model_lr = LinearRegressionModel(
        lags=3,                      # Use 3 past values of target
        lags_future_covariates=[0],  # Use current year's labor market data
        output_chunk_length=1        # Predict 1 step at a time
    )

    print("Training on all states...")
    model_lr.fit(train_targets, future_covariates=train_covariates)

    print("Generating predictions...")
    # NOTE(review): the forecast is conditioned on test_covariates, i.e. the
    # observed labor-market values of the forecast years are treated as known.
    pred_lr = model_lr.predict(
        n=n_forecast,
        series=train_targets,
        future_covariates=test_covariates
    )

    # Store
    # pred_lr is later iterated alongside state_names (one forecast per state)
    trained_models['LinearReg'] = model_lr
    predictions['test']['LinearReg'] = pred_lr

    print(f"Linear Regression complete")

except Exception as e:
    # Failure is reported but not re-raised, so later cells run without 'LinearReg'
    print(f"Linear Regression failed: {e}")
    traceback.print_exc()

Training on all states...
Generating predictions...
Linear Regression complete
Configuration: lags=3, covariates=5


In [None]:
# MODEL 4: Random Forest
# Same lag / covariate setup as the linear regression cell: a single model is
# fitted on the training series of all states together.
try:
    model_rf = RandomForest(
        lags=3,
        lags_future_covariates=[0],
        output_chunk_length=1,
        n_estimators=100,
        max_depth=5,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1                    # Use all CPU cores
    )

    print("Training on all states...")
    model_rf.fit(train_targets, future_covariates=train_covariates)

    print("Generating predictions...")
    # NOTE(review): the forecast is conditioned on test_covariates, i.e. the
    # observed labor-market values of the forecast years are treated as known.
    pred_rf = model_rf.predict(
        n=n_forecast,
        series=train_targets,
        future_covariates=test_covariates
    )

    # Store
    trained_models['RandomForest'] = model_rf
    predictions['test']['RandomForest'] = pred_rf

    print(f"Random Forest complete")

except Exception as e:
    # Failure is reported but not re-raised, so later cells run without 'RandomForest'
    print(f"Random Forest failed: {e}")
    traceback.print_exc()



Training on all states...
Generating predictions...
Random Forest complete


In [None]:
# MODEL 5: XGBoost
# Same lag / covariate setup as the other global models: a single model is
# fitted on the training series of all states together.
try:
    model_xgb = XGBModel(
        lags=3,
        lags_future_covariates=[0],
        output_chunk_length=1,
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )

    print("Training on all states...")
    model_xgb.fit(train_targets, future_covariates=train_covariates)

    print("Generating predictions...")
    # NOTE(review): the forecast is conditioned on test_covariates, i.e. the
    # observed labor-market values of the forecast years are treated as known.
    pred_xgb = model_xgb.predict(
        n=n_forecast,
        series=train_targets,
        future_covariates=test_covariates
    )

    # Store
    trained_models['XGBoost'] = model_xgb
    predictions['test']['XGBoost'] = pred_xgb

    print(f"XGBoost complete")

except Exception as e:
    # Failure is reported but not re-raised, so later cells run without 'XGBoost'
    print(f"XGBoost failed: {e}")
    traceback.print_exc()

Training on all states...
Generating predictions...
XGBoost complete


In [None]:
# MODEL 6: LightGBM (Global)
# Same lag / covariate setup as the other global models: a single model is
# fitted on the training series of all states together.
try:
    # NOTE(review): in LightGBM, row subsampling (subsample) reportedly only
    # takes effect when a bagging frequency (subsample_freq) > 0 is set, which
    # is not done here — confirm against the LightGBM parameter docs.
    model_lgb = LightGBMModel(
        lags=3,
        lags_future_covariates=[0],
        output_chunk_length=1,
        n_estimators=100,
        max_depth=5,
        num_leaves=31,
        learning_rate=0.1,
        min_child_samples=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1
    )

    print("Training on all states...")
    model_lgb.fit(train_targets, future_covariates=train_covariates)

    print("Generating predictions...")
    # NOTE(review): the forecast is conditioned on test_covariates, i.e. the
    # observed labor-market values of the forecast years are treated as known.
    pred_lgb = model_lgb.predict(
        n=n_forecast,
        series=train_targets,
        future_covariates=test_covariates
    )

    # Store
    trained_models['LightGBM'] = model_lgb
    predictions['test']['LightGBM'] = pred_lgb

    print(f"LightGBM complete")
    print(f"Configuration: lags=3, n_estimators=100, max_depth=5")

except Exception as e:
    # Failure is reported but not re-raised, so later cells run without 'LightGBM'
    print(f"LightGBM failed: {e}")
    traceback.print_exc()

Training on all states...
Generating predictions...
LightGBM complete
Configuration: lags=3, n_estimators=100, max_depth=5




#### **Save global models**

In [None]:
# Make sure the output directories exist before anything is written to them;
# exist_ok=True makes the cell safe to re-run
os.makedirs('models', exist_ok=True)
os.makedirs('results/predictions', exist_ok=True)

In [None]:
# Save models
# Fitted model objects are pickled to models/<name>_global.pkl. Baselines are
# represented in trained_models by a descriptive string and are only reported.
for model_name, model_obj in trained_models.items():
    if not isinstance(model_obj, str):
        try:
            model_path = f'models/{model_name}_global.pkl'
            with open(model_path, 'wb') as f:
                pickle.dump(model_obj, f)
            print(f"{model_name}: {model_path}")
        except Exception as e:
            # A model that cannot be written is reported; the loop carries on
            print(f"{model_name}: Failed to save - {e}")
    else:
        # String markers (e.g., 'per_state_models', 'persistence_model') have nothing to pickle
        print(f"{model_name}: {model_obj} (not saved - simple baseline)")

AutoARIMA: per_state_models (not saved - simple baseline)
Naive: persistence_model (not saved - simple baseline)
LinearReg: models/LinearReg_global.pkl
RandomForest: models/RandomForest_global.pkl
XGBoost: models/XGBoost_global.pkl
LightGBM: models/LightGBM_global.pkl


In [None]:
# Save Predictions (Test Set)
# Only the 'test' entry of the predictions dict is written: a mapping from model
# name to that model's list of forecasts.

# Saved as pickle (to preserve TimeSeries objects)
# NOTE: pickle files should only be loaded from trusted sources.
predictions_path = 'results/predictions/test_predictions.pkl'
with open(predictions_path, 'wb') as f:
    pickle.dump(predictions['test'], f)
print(f"Test predictions: {predictions_path}")

Test predictions: results/predictions/test_predictions.pkl


In [None]:
# Save state names for reference
# Metadata needed to interpret the pickled forecasts: the state order, the
# calendar years of the test window, and the two sizes.
state_info = dict(
    state_names=state_names,
    test_years=test_targets[0].time_index.year.tolist(),
    n_states=len(state_names),
    n_forecast=n_forecast,
)

state_info_path = 'results/predictions/state_info.pkl'
with open(state_info_path, 'wb') as f:
    pickle.dump(state_info, f)
print(f"State info: {state_info_path}")

State info: results/predictions/state_info.pkl


In [None]:
# Save Predictions as CSV (for easy inspection)
# One row per (model, state, forecast year); forecasts are matched to states by
# their position in the list.
prediction_records = [
    {
        'state': state,
        'year': time_idx.year,
        'model': model_name,
        'predicted_migration': value.item()
    }
    for model_name, pred_list in predictions['test'].items()
    for state, pred_series in zip(state_names, pred_list)
    for time_idx, value in zip(pred_series.time_index, pred_series.values())
]

df_predictions = pd.DataFrame(prediction_records)

# Actual values for comparison: one row per (state, test year)
actual_records = [
    {
        'state': state,
        'year': time_idx.year,
        'actual_migration': value.item()
    }
    for state, test_series in zip(state_names, test_targets)
    for time_idx, value in zip(test_series.time_index, test_series.values())
]

df_actual = pd.DataFrame(actual_records)

# Merge predictions with actuals (left join keeps every prediction row)
df_predictions_full = pd.merge(
    df_predictions,
    df_actual,
    on=['state', 'year'],
    how='left'
)

csv_path = 'results/predictions/predictions_vs_actual.csv'
df_predictions_full.to_csv(csv_path, index=False)
print(f"CSV format: {csv_path}")

# Show every model's forecast for the first state in the first test year
print("\nSample predictions (first state, first year):")
is_first_state = df_predictions_full['state'] == state_names[0]
is_first_year = df_predictions_full['year'] == test_targets[0].time_index.year[0]
sample = df_predictions_full[is_first_state & is_first_year]
print(sample.to_string(index=False))

CSV format: results/predictions/predictions_vs_actual.csv

Sample predictions (first state, first year):
            state  year        model  predicted_migration  actual_migration
Baden-Württemberg  2020    AutoARIMA         55147.000000           32258.0
Baden-Württemberg  2020        Naive         55147.000000           32258.0
Baden-Württemberg  2020    LinearReg         55530.823471           32258.0
Baden-Württemberg  2020 RandomForest         82324.599963           32258.0
Baden-Württemberg  2020      XGBoost         76441.390625           32258.0
Baden-Württemberg  2020     LightGBM        117725.160371           32258.0


In [None]:
# Summary
# Recap of which models were registered, which files were written and the
# dimensions of the forecast set.
print("\nModels Trained:")
for model_name in trained_models:
    print(f"- {model_name}")

# String entries in trained_models are baseline markers and were not pickled
n_saved_models = sum(not isinstance(m, str) for m in trained_models.values())

print("\nOutputs Saved:")
print(f"  • models/*.pkl ({n_saved_models} files)")
print("  • results/predictions/test_predictions.pkl")
print("  • results/predictions/state_info.pkl")
print("  • results/predictions/predictions_vs_actual.csv")

print("\nPrediction Details:")
print(f"  - States: {len(state_names)}")
print(f"  - Test period: {test_targets[0].time_index.year[0]}-{test_targets[0].time_index.year[-1]}")
print(f"  - Forecast horizon: {n_forecast} years")
print(f"  - Total predictions per model: {len(state_names) * n_forecast}")


Models Trained:
- AutoARIMA
- Naive
- LinearReg
- RandomForest
- XGBoost
- LightGBM

Outputs Saved:
  • models/*.pkl (4 files)
  • results/predictions/test_predictions.pkl
  • results/predictions/state_info.pkl
  • results/predictions/predictions_vs_actual.csv

Prediction Details:
  - States: 16
  - Test period: 2020-2024
  - Forecast horizon: 5 years
  - Total predictions per model: 80
