<a href="https://colab.research.google.com/github/drstannwoji2019/ML_Projects/blob/main/TimeSeries_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Machine Learning libraries
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Statistical models
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# -------------------------------
# 1. Data Loading and Preprocessing
# -------------------------------

# Load dataset
df = pd.read_csv("/Remittance_5EngSpngCountries.csv")

# Assume the first column is the date
df.iloc[:,0] = pd.to_datetime(df.iloc[:,0])
df.set_index(df.columns[0], inplace=True)
df.sort_index(inplace=True)

# Choose target variable: "Rem_Ghana" if exists else first column
target = "Rem_Total" if "Rem_Total" in df.columns else df.columns[0]

# -------------------------------
# 2. Prepare Data for Machine Learning Models
# -------------------------------
def create_lag_features(series, lags=3):
    df_features = pd.DataFrame()
    for lag in range(1, lags + 1):
        df_features[f'lag_{lag}'] = series.shift(lag)
    return df_features

lags = 3
lag_features = create_lag_features(df[target], lags)
df_ml = pd.concat([df[target], lag_features], axis=1).dropna()

# Split into train and test sets (last 20% as test)
train_size = int(0.8 * len(df_ml))
train = df_ml.iloc[:train_size]
test = df_ml.iloc[train_size:]
X_train = train.drop(columns=[target])
y_train = train[target]
X_test = test.drop(columns=[target])
y_test = test[target]

# -------------------------------
# 3. Fit Machine Learning Models
# -------------------------------
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=50, random_state=42),  # Reduced n_estimators
    'XGBoost': XGBRegressor(objective='reg:squarederror', n_estimators=50, max_depth=3, random_state=42),
    'SVR': SVR()
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, preds)
    results[name] = {"MSE": mse, "RMSE": rmse, "MAE": mae}

# -------------------------------
# 4. Fit Statistical Time Series Models (ARIMA and SARIMA)
# -------------------------------
# ARIMA model
arima_order = (1, 1, 1)
arima_model = ARIMA(df[target], order=arima_order)
arima_fit = arima_model.fit()
forecast_arima = arima_fit.predict(start=test.index[0], end=test.index[-1], dynamic=True)
mse_arima = mean_squared_error(test[target], forecast_arima)
rmse_arima = np.sqrt(mse_arima)
mae_arima = mean_absolute_error(test[target], forecast_arima)
results["ARIMA"] = {"MSE": mse_arima, "RMSE": rmse_arima, "MAE": mae_arima}

# SARIMA model: seasonal order (1, 1, 1, 12)
sarima_order = (1, 1, 1)
seasonal_order = (1, 1, 1, 12)
sarima_model = SARIMAX(df[target], order=sarima_order, seasonal_order=seasonal_order,
                        enforce_stationarity=False, enforce_invertibility=False)
sarima_fit = sarima_model.fit(disp=False)
forecast_sarima = sarima_fit.predict(start=test.index[0], end=test.index[-1], dynamic=True)
mse_sarima = mean_squared_error(test[target], forecast_sarima)
rmse_sarima = np.sqrt(mse_sarima)
mae_sarima = mean_absolute_error(test[target], forecast_sarima)
results["SARIMA"] = {"MSE": mse_sarima, "RMSE": rmse_sarima, "MAE": mae_sarima}

# -------------------------------
# 5. Final Results Table
# -------------------------------
results_df = pd.DataFrame(results).T.reset_index().rename(columns={'index': 'Model'})
results_df = results_df.sort_values(by="RMSE")
print("\n--- Final Model Comparison ---")
print(results_df)


1    1970-01-01 00:00:00.000002005
2    1970-01-01 00:00:00.000002006
3    1970-01-01 00:00:00.000002007
4    1970-01-01 00:00:00.000002008
5    1970-01-01 00:00:00.000002009
6    1970-01-01 00:00:00.000002010
7    1970-01-01 00:00:00.000002011
8    1970-01-01 00:00:00.000002012
9    1970-01-01 00:00:00.000002013
10   1970-01-01 00:00:00.000002014
11   1970-01-01 00:00:00.000002015
12   1970-01-01 00:00:00.000002016
13   1970-01-01 00:00:00.000002017
14   1970-01-01 00:00:00.000002018
15   1970-01-01 00:00:00.000002019
16   1970-01-01 00:00:00.000002020
17   1970-01-01 00:00:00.000002021
18   1970-01-01 00:00:00.000002022
Name:   Year, dtype: datetime64[ns]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.iloc[:,0] = pd.to_datetime(df.iloc[:,0])
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters


--- Final Model Comparison ---
               Model           MSE          RMSE           MAE
2            XGBoost  6.661524e+18  2.580993e+09  2.354698e+09
1      Random Forest  8.425822e+18  2.902727e+09  2.442495e+09
3                SVR  9.744612e+18  3.121636e+09  2.656984e+09
0  Linear Regression  9.745320e+18  3.121749e+09  2.104831e+09
4              ARIMA  1.530499e+19  3.912160e+09  3.204629e+09
5             SARIMA  2.130437e+19  4.615666e+09  3.889952e+09


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
