<a href="https://colab.research.google.com/github/drstannwoji2019/SCM_Forecasting/blob/main/ForecastingModels_Ghana.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
data = pd.read_csv('/content/sample_data/FDI_Remittance_5EnglishSpeaking_WAfricanCountries.csv')
data.head()
print(data.head())
print(data.shape)
print(data.columns)
print(data.dtypes)
data.isnull().sum()
data.dropna(inplace=True)
data.duplicated().sum()
data.drop_duplicates(inplace=True)
data.tail()
print(data.tail())
data.info()
print(data.info())
data.describe()
print(data.describe())

# Prepare the data (lag features)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Create lagged features for forecasting
def create_lagged_features(df, column, lags):
    for lag in range(1, lags + 1):
        df[f'{column}_lag_{lag}'] = df[column].shift(lag)
    return df

# Load your dataset
file_path = '/content/sample_data/FDI_Remittance_5EnglishSpeaking_WAfricanCountries.csv'
data = pd.read_csv(file_path)

# Convert columns to numeric (remove commas)
data_cleaned = data.replace(',', '', regex=True).astype(float)

# Create lagged features for FDI and Remittances
lags = 3  # Using 3 previous time steps as features
data_lagged = create_lagged_features(data_cleaned, 'FDI_Ghana', lags)
data_lagged = create_lagged_features(data_lagged, 'Rem_Ghana', lags)

# Drop rows with missing values due to lagging
data_lagged.dropna(inplace=True)

# Features (X) and Target (y)
X = data_lagged[[f'FDI_Ghana_lag_{i}' for i in range(1, lags + 1)] +
                [f'Rem_Ghana_lag_{i}' for i in range(1, lags + 1)]]
y = data_lagged['Rem_Ghana']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


   Year  FDI_Gambia      FDI_Ghana  FDI_Liberia    FDI_Nigeria      FDI_SL  \
0  2004  55,526,319    139,270,000   75,351,732  1,874,060,887  61,153,314   
1  2005  53,650,280    144,970,000   82,802,111  4,982,533,930  90,731,670   
2  2006  82,208,103    636,010,000  107,856,672  4,854,353,979  58,869,144   
3  2007  78,094,821  1,383,177,930  131,637,662  6,036,021,405  95,470,171   
4  2008  70,792,382  2,714,916,344  283,536,077  8,194,071,895  53,095,068   

        FDI_Total  Rem_Gambia    Rem_Ghana Rem_Liberia     Rem_Nigeria  \
0   2,205,362,252  60,748,749   82,371,200  58,446,876   2,272,734,507   
1   5,354,687,991  59,304,088   99,184,576  31,854,956  14,640,084,310   
2   5,739,297,897  63,773,517  105,253,248  78,814,942  16,932,144,079   
3   7,724,401,989  55,662,540  117,363,712  61,977,178  18,014,430,787   
4  11,316,411,767  64,811,559  126,104,704  58,122,232  19,199,974,036   

       Rem_SL       Rem_Total  
0  24,714,834   2,499,016,167  
1   2,435,105  14,832,

In [31]:
# Train and Forecast using different ML models
# Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Forecasting and evaluation
y_pred_lr = lr_model.predict(X_test)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
print(f'Linear Regression MAE: {mae_lr}')


Linear Regression MAE: 944509513.0099945


In [32]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Forecasting and evaluation
y_pred_rf = rf_model.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f'Random Forest MAE: {mae_rf}')


Random Forest MAE: 1062843588.125


In [33]:
# XGBoost
import xgboost as xgb

# XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Forecasting and evaluation
y_pred_xgb = xgb_model.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print(f'XGBoost MAE: {mae_xgb}')


XGBoost MAE: 1082867556.75


In [34]:
# Support Vector Regression
from sklearn.svm import SVR

# Support Vector Regression (SVR)
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train, y_train)

# Forecasting and evaluation
y_pred_svr = svr_model.predict(X_test)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
print(f'SVR MAE: {mae_svr}')


SVR MAE: 2215249535.6819


In [35]:
# K-Nearest Neighbor
from sklearn.neighbors import KNeighborsRegressor

# k-Nearest Neighbors (kNN)
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Forecasting and evaluation
y_pred_knn = knn_model.predict(X_test)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
print(f'kNN MAE: {mae_knn}')


kNN MAE: 881047646.9499998


In [36]:
# Scale Dataset for SVR and kNN
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Use X_train_scaled and X_test_scaled in SVR and kNN models
# Replace X_train_scaled and X_test_scaled with your scaled data
y_pred_svr = svr_model.predict(X_test_scaled)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
print(f'SVR MAE: {mae_svr}')

y_pred_knn = knn_model.predict(X_test_scaled)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
print(f'kNN MAE: {mae_knn}')


SVR MAE: 2215249539.5190535
kNN MAE: 3760801608.15




In [37]:
print(f'Comparison of MAE values:')
print(f'Linear Regression MAE: {mae_lr}')
print(f'Random Forest MAE: {mae_rf}')
print(f'XGBoost MAE: {mae_xgb}')
print(f'SVR MAE: {mae_svr}')
print(f'kNN MAE: {mae_knn}')

Comparison of MAE values:
Linear Regression MAE: 944509513.0099945
Random Forest MAE: 1062843588.125
XGBoost MAE: 1082867556.75
SVR MAE: 2215249539.5190535
kNN MAE: 3760801608.15


In [38]:
# Reload data to model using statistical models
# ARIMA and SARIMA
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import numpy as np

# Load your dataset
file_path = '/content/sample_data/FDI_Remittance_5EnglishSpeaking_WAfricanCountries.csv'
data = pd.read_csv(file_path)

# Convert columns to numeric (remove commas)
data_cleaned = data.replace(',', '', regex=True).astype(float)

# Select the time series column to forecast (e.g., Rem_Ghana)
time_series = data_cleaned['Rem_Ghana']

# Split data into train and test sets
train_size = int(len(time_series) * 0.8)
train, test = time_series[:train_size], time_series[train_size:]


In [39]:
import statsmodels.api as sm

# ARIMA model
arima_model = sm.tsa.ARIMA(train, order=(5, 1, 0))  # (p=5, d=1, q=0)
arima_result = arima_model.fit()

# Forecast
arima_forecast = arima_result.forecast(steps=len(test))

# Calculate MAE
mae_arima = mean_absolute_error(test, arima_forecast)
print(f'ARIMA MAE: {mae_arima}')


ARIMA MAE: 732808277.1383209


  warn('Non-stationary starting autoregressive parameters'


In [40]:
# SARIMA model
sarima_model = sm.tsa.SARIMAX(train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))  # Adjust order and seasonal order
sarima_result = sarima_model.fit()

# Forecast
sarima_forecast = sarima_result.forecast(steps=len(test))

# Calculate MAE
mae_sarima = mean_absolute_error(test, sarima_forecast)
print(f'SARIMA MAE: {mae_sarima}')


  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'


SARIMA MAE: 730253912.3523346




In [41]:
print(f'Comparison of MAE values:')
print(f'Linear Regression MAE: {mae_lr}')
print(f'Random Forest MAE: {mae_rf}')
print(f'XGBoost MAE: {mae_xgb}')
print(f'SVR MAE: {mae_svr}')
print(f'kNN MAE: {mae_knn}')
print(f'ARIMA MAE: {mae_arima}')
print(f'SARIMA MAE: {mae_sarima}')

Comparison of MAE values:
Linear Regression MAE: 944509513.0099945
Random Forest MAE: 1062843588.125
XGBoost MAE: 1082867556.75
SVR MAE: 2215249539.5190535
kNN MAE: 3760801608.15
ARIMA MAE: 732808277.1383209
SARIMA MAE: 730253912.3523346


In [42]:
mae_df = pd.DataFrame(mae_results, columns=['Model', 'MAE'])
print(mae_df)

               Model           MAE
0              ARIMA  7.328083e+08
1             SARIMA  7.302539e+08
2  Linear Regression  9.445095e+08
3      Random Forest  1.062844e+09
4            XGBoost  1.099773e+09
5                SVR  2.215250e+09
6                kNN  8.810476e+08


In [27]:
!pip install ace_tools
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = '/content/sample_data/FDI_Remittance_5EnglishSpeaking_WAfricanCountries.csv'
data = pd.read_csv(file_path)

# Convert columns to numeric (remove commas)
data_cleaned = data.replace(',', '', regex=True).astype(float)

# List of countries and columns for forecasting
countries = ['Ghana', 'Gambia', 'Liberia', 'Nigeria', 'SL']
results = []

# Function to prepare lagged features
def create_lagged_features(df, column, lags):
    for lag in range(1, lags + 1):
        df[f'{column}_lag_{lag}'] = df[column].shift(lag)
    return df

# Function to calculate MAE for a given country
def calculate_mae(country):
    fdi_col = f'FDI_{country}'
    rem_col = f'Rem_{country}'

    # Create lagged features
    data_lagged = create_lagged_features(data_cleaned, fdi_col, 3)
    data_lagged = create_lagged_features(data_lagged, rem_col, 3)

    # Drop rows with missing values
    data_lagged.dropna(inplace=True)

    # Features (X) and Target (y)
    X = data_lagged[[f'{fdi_col}_lag_{i}' for i in range(1, 4)] + [f'{rem_col}_lag_{i}' for i in range(1, 4)]]
    y = data_lagged[rem_col]

    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Initialize dictionary to store MAEs
    mae_results = {'Country': country}

    # ARIMA
    arima_model = sm.tsa.ARIMA(y_train, order=(5, 1, 0))
    arima_result = arima_model.fit()
    arima_forecast = arima_result.forecast(steps=len(y_test))
    mae_results['ARIMA'] = mean_absolute_error(y_test, arima_forecast)

    # SARIMA
    sarima_model = sm.tsa.SARIMAX(y_train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
    sarima_result = sarima_model.fit()
    sarima_forecast = sarima_result.forecast(steps=len(y_test))
    mae_results['SARIMA'] = mean_absolute_error(y_test, sarima_forecast)

    # Linear Regression
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    y_pred_lr = lr_model.predict(X_test)
    mae_results['Linear Regression'] = mean_absolute_error(y_test, y_pred_lr)

    # Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)
    mae_results['Random Forest'] = mean_absolute_error(y_test, y_pred_rf)

    # XGBoost
    xgb_model = xgb.XGBRegressor(n_estimators=50, learning_rate=0.1, random_state=42)
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_test)
    mae_results['XGBoost'] = mean_absolute_error(y_test, y_pred_xgb)

    # SVR
    svr_model = SVR(kernel='rbf')
    svr_model.fit(X_train, y_train)
    y_pred_svr = svr_model.predict(X_test)
    mae_results['SVR'] = mean_absolute_error(y_test, y_pred_svr)

    # k-Nearest Neighbors (kNN)
    knn_model = KNeighborsRegressor(n_neighbors=3)
    knn_model.fit(X_train, y_train)
    y_pred_knn = knn_model.predict(X_test)
    mae_results['kNN'] = mean_absolute_error(y_test, y_pred_knn)

    return mae_results

# Loop through each country and calculate the MAEs
for country in countries:
    results.append(calculate_mae(country))

# Convert the results into a DataFrame and display
mae_df = pd.DataFrame(results)

# Display the MAE comparison table
print(mae_df)





  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dat

   Country         ARIMA        SARIMA  Linear Regression  Random Forest  \
0    Ghana  2.710306e+09  7.011557e+08       9.445095e+08   1.062844e+09   
1   Gambia  2.025717e+08  1.933299e+08       1.318399e+08   2.607201e+08   
2  Liberia  4.061435e+07  2.746446e+08       4.038323e+08   1.754494e+08   
3  Nigeria  4.908083e+09  2.597961e+09       1.113186e+10   7.384942e+08   
4       SL  8.949471e+07  8.925458e+07       8.550525e+07   1.196104e+08   

        XGBoost           SVR           kNN  
0  1.099773e+09  2.215250e+09  1.021856e+09  
1  2.225270e+08  3.467132e+08  2.724292e+08  
2  1.356000e+08  1.738875e+08  2.183250e+08  
3  2.034711e+09  2.231509e+09  1.794753e+09  
4  9.208040e+07  1.441115e+08  1.259796e+08  
