In [6]:
import pandas as pd 
import numpy as np 

In [7]:
df = pd.read_csv("./Data/revenue_forecasting_monthly_dummy_data.csv", parse_dates=["Date"])
df.head(10)

Unnamed: 0,Date,Revenue,Exchange_Rate_NGN_USD,Inflation_Rate_Percent
0,2000-01-31,524835.707651,414.049593,12.471229
1,2000-02-29,493086.784941,368.865024,13.54173
2,2000-03-31,532384.426905,389.593887,9.042828
3,2000-04-30,576151.49282,375.349953,14.287508
4,2000-05-31,488292.331264,370.531762,12.676993
5,2000-06-30,488293.152153,442.480105,11.169424
6,2000-07-31,578960.640775,417.850774,13.265564
7,2000-08-31,538371.736458,365.35452,16.541386
8,2000-09-30,476526.280703,444.979994,12.363733
9,2000-10-31,527128.002179,415.364976,12.496441


In [8]:
df.shape

(288, 4)

In [9]:
df.isna().sum()

Date                      0
Revenue                   0
Exchange_Rate_NGN_USD     0
Inflation_Rate_Percent    0
dtype: int64

In [10]:
df.dtypes

Date                      datetime64[ns]
Revenue                          float64
Exchange_Rate_NGN_USD            float64
Inflation_Rate_Percent           float64
dtype: object

In [11]:
df.describe()

Unnamed: 0,Date,Revenue,Exchange_Rate_NGN_USD,Inflation_Rate_Percent
count,288,288.0,288.0,288.0
mean,2012-01-14 21:30:00,499445.23082,398.980104,12.083752
min,2000-01-31 00:00:00,337936.632997,300.0,6.606227
25%,2006-01-23 06:00:00,464533.002314,365.269321,10.830784
50%,2012-01-15 12:00:00,502594.513959,399.002055,12.054409
75%,2018-01-07 18:00:00,530758.702679,431.15837,13.370354
max,2023-12-31 00:00:00,692636.574533,553.94404,17.264764
std,,49843.446944,47.570196,1.972442


## Feature Engineering

In [12]:
#Lag Features 

df['Revenue_Y-1'] = df['Revenue'].shift(12)
df['Exchange_Rate_Y-1'] = df['Exchange_Rate_NGN_USD'].shift(12)
df['Inflation_Rate_Y-1'] = df['Inflation_Rate_Percent'].shift(12)

#Revenue Growth % compared to same month previous year
df['Revenue_Growth(%)'] = ((df['Revenue'] - df['Revenue_Y-1']) / df['Revenue_Y-1']) * 100

In [13]:
#Dealing with date columns 

df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Quarter'] = df['Date'].dt.quarter

In [14]:
# Drop the original Date column
df = df.drop(columns=['Date'])

In [15]:
df.head(20)

Unnamed: 0,Revenue,Exchange_Rate_NGN_USD,Inflation_Rate_Percent,Revenue_Y-1,Exchange_Rate_Y-1,Inflation_Rate_Y-1,Revenue_Growth(%),Year,Month,Quarter
0,524835.707651,414.049593,12.471229,,,,,2000,1,1
1,493086.784941,368.865024,13.54173,,,,,2000,2,1
2,532384.426905,389.593887,9.042828,,,,,2000,3,1
3,576151.49282,375.349953,14.287508,,,,,2000,4,2
4,488292.331264,370.531762,12.676993,,,,,2000,5,2
5,488293.152153,442.480105,11.169424,,,,,2000,6,2
6,578960.640775,417.850774,13.265564,,,,,2000,7,3
7,538371.736458,365.35452,16.541386,,,,,2000,8,3
8,476526.280703,444.979994,12.363733,,,,,2000,9,3
9,527128.002179,415.364976,12.496441,,,,,2000,10,4


In [16]:
df = df.dropna().reset_index(drop=True)

In [17]:
df.head(10)

Unnamed: 0,Revenue,Exchange_Rate_NGN_USD,Inflation_Rate_Percent,Revenue_Y-1,Exchange_Rate_Y-1,Inflation_Rate_Y-1,Revenue_Growth(%),Year,Month,Quarter
0,512098.113578,358.550249,13.660672,524835.707651,414.049593,12.471229,-2.426968,2001,1,1
1,404335.987767,371.990948,10.287832,493086.784941,368.865024,13.54173,-17.999022,2001,2,1
2,413754.108374,437.36468,12.143132,532384.426905,389.593887,9.042828,-22.28283,2001,3,1
3,471885.623538,430.518513,11.044685,576151.49282,375.349953,14.287508,-18.096954,2001,4,2
4,449358.443983,398.95492,12.95796,488292.331264,370.531762,12.676993,-7.973479,2001,5,2
5,515712.36663,405.866369,12.667324,488293.152153,442.480105,11.169424,5.615318,2001,6,2
6,454598.796224,463.883245,14.07508,578960.640775,417.850774,13.265564,-21.48019,2001,7,3
7,429384.814933,370.421431,10.979967,538371.736458,365.35452,16.541386,-20.243804,2001,8,3
8,573282.438446,427.354869,11.46025,476526.280703,444.979994,12.363733,20.304475,2001,9,3
9,488711.184976,389.890367,10.042473,527128.002179,415.364976,12.496441,-7.287948,2001,10,4


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [19]:
X = df.drop(columns=['Revenue']) #
y = df['Revenue'] #target column 

In [20]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Initialize the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 138452734.29494315
R-squared: 0.9118246953711001


In [22]:
X_train

Unnamed: 0,Exchange_Rate_NGN_USD,Inflation_Rate_Percent,Revenue_Y-1,Exchange_Rate_Y-1,Inflation_Rate_Y-1,Revenue_Growth(%),Year,Month,Quarter
258,419.722607,12.578337,522190.971407,428.853606,11.930630,9.550662,2022,7,3
232,396.696010,10.965423,502278.591995,316.529736,14.511512,-12.402164,2020,5,2
33,366.141914,12.942937,447114.453552,429.757851,14.992089,3.778302,2003,10,4
157,348.237884,10.548512,523691.646046,433.483627,8.594833,-11.720342,2014,2,1
148,374.306654,12.512059,526097.078281,481.430777,12.341731,-14.223831,2013,5,2
...,...,...,...,...,...,...,...,...,...
188,484.357082,10.545726,457660.314097,428.827848,14.298547,13.160209,2016,9,3
71,424.899915,11.711279,576901.828323,358.638453,11.891410,-17.821982,2006,12,4
106,346.618979,13.186203,594309.295061,458.972006,10.950959,-6.254009,2009,11,4
270,400.920919,10.895554,572063.664453,419.722607,12.578337,1.265100,2023,7,3
