In [37]:
#Libraries for data manipulation and visualization
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

#Libraries for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#Libraries for models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

#Libraries for evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Read the 5G energy-consumption CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="5G_energy_consumption_dataset.csv")
df.head(n=5)

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower
0,20230101 010000,B_0,64.275037,0.487936,0.0,7.101719
1,20230101 020000,B_0,55.904335,0.344468,0.0,7.101719
2,20230101 030000,B_0,57.698057,0.193766,0.0,7.101719
3,20230101 040000,B_0,55.156951,0.222383,0.0,7.101719
4,20230101 050000,B_0,56.053812,0.175436,0.0,7.101719


In [7]:
# List the distinct base-station identifiers found in the BS column.
pd.unique(df["BS"])

array(['B_0', 'B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8',
       'B_9', 'B_10', 'B_11', 'B_12', 'B_14', 'B_15', 'B_16', 'B_17',
       'B_18', 'B_19', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25',
       'B_26', 'B_27', 'B_28', 'B_29', 'B_30', 'B_31', 'B_32', 'B_33',
       'B_34', 'B_35', 'B_36', 'B_37', 'B_38', 'B_39', 'B_40', 'B_41',
       'B_42', 'B_43', 'B_44', 'B_45', 'B_46', 'B_47', 'B_48', 'B_49',
       'B_50', 'B_51', 'B_52', 'B_53', 'B_54', 'B_55', 'B_56', 'B_57',
       'B_58', 'B_59', 'B_60', 'B_61', 'B_62', 'B_63', 'B_64', 'B_65',
       'B_66', 'B_67', 'B_68', 'B_69', 'B_70', 'B_71', 'B_72', 'B_73',
       'B_74', 'B_75', 'B_76', 'B_77', 'B_78', 'B_79', 'B_80', 'B_81',
       'B_82', 'B_83', 'B_84', 'B_85', 'B_86', 'B_87', 'B_88', 'B_89',
       'B_90', 'B_91', 'B_92', 'B_93', 'B_94', 'B_95', 'B_96', 'B_97',
       'B_98', 'B_99', 'B_100', 'B_101', 'B_102', 'B_103', 'B_104',
       'B_105', 'B_106', 'B_107', 'B_108', 'B_109', 'B_110', 'B_111',
       'B_11

In [9]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92629 entries, 0 to 92628
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Time     92629 non-null  object 
 1   BS       92629 non-null  object 
 2   Energy   92629 non-null  float64
 3   load     92629 non-null  float64
 4   ESMODE   92629 non-null  float64
 5   TXpower  92629 non-null  float64
dtypes: float64(4), object(2)
memory usage: 4.2+ MB


In [3]:
# Build the profiling report object for the dataset; it is rendered in the next cell.
report = ProfileReport(
    df,
    title="Energy Report",
)

In [4]:
# Render the profiling report built above inline in the notebook.
report.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 6/6 [00:00<00:00, 28.45it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Time       0
BS         0
Energy     0
load       0
ESMODE     0
TXpower    0
dtype: int64

In [23]:
# Number of rows that are exact repeats of an earlier row (first occurrence is not counted).
df.duplicated(keep="first").sum()

0

In [8]:
# Instantiate the encoder that will turn the string BS identifiers into integer codes.
# NOTE(review): LabelEncoder is presumably intended for target labels; an
# OrdinalEncoder may be the more conventional choice for a feature column — confirm.
encoder = LabelEncoder()

In [9]:
# Overwrite the string BS identifiers with the integer codes learned by the encoder.
# NOTE(review): the codes presumably follow the sorted order of the string labels
# (so 'B_10' sorts before 'B_2'), not the numeric suffix — confirm this is acceptable.
df["BS"] = encoder.fit_transform(df["BS"])

In [10]:
# Spot-check the first two rows to confirm BS now holds integer codes.
df.head(n=2)

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower
0,20230101 010000,0,64.275037,0.487936,0.0,7.101719
1,20230101 020000,0,55.904335,0.344468,0.0,7.101719


In [13]:
# Remove the Time column (an object/string column) so that only BS, load, ESMODE,
# TXpower and the Energy target remain.
# The frame is rebound instead of mutated with inplace=True, and errors="ignore"
# makes the cell safe to re-run: the in-place version raised KeyError on a second
# execution because "Time" was already gone.
df = df.drop(columns="Time", errors="ignore")

In [14]:
# Preview the first five rows after the Time column has been removed.
df.head(n=5)

Unnamed: 0,BS,Energy,load,ESMODE,TXpower
0,0,64.275037,0.487936,0.0,7.101719
1,0,55.904335,0.344468,0.0,7.101719
2,0,57.698057,0.193766,0.0,7.101719
3,0,55.156951,0.222383,0.0,7.101719
4,0,56.053812,0.175436,0.0,7.101719


In [15]:
# Target: the Energy column. Predictors: every remaining column.
labels = df["Energy"]
features = df.drop(columns="Energy")

In [17]:
# Show the first five rows of the predictors and of the target side by side.
tuple(part.head() for part in (features, labels))

(   BS      load  ESMODE   TXpower
 0   0  0.487936     0.0  7.101719
 1   0  0.344468     0.0  7.101719
 2   0  0.193766     0.0  7.101719
 3   0  0.222383     0.0  7.101719
 4   0  0.175436     0.0  7.101719,
 0    64.275037
 1    55.904335
 2    57.698057
 3    55.156951
 4    56.053812
 Name: Energy, dtype: float64)

### To evaluate the model's performance later, we split the dataset in an 80/20 ratio: 80% for the training set and 20% for the test set. With 92,629 rows, that is 74,103 rows for training and the remaining 18,526 rows for testing.

In [21]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
)

In [27]:
# Display the held-out Energy targets produced by the split (also shows their count).
y_test

88868    43.348281
34761    11.958146
70197    19.133034
54555    30.044843
20789    46.786248
           ...    
55474    26.307922
70529    20.926756
5344     19.581465
12286    42.600897
87357    20.179372
Name: Energy, Length: 18526, dtype: float64

In [28]:
# (rows, columns) of the training and test feature frames, in that order.
tuple(part.shape for part in (x_train, x_test))

((74103, 4), (18526, 4))

In [48]:
# Display the training feature frame (BS, load, ESMODE, TXpower).
x_train

Unnamed: 0,BS,load,ESMODE,TXpower
37876,276,0.035120,0.0,6.427504
83222,729,0.061460,0.0,6.427504
5469,448,0.607400,0.0,6.875934
3102,205,0.741180,0.0,7.100897
63522,533,0.045120,0.0,6.427504
...,...,...,...,...
21243,109,0.481610,0.0,6.875934
45891,357,0.268043,0.0,6.875934
42613,323,0.369820,0.0,6.728027
43567,333,0.708830,0.0,6.875934


### Modeling
Instantiate the model.

In [38]:
# Random forest regressor with default hyperparameters.
# random_state is pinned (same seed as the train/test split) so that the fitted
# forest, and therefore the MAE/MSE/RMSE reported below, are reproducible on re-run;
# without it every run trains a different forest.
rf = RandomForestRegressor(random_state=0)

### Train the model on the training set (x_train, y_train)

In [39]:
# Fit the forest on the training features and their Energy targets.
rf.fit(X=x_train, y=y_train)

In [35]:
# Display the held-out feature frame that the model will be evaluated on.
x_test

Unnamed: 0,BS,load,ESMODE,TXpower
88868,786,0.09967,0.0,6.651719
34761,246,0.01341,0.0,6.875934
70197,599,0.19890,0.0,6.875934
54555,444,0.00834,0.0,6.875934
20789,104,0.03129,0.0,7.101719
...,...,...,...,...
55474,454,0.05432,0.0,6.427504
70529,602,0.13811,0.0,6.875934
5344,436,0.12532,0.0,6.875934
12286,21,0.82421,0.0,6.727205


In [36]:
# Display the held-out Energy targets again, for comparison with the predictions below.
y_test

88868    43.348281
34761    11.958146
70197    19.133034
54555    30.044843
20789    46.786248
           ...    
55474    26.307922
70529    20.926756
5344     19.581465
12286    42.600897
87357    20.179372
Name: Energy, Length: 18526, dtype: float64

In [40]:
# Predict Energy for the held-out rows; the trailing bare name displays the array.
y_pred = rf.predict(X=x_test)
y_pred

array([44.01046338, 12.00597908, 21.68460389, ..., 22.97309417,
       43.17488789, 20.47234679])

### Model Evaluation

In [41]:
# Mean absolute error between the held-out targets and the forest's predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"The mean absolute error of the model is {mae}")

The mean absolute error of the model is 2.026094711497276


In [42]:
# Mean squared error on the test split, and its square root (RMSE, in Energy units).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(
    f"The mean squared error of the model is {mse}",
    f"The root mean squared error of the model is {rmse}",
    sep="\n",
)

The mean squared error of the model is 10.43100836223062
The root mean squared error of the model is 3.2297071635413976


### Ways to Improve the Model's Performance
1. Fine-tuning the model's hyperparameters
2. Using a more sophisticated regression model
3. Using feature engineering to create new features