### 1a) Import the Data and Perform Basic Data Exploration

In [22]:
import pandas as pd


# Load the dataset from the CSV file in the working directory.
df = pd.read_csv('5G_energy_consumption_dataset.csv')

# Preview the first rows.
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None".
df.info()

# Summary statistics for the numeric columns.
print(df.describe())


              Time   BS     Energy      load  ESMODE   TXpower
0  20230101 010000  B_0  64.275037  0.487936     0.0  7.101719
1  20230101 020000  B_0  55.904335  0.344468     0.0  7.101719
2  20230101 030000  B_0  57.698057  0.193766     0.0  7.101719
3  20230101 040000  B_0  55.156951  0.222383     0.0  7.101719
4  20230101 050000  B_0  56.053812  0.175436     0.0  7.101719
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92629 entries, 0 to 92628
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Time     92629 non-null  object 
 1   BS       92629 non-null  object 
 2   Energy   92629 non-null  float64
 3   load     92629 non-null  float64
 4   ESMODE   92629 non-null  float64
 5   TXpower  92629 non-null  float64
dtypes: float64(4), object(2)
memory usage: 4.2+ MB
None
             Energy          load        ESMODE       TXpower
count  92629.000000  92629.000000  92629.000000  92629.000000
mean      28.138997      0.24

### 1b) Create a Pandas Profiling Report
Install the profiling library first:

- `pip install pandas-profiling`

If you encounter an error, try upgrading its dependencies:

- `pip install --upgrade numba pandas-profiling visions`

In [23]:
# NOTE(review): every line in this cell is commented out, so nothing runs here
# and no "data_profile.html" file is written. Uncomment the lines below to
# generate the pandas-profiling report, or delete the cell if it is not needed.
# import pandas_profiling as pp

# # Create a profile report
# profile = pp.ProfileReport(df, title='Pandas Profiling Report', explorative=True)
# profile.to_file("data_profile.html")


In [24]:

import sweetviz as sv

# Run Sweetviz's analysis over the whole frame.
report = sv.analyze(source=df)

# Write the report to an HTML file in the working directory.
report.show_html(filepath='sweetviz_report.html')

                                             |          | [  0%]   00:00 -> (? left)

Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


### 1c) Handle Missing and Corrupted Values

In [25]:
# Count missing values per column before cleaning.
print(df.isnull().sum())

# dropna() returns a new frame rather than modifying df, so the result must be
# assigned back; a bare df.dropna() call discards the cleaned frame.
df = df.dropna()

# Checking if any value in 'Energy' column is negative
print(df[df['Energy'] < 0])

Time       0
BS         0
Energy     0
load       0
ESMODE     0
TXpower    0
dtype: int64
Empty DataFrame
Columns: [Time, BS, Energy, load, ESMODE, TXpower]
Index: []


### 1d) Remove Duplicates:

In [26]:
# Remove fully duplicated rows (first occurrence is kept). Rebinding df instead
# of using inplace=True keeps the step a plain expression that can be chained
# and avoids mutating an object other cells may still reference.
df = df.drop_duplicates()

# Number of duplicated rows still present after the drop.
print(df.duplicated().sum())

0


### 1e) Handle Outliers:

In [27]:
from scipy import stats

# Z-score of every 'Energy' value, held in a standalone Series aligned to df's
# index. Computing it outside the frame means no scratch column has to be added
# to df and dropped again afterwards.
energy_z = pd.Series(stats.zscore(df['Energy']), index=df.index)

# Keep only rows whose |z| is at most 3. The explicit .copy() makes the result
# an independent frame, so later in-place edits do not operate on a slice of
# the unfiltered data (a common source of SettingWithCopyWarning).
df = df.loc[energy_z.abs() <= 3].copy()

### 1f) Encode Categorical Features

In [28]:
# One-hot encode the 'BS' column. drop_first=True omits the indicator column of
# the first category.
df = pd.get_dummies(
    data=df,
    columns=['BS'],
    drop_first=True,
)


              Time     Energy      load  ESMODE   TXpower  BS_B_1  BS_B_10  \
0  20230101 010000  64.275037  0.487936     0.0  7.101719   False    False   
1  20230101 020000  55.904335  0.344468     0.0  7.101719   False    False   
2  20230101 030000  57.698057  0.193766     0.0  7.101719   False    False   
3  20230101 040000  55.156951  0.222383     0.0  7.101719   False    False   
4  20230101 050000  56.053812  0.175436     0.0  7.101719   False    False   

   BS_B_100  BS_B_1003  BS_B_1004  ...  BS_B_990  BS_B_991  BS_B_992  \
0     False      False      False  ...     False     False     False   
1     False      False      False  ...     False     False     False   
2     False      False      False  ...     False     False     False   
3     False      False      False  ...     False     False     False   
4     False      False      False  ...     False     False     False   

   BS_B_993  BS_B_994  BS_B_995  BS_B_996  BS_B_997  BS_B_998  BS_B_999  
0     False     False   

### 2) Select Target Variable and Features:

In [29]:

# Target: the 'Energy' column.
y = df['Energy']

# Features: every remaining column except the target and the 'Time' column.
X = df.drop(['Energy', "Time"], axis=1)

### 3) Split the Dataset into Training and Test Sets

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### 4) Select a Regression Algorithm and Train the Model

In [31]:
from sklearn.linear_model import LinearRegression

# Create the linear regression estimator and fit it on the training split in
# one step (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell's output.
model

### 5) Assess Model Performance

In [32]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the held-out test features.
y_pred = model.predict(X_test)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics, one per line.
print(
    f'Mean Squared Error: {mse}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 9.467672783418234
R^2 Score: 0.945325404252824


### 6) Discuss Alternative Ways to Improve Model Performance:
- **Feature Engineering**: Create new features or transform existing ones.
- **Feature Selection**: Use techniques to select the most relevant features.
- **Hyperparameter Tuning**: Adjust model parameters for better performance.
- **Different Algorithms**: Try other regression algorithms like Random Forest or Gradient Boosting.
- **Cross-Validation**: Use cross-validation to ensure your model is robust.