### This notebook focuses on cleaning and preprocessing the dataset to prepare it for analysis and modeling. The steps include handling missing values, encoding categorical variables, and scaling numerical features.

In [17]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [20]:
# Load the raw dataset from disk
data_path = "../data/raw/global_renewable_energy_production.csv"
df = pd.read_csv(data_path)

# df.info() prints its report and returns None, so it is called as a plain
# statement instead of being packed into a tuple (which displayed a stray
# `None` and forced a plain-text repr of the frame). Leaving df.head() as the
# last expression lets the notebook render it as a rich table.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Year                  240 non-null    int64  
 1   Country               240 non-null    object 
 2   SolarEnergy           240 non-null    float64
 3   WindEnergy            240 non-null    float64
 4   HydroEnergy           240 non-null    float64
 5   OtherRenewableEnergy  240 non-null    float64
 6   TotalRenewableEnergy  240 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 13.2+ KB


(   Year Country  SolarEnergy   WindEnergy  HydroEnergy  OtherRenewableEnergy  \
 0  2000     USA   437.086107  1435.928598  1544.389701            319.396318   
 1  2001     USA   240.416776   402.792876   398.742141            439.779266   
 2  2002     USA   641.003511  1120.494351   334.993640            486.459433   
 3  2003     USA   849.198377   476.040844   609.102444            132.532029   
 4  2004     USA   373.818019   882.183361  1034.306532            181.053113   
 
    TotalRenewableEnergy  
 0           3736.800724  
 1           1481.731059  
 2           2582.950935  
 3           2066.873694  
 4           2471.361025  ,
 None)

In [32]:
# Tally the null entries in every column (isna is the canonical spelling of isnull)
msv = df.isna().sum()
print(f"Number of Missing values : {msv} \n")

Number of Missing values : Year                    0
Country                 0
SolarEnergy             0
WindEnergy              0
HydroEnergy             0
OtherRenewableEnergy    0
TotalRenewableEnergy    0
dtype: int64 



In [33]:
# Flag rows that exactly repeat an earlier row, then count the flags
duplicate_mask = df.duplicated()
dup = duplicate_mask.sum()
print(f"Number of Duplicate rows : {dup}")

Number of Duplicate rows : 0


In [35]:
# Show the dtype pandas inferred for each column, under a heading line
print("Data Types :", df.dtypes, sep="\n")

Data Types :
Year                      int64
Country                  object
SolarEnergy             float64
WindEnergy              float64
HydroEnergy             float64
OtherRenewableEnergy    float64
TotalRenewableEnergy    float64
dtype: object


In [37]:
# Summary statistics for the numeric columns, under a heading line
# NOTE(review): the printed heading is misspelled ("Summray"); it is kept
# as-is here so the cell's output does not change.
print("Summray Statistics:", df.describe(), sep="\n")

Summray Statistics:
              Year  SolarEnergy   WindEnergy  HydroEnergy  \
count   240.000000   240.000000   240.000000   240.000000   
mean   2011.500000   528.523858   857.133260  1076.581975   
std       6.936653   271.183089   375.020314   499.981598   
min    2000.000000   104.555425   206.021630   320.662607   
25%    2005.750000   284.700505   523.572495   593.796081   
50%    2011.500000   533.436429   882.024084  1046.390380   
75%    2017.250000   766.701662  1160.199295  1495.160715   
max    2023.000000   996.973153  1487.070005  1983.858741   

       OtherRenewableEnergy  TotalRenewableEnergy  
count            240.000000            240.000000  
mean             287.127554           2749.366647  
std              128.460792            695.126957  
min               54.876943            910.381025  
25%              176.322725           2250.759951  
50%              291.398276           2815.458943  
75%              405.479393           3217.212712  
max           

In [39]:
# Encoding categorical variables: first list the distinct Country values
country_values = df["Country"].unique()
print("Unique countries in the dataset:")
print(country_values)

Unique countries in the dataset:
['USA' 'China' 'India' 'Germany' 'UK' 'France' 'Brazil' 'Canada'
 'Australia' 'Japan']


In [65]:
# Column groups that the ColumnTransformer will route to each preprocessing pipeline
numerical_features = [
    'SolarEnergy',
    'WindEnergy',
    'HydroEnergy',
    'OtherRenewableEnergy',
    'TotalRenewableEnergy',
]
categorical_features = ['Country']

In [66]:
# Numeric columns: fill gaps with the column mean, then standardize
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

In [67]:
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' is passed so unseen categories do not raise
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [68]:
# Combine preprocessing steps
# sparse_threshold=0 forces a dense ndarray result. OneHotEncoder emits a
# sparse matrix by default, and ColumnTransformer returns a sparse stack
# whenever the overall density falls below the threshold (default 0.3). The
# current data (10 countries, 5 numeric columns) happens to stay above it, but
# a dataset with more categories would switch to sparse output and break the
# later pd.DataFrame(...) conversion.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0,
)

In [69]:
# Apply preprocessing to the dataset
# Fits the imputers, scaler and encoder on df and transforms it in one call.
# Only the columns listed in the ColumnTransformer come back; 'Year' is not
# among them and is re-attached in a later cell.
df_processed = preprocessor.fit_transform(df)

In [70]:
# Rebuild the output column labels: the numeric columns first, followed by the
# one-hot columns, mirroring the order of the transformers in the preprocessor
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
numerical_columns = numerical_features
categorical_columns = onehot_encoder.get_feature_names_out(categorical_features)
all_columns = np.concatenate((numerical_columns, categorical_columns))


In [71]:
# Wrap the transformed values in a DataFrame labelled with the rebuilt column names
df_processed = pd.DataFrame(
    data=df_processed,
    columns=all_columns,
)

In [72]:
# Re-attach the untouched 'Year' values positionally (as a raw array, so no
# index alignment is involved)
df_processed['Year'] = df['Year'].to_numpy()

In [73]:
# Preview the first five rows of the processed frame
df_processed.head(5)

Unnamed: 0,SolarEnergy,WindEnergy,HydroEnergy,OtherRenewableEnergy,TotalRenewableEnergy,Country_Australia,Country_Brazil,Country_Canada,Country_China,Country_France,Country_Germany,Country_India,Country_Japan,Country_UK,Country_USA,Year
0,-0.337886,1.546596,0.937605,0.25172,1.423478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2000
1,-1.064628,-1.214041,-1.358563,1.190797,-1.827414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2001
2,0.415641,0.703726,-1.486331,1.554937,-0.239904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2002
3,1.184973,-1.018315,-0.936947,-1.20596,-0.983877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2003
4,-0.571677,0.066936,-0.084731,-0.82746,-0.400771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2004


In [74]:
# Save the processed data to a new CSV file
processed_data_path = "../data/processed/cleaned_energy_data.csv"
# to_csv does not create missing folders, so make sure the target directory
# exists first; otherwise a fresh checkout without data/processed fails here.
Path(processed_data_path).parent.mkdir(parents=True, exist_ok=True)
df_processed.to_csv(processed_data_path, index=False)
print(f"Processed data saved to {processed_data_path}")

Processed data saved to ../data/processed/cleaned_energy_data.csv


In [75]:
# Recap of the preprocessing steps; the empty first and last entries reproduce
# the blank lines that surround the summary in the output
summary_lines = [
    "",
    "Summary of Data Preprocessing:",
    "1. Checked for missing values and duplicates (none found).",
    "2. Filled missing values where necessary.",
    "3. Encoded categorical variables using OneHotEncoding.",
    "4. Scaled numerical features using StandardScaler.",
    "5. Saved the processed data to a new CSV file.",
    "",
]
print("\n".join(summary_lines))


Summary of Data Preprocessing:
1. Checked for missing values and duplicates (none found).
2. Filled missing values where necessary.
3. Encoded categorical variables using OneHotEncoding.
4. Scaled numerical features using StandardScaler.
5. Saved the processed data to a new CSV file.

