In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import chardet

In [2]:
def read_csv_detect_encoding(path, **read_csv_kwargs):
    """Load a CSV file whose text encoding is not known in advance.

    The file is read once as raw bytes and passed to ``chardet.detect``; the
    encoding it reports is then handed to ``pandas.read_csv``.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    **read_csv_kwargs
        Extra keyword arguments forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``path``.
    """
    with open(path, 'rb') as f:
        detected = chardet.detect(f.read())
    return pd.read_csv(path, encoding=detected['encoding'], **read_csv_kwargs)


# One path per dataset; all three are loaded through the same helper so the
# encoding-detection logic lives in exactly one place.
file_path = 'IRENA_RenewableEnergy_Statistics_2000-2022.csv'
file_path_1 = 'organised_Gen.csv'
file_path_2 = '02 modern-renewable-energy-consumption.csv'

df_irena = read_csv_detect_encoding(file_path)
df_us_data = read_csv_detect_encoding(file_path_1)
df_world_data = read_csv_detect_encoding(file_path_2)

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

In [29]:
# Regularised linear baselines: predict electricity generation from the year
# and the categorical descriptors of each record, then compare Ridge, Lasso
# and ElasticNet on a held-out 20% split.
TARGET_COL = 'Electricity Generation (GWh)'

# Rows without a target value cannot be used for supervised training.
df = df_irena.dropna(subset=[TARGET_COL])

y = df[TARGET_COL]
# Installed capacity is removed together with the target, so it is never
# offered to the models as a feature.
X = df.drop(columns=[TARGET_COL, 'Electricity Installed Capacity (MW)'])

categorical_cols = ['Region', 'Sub-region', 'Country', 'RE or Non-RE', 'Group Technology', 'Technology', 'Producer Type']
numerical_cols = ['Year']


def build_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature columns.

    ``numerical_cols`` are standardised and ``categorical_cols`` are one-hot
    encoded; categories not seen during fitting are ignored at predict time
    (``handle_unknown='ignore'``).
    """
    return ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ]
    )


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): the saved output of this cell ends with a warning raised from
# sklearn's coordinate-descent solver (`sparse_enet_coordinate_descent`) --
# presumably a ConvergenceWarning from Lasso/ElasticNet. Confirm, and consider
# raising `max_iter` before trusting those two models' metrics.
models = {
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=1.0),
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5)
}

results = {}

for name, model in models.items():
    # Pipeline.fit fits its steps in place, so each pipeline gets its own
    # preprocessor; sharing a single instance would make every model stored
    # in `results` alias the same transformer object.
    preprocessor = build_preprocessor()
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results[name] = {
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2': r2_score(y_test, y_pred),
        'Model': pipeline
    }

# After the loop `pipeline` / `preprocessor` still refer to the last entry of
# `models` (ElasticNet); use results[<name>]['Model'] to pick a model explicitly.

# Summary table: one column per model, rows are the rounded metrics.
results_df = pd.DataFrame({
    model_name: {
        'RMSE': round(metrics['RMSE'], 2),
        'R2': round(metrics['R2'], 4)
    } for model_name, metrics in results.items()
})


  model = cd_fast.sparse_enet_coordinate_descent(


In [30]:
# Display the rounded RMSE / R2 comparison table (one column per fitted model).
results_df

Unnamed: 0,Ridge,Lasso,ElasticNet
RMSE,137616.78,137608.09,148136.42
R2,0.1586,0.1587,0.025


In [32]:
# Recover the feature names seen by the regressor, in the column order the
# ColumnTransformer emits them: numeric columns first, then the one-hot columns.
#
# The pipeline is looked up by name instead of through the `pipeline` loop
# variable left over from the training cell, which only pointed at ElasticNet
# because that model happened to be trained last.
inspected_pipeline = results['ElasticNet']['Model']
preprocessor = inspected_pipeline.named_steps['preprocessor']

cat = preprocessor.named_transformers_['cat']
encoded_cat_features = cat.get_feature_names_out(categorical_cols)
all_features = numerical_cols + list(encoded_cat_features)

In [36]:
# Rank features by absolute coefficient size for the ElasticNet model.
# The model is selected explicitly from `results`; previously this read the
# `pipeline` loop variable leaked from the training cell, which made it
# unclear which model the coefficients belonged to.
coefs = results['ElasticNet']['Model'].named_steps['regressor'].coef_

# `all_features` must list the transformed columns in the same order as `coefs`.
coef_df = pd.Series(coefs, index=all_features).sort_values(key=abs, ascending=False)
coef_df.head(10)

Technology_Coal and peat                  6405.716291
Sub-region_Eastern Asia                   5027.537306
Sub-region_Northern America               4910.134703
Country_United States of America (the)    4790.499689
Technology_Oil                           -4661.517155
RE or Non-RE_Total Non-Renewable          4559.012542
RE or Non-RE_Total Renewable             -4559.008672
Country_China                             4290.917111
Group Technology_Fossil fuels             4013.805721
Region_Asia                               3318.726788
dtype: float64