In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import chardet

In [4]:
file_path = 'IRENA_RenewableEnergy_Statistics_2000-2022.csv'

with open(file_path, 'rb') as f:
    result = chardet.detect(f.read())

df_irena = pd.read_csv(file_path, encoding=result['encoding'])

file_path_1 = 'organised_Gen.csv'

with open(file_path_1, 'rb') as f:
    result = chardet.detect(f.read())

df_us_data = pd.read_csv(file_path_1, encoding=result['encoding'])

file_path_2 = '02 modern-renewable-energy-consumption.csv'

with open(file_path_2, 'rb') as f:
    result = chardet.detect(f.read())

df_world_data = pd.read_csv(file_path_2, encoding=result['encoding'])

In [5]:
df_world_filtered = df_world_data[df_world_data['Entity'] != 'World']
df_countries = df_world_filtered[df_world_filtered['Code'].notna()]

In [6]:
df_countries.describe()

Unnamed: 0,Year,Geo Biomass Other - TWh,Solar Generation - TWh,Wind Generation - TWh,Hydro Generation - TWh
count,4242.0,4173.0,4173.0,4173.0,4235.0
mean,1994.10396,2.754773,1.133695,3.091469,30.258398
std,16.243789,9.494644,10.074546,22.486682,84.752667
min,1965.0,0.0,0.0,0.0,0.0
25%,1980.0,0.0,0.0,0.0,0.41
50%,1995.0,0.038,0.0,0.0,4.517
75%,2008.0,1.210555,0.00527,0.062,21.65819
max,2021.0,169.93158,327.0,655.6,1321.709


In [7]:
df_countries['High_Solar'] = (df_countries['Solar Generation - TWh'] > 5).astype(int)

feature_cols = ['Geo Biomass Other - TWh', 'Wind Generation - TWh', 'Hydro Generation - TWh']
df_model = df_countries.dropna(subset=feature_cols + ['High_Solar'])

X = df_model[feature_cols]
y = df_model['High_Solar']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=False)

print(model.coef_)
print(model.intercept_)

[[ 0.76170765  2.83650004 -1.32446809]]
[-4.19424145]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_countries['High_Solar'] = (df_countries['Solar Generation - TWh'] > 5).astype(int)


In [8]:
print(report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       802
           1       0.89      0.50      0.64        32

    accuracy                           0.98       834
   macro avg       0.93      0.75      0.81       834
weighted avg       0.98      0.98      0.98       834



In [9]:
from sklearn.utils import resample

df_combined = df_model[feature_cols + ['High_Solar']]

df_majority = df_combined[df_combined.High_Solar == 0]
df_minority = df_combined[df_combined.High_Solar == 1]

df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_upsampled = pd.concat([df_majority, df_minority_upsampled])

X_up = df_upsampled[feature_cols]
y_up = df_upsampled['High_Solar']

X_up_scaled = scaler.fit_transform(X_up)

X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(X_up_scaled, y_up, test_size=0.2, random_state=42)

model_up = LogisticRegression()
model_up.fit(X_train_up, y_train_up)

y_pred_up = model_up.predict(X_test_up)
report_up = classification_report(y_test_up, y_pred_up, output_dict=False)

print(model_up.coef_)
print(model_up.intercept_)

[[ 3.44507234 14.33727659 -3.07089646]]
[4.91482229]


In [10]:
print(report_up)

              precision    recall  f1-score   support

           0       0.89      0.98      0.93       795
           1       0.98      0.88      0.93       816

    accuracy                           0.93      1611
   macro avg       0.94      0.93      0.93      1611
weighted avg       0.94      0.93      0.93      1611

