In [1]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import chardet

In [4]:
def read_csv_detect_encoding(path):
    """Read a CSV file whose text encoding is not known in advance.

    The file is read once in binary mode and its bytes are handed to
    ``chardet.detect``; the ``'encoding'`` entry of the detection result is
    then passed to ``pandas.read_csv`` for the actual parse.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``path``.
    """
    # The context manager closes the binary handle before pandas reopens the file.
    with open(path, 'rb') as f:
        detected = chardet.detect(f.read())
    return pd.read_csv(path, encoding=detected['encoding'])


# Input files, resolved against the current working directory.
file_path = 'IRENA_RenewableEnergy_Statistics_2000-2022.csv'
file_path_1 = 'organised_Gen.csv'
file_path_2 = '02 modern-renewable-energy-consumption.csv'

# One loader for all three sources instead of three copy-pasted stanzas.
df_irena = read_csv_detect_encoding(file_path)
df_us_data = read_csv_detect_encoding(file_path_1)
df_world_data = read_csv_detect_encoding(file_path_2)

In [6]:
# Keep only the rows whose ENERGY SOURCE is 'Total' and renumber them from 0.
df_ustotal = (
    df_us_data
    .loc[df_us_data['ENERGY SOURCE'].eq('Total')]
    .reset_index(drop=True)
)
df_ustotal

Unnamed: 0.1,Unnamed: 0,YEAR,MONTH,STATE,TYPE OF PRODUCER,ENERGY SOURCE,GENERATION (Megawatthours)
0,5,2001,1,AK,Total Electric Power Industry,Total,590145.0
1,11,2001,1,AK,"Electric Generators, Electric Utilities",Total,493206.0
2,14,2001,1,AK,"Combined Heat and Power, Electric Power",Total,19955.0
3,17,2001,1,AK,"Combined Heat and Power, Commercial Power",Total,9824.0
4,20,2001,1,AK,"Combined Heat and Power, Industrial Power",Total,67160.0
...,...,...,...,...,...,...,...
73351,10554,2022,5,WV,"Electric Generators, Electric Utilities",Total,3085869.0
73352,10559,2022,5,WY,Total Electric Power Industry,Total,3276247.0
73353,10568,2022,5,WY,"Combined Heat and Power, Industrial Power",Total,106316.0
73354,10574,2022,5,WY,"Electric Generators, Independent Power Producers",Total,455442.0


In [10]:
# Principal-component regression of monthly generation on year, month,
# state and producer type.

# Fixed seed so the train/test split (and PCA, if it selects a randomized
# solver) produce the same r2 / rmse on every Restart & Run All.
RANDOM_STATE = 42

X = df_ustotal[["YEAR", "MONTH", "STATE", "TYPE OF PRODUCER"]]
y = df_ustotal["GENERATION (Megawatthours)"]
# NOTE(review): the displayed rows include 'Total Electric Power Industry'
# alongside individual producer types; presumably that row aggregates the
# others, so the target may be double-counted — confirm before trusting scores.

numeric_features = ["YEAR", "MONTH"]
categorical_features = ["STATE", "TYPE OF PRODUCER"]

# Standardise the numeric columns; one-hot encode the categorical ones,
# dropping the first level of each and returning a dense array for PCA.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first", sparse_output=False), categorical_features),
    ]
)

# NOTE(review): only 10 components of the (mostly one-hot) design matrix are
# kept, which may discard much of the category signal — consider tuning
# n_components; the R^2 reported for this cell is very low.
pcr_pipeline = make_pipeline(
    preprocessor,
    PCA(n_components=10, random_state=RANDOM_STATE),
    LinearRegression()
)

# Hold out 20% of the rows for evaluation, reproducibly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

pcr_pipeline.fit(X_train, y_train)

# Score on the held-out rows: R^2 and root-mean-squared error (in MWh).
y_pred = pcr_pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

r2, rmse

(0.040887531675707356, np.float64(24251453.683595702))