In [None]:
import pandas as pd
import numpy as np
import boto3
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# 1. Read the CSV straight from S3 into a DataFrame.
# pandas opens the s3:// URI itself, so no explicit boto3 client is created
# for this read.
bucket, key = 'harperkatesbucket', 'nasa_power_data.csv'
s3_uri = f's3://{bucket}/{key}'

# Assumes the SageMaker IAM role has read access to the bucket.
df = pd.read_csv(s3_uri)

# 2. Build the prediction target: next-day radiation.
# "rad_nd" on row i holds the ALLSKY_SFC_SW_DWN value of row i + 1. This is
# done by dropping the first row of that column and renumbering from 0, so
# that the side-by-side concat pairs each row with its successor.
nextday = (
    df[["ALLSKY_SFC_SW_DWN"]]
    .rename(columns={"ALLSKY_SFC_SW_DWN": "rad_nd"})
    .iloc[1:]
    .reset_index(drop=True)
)
# The final row has no successor (its rad_nd is NaN after the concat), so it
# is discarded.
datam = pd.concat([df, nextday], axis=1).iloc[:-1]

# 3. Time-based train/test split
# The cut is positional: the first TRAIN_ROWS rows train the model and every
# remaining row is held out. This is only a valid time-based split if the
# rows are in chronological order -- the code does not sort them.
# NOTE(review): 12419 is exactly the number of days from 1984-01-01 through
# 2017-12-31, which lines up with the 2018-01-01 start date the plotting
# step assigns to the test rows. Confirm the CSV really begins on 1984-01-01
# and has one row per day.
TRAIN_ROWS = 12419
FEATURE_COLS = ["ALLSKY_SFC_SW_DWN", "PRECTOTCORR"]
TARGET_COLS = ["rad_nd"]  # kept as a list so y_* stay one-column DataFrames

X_train = datam[FEATURE_COLS].iloc[:TRAIN_ROWS]
X_test = datam[FEATURE_COLS].iloc[TRAIN_ROWS:]
y_train = datam[TARGET_COLS].iloc[:TRAIN_ROWS]
y_test = datam[TARGET_COLS].iloc[TRAIN_ROWS:]

# 4. Fit a linear regression on the training rows and predict the test rows.
# fit() returns the estimator itself, so construction and fitting can be
# chained into one statement.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# 5. Score the predictions against the held-out targets.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the mean squared error
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.2f}")

# 6. Plot predicted vs. actual values for the test rows.
# The test rows are relabelled with consecutive calendar days starting at
# 2018-01-01 so that the x-axis shows dates.
# NOTE(review): this overwrites y_test's index in place, while X_test keeps
# its integer row labels -- index-aligned operations between the two will no
# longer line up after this point.
n_test = len(y_test)
date_index = pd.date_range(start='2018-01-01', periods=n_test, freq='D')
y_test.index = date_index
# y_pred is flattened to 1-D so it can back a Series on the same date index.
y_pred_series = pd.Series(y_pred.flatten(), index=date_index)

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(y_test, label='True', color='blue')
ax.plot(y_pred_series, label='Predicted', color='red', alpha=0.6)
ax.set_title('Predicted vs. Actual Radiation from Linear Regression')
ax.set_xlabel('Date')
ax.set_ylabel('Radiation (MJ/m²/day)')
ax.legend()
plt.show()
