In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

# Step 1: Fetch the weather dataset from OpenML
weather = fetch_openml(data_id=45104, as_frame=True)  # Daily Weather Dataset

# Load the dataframe
data = weather.frame

# Step 2: Identify temperature column
# Print columns to understand dataset structure
print("Dataset Columns:\n", data.columns)

# Common temperature column names (adjust based on dataset)
temp_candidates = ['Temperature', 'Temp', 'Tmax', 'temperature', 'temp']

# Select the first temperature-related column found
temp_col = None
for col in temp_candidates:
    if col in data.columns:
        temp_col = col
        break

# Fallback: if no temperature column found, pick the first numeric column
if temp_col is None:
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        temp_col = numeric_cols[0]
    else:
        raise ValueError("No suitable temperature or numeric column found.")

print(f"Using temperature column: {temp_col}")

# Step 3: Prepare the dataset
# If 'Date' column exists, convert it to datetime and sort, else create dummy date
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'], errors='coerce')
    data = data.sort_values('Date').reset_index(drop=True)
else:
    data = data.reset_index(drop=True)
    data['Date'] = pd.date_range(start='2000-01-01', periods=len(data))

# Step 4: Feature engineering
data['Temp_Lag1'] = data[temp_col].shift(1)
data['Temp_Lag7'] = data[temp_col].shift(7)
data['MA_7'] = data[temp_col].rolling(window=7).mean()
data['MA_30'] = data[temp_col].rolling(window=30).mean()

# Target variable: next day's temperature
data['Target'] = data[temp_col].shift(-1)

# Remove rows with NaN values caused by shifts and rolling windows
data = data.dropna().reset_index(drop=True)

# Step 5: Define features and target for model training
features = [temp_col, 'Temp_Lag1', 'Temp_Lag7', 'MA_7', 'MA_30']
X = data[features]
y = data['Target']

# Step 6: Split data into train and test sets (no shuffle to preserve time series order)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Step 7: Train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 8: Predict on test data
predictions = model.predict(X_test)

# Step 9: Evaluate the model
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

# Step 10: Plot actual vs predicted temperature
plt.figure(figsize=(14,6))
plt.plot(y_test.index, y_test, label='Actual Temperature', color='blue')
plt.plot(y_test.index, predictions, label='Predicted Temperature', linestyle='--', color='red')
plt.xlabel('Sample Index')
plt.ylabel('Temperature')
plt.title('Actual vs Predicted Temperature')
plt.legend()
plt.grid(True)
plt.show()


Dataset Columns:
 Index(['FILE_NAME', 'CATEGORY', 'SUPER_CATEGORY'], dtype='object')
Using temperature column: SUPER_CATEGORY


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.