## TSF - Hackathon (v7)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Read the three competition files: the training history, the rows to
# forecast, and the submission template.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Parse the 'Month' text column (month-day-year) into datetimes in every
# frame so the date-based features and comparisons further down work.
train_data = train_data.assign(
    Month=pd.to_datetime(train_data['Month'], format='%m-%d-%Y')
)
test_data = test_data.assign(
    Month=pd.to_datetime(test_data['Month'], format='%m-%d-%Y')
)
sample_submission = sample_submission.assign(
    Month=pd.to_datetime(sample_submission['Month'], format='%m-%d-%Y')
)

# Calendar features derived from the parsed 'Month' timestamps.
# Year and month number are added to both the training and the test frame.
train_data = train_data.assign(
    Year=train_data['Month'].dt.year,
    Month_Num=train_data['Month'].dt.month,
)
test_data = test_data.assign(
    Year=test_data['Month'].dt.year,
    Month_Num=test_data['Month'].dt.month,
)

# Further date-derived columns for the training frame: quarter, day of the
# year, and 0/1 flags for the first / last day of a year.
train_data = train_data.assign(
    Quarter=train_data['Month'].dt.quarter,
    Day_of_Year=train_data['Month'].dt.dayofyear,
    Is_Year_Start=train_data['Month'].dt.is_year_start.astype(int),
    Is_Year_End=train_data['Month'].dt.is_year_end.astype(int),
)

# Rolling statistics of the target over the preceding 3 / 6 / 12 rows.
# The series is shifted by one row before rolling so that every window ends
# at the PREVIOUS row. An unshifted window contains the current row's own
# 'Avg_sunspot_count' (the value being predicted), which leaks the target
# into the features and disagrees with the test-time features, which are
# computed only from months strictly before the month being predicted.
# NOTE(review): assumes rows are in chronological order, one row per month
# -- confirm against train.csv.
past_counts = train_data['Avg_sunspot_count'].shift(1)
train_data['Rolling_Mean_3'] = past_counts.rolling(window=3).mean()
train_data['Rolling_Mean_6'] = past_counts.rolling(window=6).mean()
train_data['Rolling_Mean_12'] = past_counts.rolling(window=12).mean()
train_data['Rolling_Std_3'] = past_counts.rolling(window=3).std()

# Lag features: the target value from 1 to 12 rows earlier
for lag in range(1, 13):
    train_data[f'lag_{lag}'] = train_data['Avg_sunspot_count'].shift(lag)

# The first rows have no complete history (NaN in the lag / rolling columns);
# drop every row that contains a missing value
train_data = train_data.dropna()

# Feature matrix: every column except the timestamp and the target itself
X = train_data.drop(['Month', 'Avg_sunspot_count'], axis=1)
y = train_data['Avg_sunspot_count']

# Hold out the LAST 20% of rows (in their existing order) for validation.
# A shuffled split mixes rows that come after the validation rows into the
# training set; with lag / rolling features built from neighbouring rows that
# makes the validation score look better than a real forecast would be.
# NOTE(review): assumes the rows are in chronological order -- confirm.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=False)

# Gradient-boosted regression trees with a fixed number of boosting rounds
lgb_model = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=1000)

# Fit on the training portion only, so X_valid / y_valid stay unseen.
# NOTE(review): this same model is later used for the test forecast; consider
# refitting on all of X, y after validation so the newest rows are learned too.
lgb_model.fit(X_train, y_train)

# Date-derived features for the test set, mirroring the training frame
test_data['Quarter'] = test_data['Month'].dt.quarter
test_data['Day_of_Year'] = test_data['Month'].dt.dayofyear
test_data['Is_Year_Start'] = test_data['Month'].dt.is_year_start.astype(int)
test_data['Is_Year_End'] = test_data['Month'].dt.is_year_end.astype(int)


def _history_before(month):
    """Return the training 'Avg_sunspot_count' values dated strictly before *month*."""
    return train_data.loc[train_data['Month'] < month, 'Avg_sunspot_count']


def _lagged_value(month, lag):
    """Return the *lag*-th most recent training value before *month*.

    Returns NaN when fewer than *lag* earlier values exist, instead of
    silently returning a value from the wrong position or raising.
    """
    history = _history_before(month)
    return history.iloc[-lag] if len(history) >= lag else np.nan


# Rolling statistics taken from the most recent training values before each
# test month.
# NOTE(review): every test month later than the last training month gets the
# same history-based values, because only training observations are used --
# confirm this is acceptable for multi-step horizons.
test_data['Rolling_Mean_3'] = test_data['Month'].map(lambda m: _history_before(m).tail(3).mean())
test_data['Rolling_Mean_6'] = test_data['Month'].map(lambda m: _history_before(m).tail(6).mean())
test_data['Rolling_Mean_12'] = test_data['Month'].map(lambda m: _history_before(m).tail(12).mean())
# 'Rolling_Std_3' is one of the training features; without this line the
# column-alignment step below would silently set it to 0 for every test row.
test_data['Rolling_Std_3'] = test_data['Month'].map(lambda m: _history_before(m).tail(3).std())

# Lag features: the 1st .. 12th most recent training value before each test month
for lag in range(1, 13):
    test_data[f'lag_{lag}'] = test_data['Month'].map(lambda m, lag=lag: _lagged_value(m, lag))

# Any training feature still absent from the test set is filled with 0, but
# say so explicitly: a constant placeholder is rarely a meaningful value.
missing_cols = set(X.columns) - set(test_data.columns)
if missing_cols:
    print(f"Warning: filling features absent from the test set with 0: {sorted(missing_cols)}")
for col in missing_cols:
    test_data[col] = 0

# Keep exactly the training features, in the training column order
test_data = test_data[X.columns]

# Report both row counts so a size mismatch is visible before predicting
print(f"Test Data Rows: {len(test_data)}")
print(f"Sample Submission Rows: {len(sample_submission)}")

# Forecast the test rows with the fitted LightGBM model
predictions = lgb_model.predict(test_data)

# Only write a submission when there is exactly one prediction per template row
if len(predictions) == len(sample_submission):
    # Reuse the template's 'Month' column and attach the predictions by position.
    # NOTE(review): assumes test.csv rows are in the same order as the template -- confirm.
    final_submission = sample_submission[['Month']].copy()
    final_submission['Avg_sunspot_count'] = predictions

    # 'Month' was parsed to datetime earlier, so without date_format it would
    # be written as ISO dates (YYYY-MM-DD). Write it back in the same
    # month-day-year layout it was read with so the file matches the template.
    submission_file_path = 'lgbm_sunspot_forecast_submission.csv'
    final_submission.to_csv(submission_file_path, index=False, date_format='%m-%d-%Y')

    # Print the path of the submission file
    print(f"Submission file saved at: {submission_file_path}")
else:
    print(f"Error: Number of predictions ({len(predictions)}) does not match number of rows in sample submission ({len(sample_submission)})")

# Score the fitted model on the held-out validation rows: root of the mean
# squared error between the true and the predicted sunspot counts.
valid_predictions = lgb_model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
print(f"Validation RMSE of the LightGBM model: {rmse}")

# Optional visual check: true and predicted values drawn on one axis.
# The x position of each point is its row position within the validation set.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_valid.values, label='Actual', color='blue')
ax.plot(valid_predictions, label='Predicted', color='orange')
ax.set_xlabel('Time')
ax.set_ylabel('Average Sunspot Count')
ax.set_title('Actual vs Predicted Values (Validation Set)')
ax.legend()
plt.show()
