In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import MinMaxScaler
from catboost import CatBoostRegressor
import torch
import torch.nn as nn
import torch.optim as optim

# Root Mean Squared Logarithmic Error
def calculate_rmsle(y_true, y_pred):
    """Return the RMSLE between ``y_true`` and ``y_pred``.

    Both arguments are passed through ``np.log1p`` and then differenced
    element-wise, so they must be array-likes that broadcast against each
    other. The result is the square root of the mean squared log-gap.
    """
    log_gap = np.log1p(y_pred) - np.log1p(y_true)
    return np.sqrt(np.mean(log_gap * log_gap))

# Read each input CSV from the working directory into its own DataFrame.
# The files are read in the order listed and bound to the names on the left.
(
    train,
    test,
    stores,
    oil,
    holidays,
    transactions,
    sample_submission,
) = map(
    pd.read_csv,
    (
        'train.csv',
        'test.csv',
        'stores.csv',
        'oil.csv',
        'holidays_events.csv',
        'transactions.csv',
        'sample_submission.csv',
    ),
)

# Preprocessing - parse dates and keep only training rows from 2016 onwards
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])
# .copy() detaches the filtered rows from the original frame so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
train = train[train['date'] >= '2016-01-01'].copy()

# Holiday flag: True when the row's date appears in the holidays table.
# The holiday dates are parsed to datetimes first so the membership test
# compares datetimes with datetimes instead of depending on how pandas
# handles raw strings, and train and test use the same definition.
holiday_dates = pd.to_datetime(holidays['date'])
train['holiday'] = train['date'].isin(holiday_dates)
test['holiday'] = test['date'].isin(holiday_dates)

# Calendar features derived from the date (weekday: Monday=0 ... Sunday=6)
for frame in (train, test):
    frame['year'] = frame['date'].dt.year
    frame['month'] = frame['date'].dt.month
    frame['day'] = frame['date'].dt.day
    frame['weekday'] = frame['date'].dt.weekday

# The raw date column is not used as a model input once the features exist
train = train.drop(columns=['date'])
test = test.drop(columns=['date'])

# Dummy-encode every object-dtype column found in train (first level dropped).
# The same column list is applied to test so both frames are encoded alike.
object_cols = train.select_dtypes(include=['object']).columns
train, test = (
    pd.get_dummies(frame, columns=object_cols, drop_first=True)
    for frame in (train, test)
)

# Project test onto train's columns: train's column set is kept, and any
# column test does not have is created there and filled with 0.
train, test = train.align(test, axis=1, join='left', fill_value=0)

# Separate the target from the feature matrix
y = train['sales']
X = train.drop(columns=['sales'])

# Model the target on the log1p scale to damp the influence of large values
y_log = np.log1p(y)

# Random 80/20 hold-out split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y_log, random_state=42, test_size=0.2
)

# Min-max scalers are fitted on the training portion only
scaler_X = MinMaxScaler().fit(X_train)
scaler_y = MinMaxScaler().fit(y_train.to_numpy().reshape(-1, 1))

# Scale the features and insert a length-1 sequence axis, giving arrays of
# shape (rows, 1, features) for the LSTM
X_train_scaled = scaler_X.transform(X_train)[:, np.newaxis, :]
X_val_scaled = scaler_X.transform(X_val)[:, np.newaxis, :]

# Scale the targets as single-column 2-D arrays
y_train_scaled = scaler_y.transform(y_train.to_numpy().reshape(-1, 1))
y_val_scaled = scaler_y.transform(y_val.to_numpy().reshape(-1, 1))

# Copy everything into float32 PyTorch tensors
X_train_tensor, X_val_tensor, y_train_tensor, y_val_tensor = (
    torch.tensor(arr, dtype=torch.float32)
    for arr in (X_train_scaled, X_val_scaled, y_train_scaled, y_val_scaled)
)

# Define the LSTM model in PyTorch
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of each LSTM layer's hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run ``x`` of shape (batch, seq_len, input_size) through the network.

        Returns:
            Tensor of shape (batch, output_size) computed from the LSTM output
            at the final time step.
        """
        # The zero initial states follow the input's dtype and device, so the
        # model keeps working after .to(device) / .double() instead of mixing
        # default-float CPU states with a differently placed or typed input.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)
        c0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)
        out, _ = self.lstm(x, (h0, c0))
        # Only the last time step feeds the linear layer
        return self.fc(out[:, -1, :])

# LSTM model hyper-parameters
input_size = X_train_tensor.shape[2]  # number of features per time step
hidden_size = 50
num_layers = 2
output_size = 1

# Seed PyTorch before the model is built so the weight initialisation, and
# therefore the per-epoch log below, is repeatable from run to run.
torch.manual_seed(42)

# Instantiate the LSTM model, define loss and optimizer
lstm_model = LSTMModel(input_size, hidden_size, num_layers, output_size)
loss_function = nn.MSELoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Validation targets mapped back through the target scaler and expm1.
# They do not change between epochs, so they are computed once here instead
# of inside the loop.
y_val_inverse = scaler_y.inverse_transform(y_val_tensor.numpy())
y_val_sales = np.expm1(y_val_inverse)

# Train the LSTM model: each epoch is a single gradient step on the whole
# training tensor, followed by an evaluation on the whole validation tensor.
num_epochs = 50
for epoch in range(num_epochs):
    lstm_model.train()
    optimizer.zero_grad()
    y_train_pred = lstm_model(X_train_tensor)
    loss = loss_function(y_train_pred, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Validation step (no gradients tracked)
    lstm_model.eval()
    with torch.no_grad():
        y_val_pred = lstm_model(X_val_tensor)
        val_loss = loss_function(y_val_pred, y_val_tensor)
        # Undo the target scaling and the log1p transform before scoring
        y_val_pred_inverse = scaler_y.inverse_transform(y_val_pred.numpy())
        val_rmsle = calculate_rmsle(y_val_sales, np.expm1(y_val_pred_inverse))

    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation RMSLE: {val_rmsle}')

# Run the trained LSTM over the train and validation tensors without
# tracking gradients; its outputs become extra feature columns.
with torch.no_grad():
    lstm_train_features, lstm_val_features = (
        lstm_model(batch).numpy() for batch in (X_train_tensor, X_val_tensor)
    )

# Wrap the LSTM outputs in DataFrames with columns lstm_feat_0, lstm_feat_1, ...
n_train_feats = lstm_train_features.shape[1]
n_val_feats = lstm_val_features.shape[1]
train_lstm_df = pd.DataFrame(
    lstm_train_features,
    columns=[f'lstm_feat_{i}' for i in range(n_train_feats)],
)
val_lstm_df = pd.DataFrame(
    lstm_val_features,
    columns=[f'lstm_feat_{i}' for i in range(n_val_feats)],
)

# Place the LSTM columns beside the original features. The feature frames get
# a fresh 0..n-1 index first so the rows line up with the LSTM frames.
X_train_combined = pd.concat(
    (X_train.reset_index(drop=True), train_lstm_df), axis='columns'
)
X_val_combined = pd.concat(
    (X_val.reset_index(drop=True), val_lstm_df), axis='columns'
)

# CatBoost regressor with fixed settings; progress is logged every 100 iterations
catboost_params = {
    'iterations': 1000,
    'depth': 8,
    'learning_rate': 0.1,
    'loss_function': 'RMSE',
    'verbose': 100,
}
catboost_model = CatBoostRegressor(**catboost_params)

# Fit on the log1p-scale target, monitoring the validation set and stopping
# early after 50 rounds without improvement
catboost_model.fit(
    X_train_combined,
    y_train,
    eval_set=(X_val_combined, y_val),
    early_stopping_rounds=50,
)

# Test-set inference. Assumes the previous steps have been run, i.e. that
# scaler_X, lstm_model, catboost_model and X_train already exist.

# Remove any 'sales' column already on the test frame (the zero-filled
# placeholder from the train/test alignment, or predictions written by an
# earlier run of this code); errors='ignore' covers the case where it is absent.
test = test.drop(columns=['sales'], errors='ignore')

# Put the test frame on exactly the training feature columns, in training
# order. Features missing from test are created and filled with 0, and any
# column the model was not trained on is dropped.
test = test.reindex(columns=X_train.columns, fill_value=0)

# Scale with the scaler fitted on the training features, then add the
# length-1 sequence axis the LSTM expects: (rows, 1, features)
X_test_scaled = scaler_X.transform(test)
X_test_scaled = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

# Extract LSTM features for the test data in inference mode, without gradients
lstm_model.eval()
with torch.no_grad():
    lstm_test_features = lstm_model(X_test_tensor).numpy()

# Combine the LSTM features with the original test features
test_lstm_df = pd.DataFrame(lstm_test_features, columns=[f'lstm_feat_{i}' for i in range(lstm_test_features.shape[1])])
X_test_combined = pd.concat([test.reset_index(drop=True), test_lstm_df], axis=1)

# Predict with the CatBoost model (predictions are on the log1p scale)
test_predictions_catboost = catboost_model.predict(X_test_combined)

# Convert back from the log1p scale and floor negative values at zero
test['sales'] = np.clip(np.expm1(test_predictions_catboost), 0, None)

# Write the submission file: one row per test id with its predicted sales
submission = test[['id', 'sales']]
submission.to_csv('submission_new.csv', index=False)
print("Submission file generated.")


Epoch 1/50, Training Loss: 0.08367542177438736, Validation Loss: 0.08112664520740509, Validation RMSLE: 3.3421108722686768
Epoch 2/50, Training Loss: 0.08129365742206573, Validation Loss: 0.07882606983184814, Validation RMSLE: 3.2943825721740723
Epoch 3/50, Training Loss: 0.07899253815412521, Validation Loss: 0.07660184800624847, Validation RMSLE: 3.2475714683532715
Epoch 4/50, Training Loss: 0.07676780223846436, Validation Loss: 0.07444833964109421, Validation RMSLE: 3.201596260070801
Epoch 5/50, Training Loss: 0.07461380958557129, Validation Loss: 0.07236013561487198, Validation RMSLE: 3.156376361846924
Epoch 6/50, Training Loss: 0.07252515107393265, Validation Loss: 0.07033269107341766, Validation RMSLE: 3.1118433475494385
Epoch 7/50, Training Loss: 0.07049726694822311, Validation Loss: 0.06836258620023727, Validation RMSLE: 3.0679502487182617
Epoch 8/50, Training Loss: 0.0685267224907875, Validation Loss: 0.06644774228334427, Validation RMSLE: 3.0246782302856445
Epoch 9/50, Trainin

In [35]:
# Test-set inference. Assumes the previous steps have been run, i.e. that
# scaler_X, lstm_model, catboost_model and X_train already exist.

# Remove any 'sales' column already on the test frame (a placeholder or the
# predictions written by an earlier run of this code); errors='ignore' covers
# the case where it is absent.
test = test.drop(columns=['sales'], errors='ignore')

# Put the test frame on exactly the training feature columns, in training
# order. Features missing from test are created and filled with 0, and any
# column the model was not trained on is dropped.
test = test.reindex(columns=X_train.columns, fill_value=0)

# Scale with the scaler fitted on the training features, then add the
# length-1 sequence axis the LSTM expects: (rows, 1, features)
X_test_scaled = scaler_X.transform(test)
X_test_scaled = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

# Extract LSTM features for the test data in inference mode, without gradients
lstm_model.eval()
with torch.no_grad():
    lstm_test_features = lstm_model(X_test_tensor).numpy()

# Combine the LSTM features with the original test features
test_lstm_df = pd.DataFrame(lstm_test_features, columns=[f'lstm_feat_{i}' for i in range(lstm_test_features.shape[1])])
X_test_combined = pd.concat([test.reset_index(drop=True), test_lstm_df], axis=1)

# Predict with the CatBoost model (predictions are on the log1p scale)
test_predictions_catboost = catboost_model.predict(X_test_combined)

# Convert back from the log1p scale and floor negative values at zero
test['sales'] = np.clip(np.expm1(test_predictions_catboost), 0, None)

# Write the submission file: one row per test id with its predicted sales
submission = test[['id', 'sales']]
submission.to_csv('submission_new.csv', index=False)
print("Submission file generated.")


Submission file generated.
