In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Read the training and test tables from CSV files in the working directory.
# The generator is consumed left to right, so train.csv is read before test.csv.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Perform data preprocessing

# Calendar features derived from the 'date' column, in the order they are appended.
DATE_FEATURES = ('year', 'month', 'day', 'dayofweek')


def add_date_features(frame):
    """Return `frame` with its 'date' column replaced by calendar features.

    The 'date' column is parsed with `pd.to_datetime`, then removed, and the
    integer columns 'year', 'month', 'day' and 'dayofweek' are appended (in
    that order) after the remaining columns. The input frame is not modified.

    If `frame` has no 'date' column but already contains every calendar
    feature, it is returned unchanged, so re-running this cell on frames it
    has already converted is harmless.

    Raises:
        KeyError: if 'date' is missing and the calendar features are not all
            present.
    """
    already_converted = 'date' not in frame.columns and all(
        feature in frame.columns for feature in DATE_FEATURES
    )
    if already_converted:
        return frame

    dates = pd.to_datetime(frame['date'])
    return frame.drop(columns=['date']).assign(
        year=dates.dt.year,
        month=dates.dt.month,
        day=dates.dt.day,
        dayofweek=dates.dt.dayofweek,
    )


# Apply the same transformation to both frames so their feature columns match.
train_df = add_date_features(train_df)
test_df = add_date_features(test_df)

# Split the data into features and target variable
X = train_df.drop(columns='sales')
y = train_df['sales']

# Split the data into training and validation sets (80% / 20%, seeded).
# NOTE(review): this is a shuffled split, so validation rows may fall between
# training rows in time. If test.csv covers dates after train.csv, a
# date-ordered holdout would give a more honest RMSE - confirm.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Fit a random forest regressor on the training split.
# Hyperparameters are collected in one place; the seed keeps the fit repeatable.
forest_params = {'n_estimators': 10, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

In [7]:
# Make predictions on the validation set
y_pred = model.predict(X_val)

# Evaluate the model using Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Validation RMSE:", rmse)

# Make predictions on the test set.
# Select exactly the columns the model was fitted on, in the same order, so
# that any test-only column (e.g. an identifier) is never fed to the model
# and a missing feature fails immediately with a KeyError instead of relying
# on test_df happening to have the right layout.
test_features = test_df[X_train.columns]
test_pred = model.predict(test_features)

# Prepare submission file: overwrite the 'sales' column of the sample
# submission with the predictions and write it out without the index.
# NOTE(review): assumes sample_submission.csv has one row per test.csv row,
# in the same order - confirm.
submission = pd.read_csv('sample_submission.csv')
submission['sales'] = test_pred
submission.to_csv('submission.csv', index=False)

Validation RMSE: 8.453601084945717
