In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# seaborn is used for all grouped boxplots in this cell.
import seaborn as sns

# Read the training CSV and parse the date column into datetimes.
file_path = "/kaggle/input/playground-series-s3e19/train.csv"
train_data = pd.read_csv(file_path)
train_data['date'] = pd.to_datetime(train_data['date'])

# Display the head, summary statistics, and structure of the data.
print(train_data.head())
print(train_data.describe())
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
train_data.info()

# Check for missing values per column.
missing_values = train_data.isnull().sum()
print(missing_values)

# Horizontal boxplots of units sold, actually grouped by country and by store
# (one box per group) so that each panel matches its title.
fig, axes = plt.subplots(1, 2, figsize=(10, 6))
for ax, group_col, group_label in zip(axes, ['country', 'store'], ['Country', 'Store']):
    sns.boxplot(data=train_data, x='num_sold', y=group_col, ax=ax)
    ax.set_xlabel('Number of Units Sold')
    ax.set_ylabel(group_label)
    ax.set_title(f'Boxplot by {group_label}')
fig.tight_layout()
plt.show()

# Grouped boxplot: one cluster per country, one box per product within it.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='country', y='num_sold', hue='product', data=train_data, ax=ax)
ax.set_title('Boxplot of Number of Units Sold by Country and Product')
ax.set_xlabel('Country')
ax.set_ylabel('Number of Units Sold')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Summary statistics of num_sold for each product, store, and country.
summary_funcs = ['mean', 'median', 'max', 'min']

grouped_data_product = train_data.groupby('product')
summary_stats_product = grouped_data_product['num_sold'].agg(summary_funcs)
print(summary_stats_product)

grouped_data_store = train_data.groupby('store')
summary_stats_store = grouped_data_store['num_sold'].agg(summary_funcs)
print(summary_stats_store)

grouped_data_country = train_data.groupby('country')
summary_stats_country = grouped_data_country['num_sold'].agg(summary_funcs)
print(summary_stats_country)

# Work on a copy that carries a single "country_product_store" key column.
train_data_combined = train_data.assign(
    combination=lambda frame: frame['country'] + '_' + frame['product'] + '_' + frame['store']
)

# Wide layout: one row per date, one column per combination, summed num_sold.
train_data_wide = train_data_combined.pivot_table(
    values='num_sold',
    index='date',
    columns='combination',
    aggfunc='sum',
)
print(train_data_wide)

# Total units sold per date across all combinations. The name
# train_data_combined is reused for this (date, sum_num) frame.
# drop_duplicates() is retained from the original flow; dates come from the
# pivot index, so it is not expected to remove any rows.
train_data_combined = (
    train_data_wide
    .sum(axis=1)
    .reset_index(name='sum_num')
    .drop_duplicates()
)

# Line plot of the daily total.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(train_data_combined['date'], train_data_combined['sum_num'])
ax.set_title('Time Series of Total Number of Units Sold')
ax.set_xlabel('Date')
ax.set_ylabel('Total Number of Units Sold')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Calendar features for the training set, computed with vectorized pandas
# operations (no row-wise DataFrame.apply).
train_dates = pd.to_datetime(train_data['date'])

# Calendar year of each row.
train_data['year'] = train_dates.dt.year

# Zero-based day of year with leap days ignored: in a leap year every day
# after Feb 28 (raw day-of-year > 59) is shifted back by one, so Feb 29 shares
# the index of Feb 28 and Mar 1 onward line up with non-leap years (0..364).
# dt.is_leap_year applies the full Gregorian rule; it agrees with the simpler
# "year % 4 == 0" test for every year from 1901 through 2099.
raw_day_of_year = train_dates.dt.dayofyear
after_feb28_in_leap_year = train_dates.dt.is_leap_year & (raw_day_of_year > 59)
train_data['day_of_year'] = raw_day_of_year - 1 - after_feb28_in_leap_year.astype(int)

# Take out the COVID effect: in 2020, adjusted days 51..199 are all pinned
# to day 50; every other row keeps its day_of_year unchanged.
in_covid_window = (
    (train_data['year'] == 2020)
    & (train_data['day_of_year'] > 50)
    & (train_data['day_of_year'] < 200)
)
train_data['day_of_year_covid_adjusted'] = train_data['day_of_year'].mask(in_covid_window, 50)

# Daily sales by product has a different pattern on odd and even years.
# NOTE: despite its name this flag is 1 for ODD years and 0 for even years;
# the column name is kept because later cells select it by this name.
train_data['year_even'] = train_data['year'] % 2

# Not adding holidays to this version

# Weekday name (e.g. "Monday") of each date.
train_data['weekday'] = train_dates.dt.day_name()

# Preview the updated DataFrame (first rows only).
print(train_data.head())

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Requires train_data with the 'day_of_year_covid_adjusted' and 'year_even'
# columns created by the feature-engineering cell above.

# Create time-based features (parse the date column once and reuse it).
train_dates = pd.to_datetime(train_data['date'])
train_data['day_of_week'] = train_dates.dt.dayofweek
train_data['month'] = train_dates.dt.month
train_data['year'] = train_dates.dt.year

# Separate features and target variable.
# NOTE(review): every feature is derived from the date alone; country, store
# and product are not model inputs, so rows sharing a date share a prediction.
X = train_data[['day_of_week', 'month', 'year', 'day_of_year_covid_adjusted', 'year_even']]
y = train_data['num_sold']

# Split the data into training (80%) and validation (20%) sets.
# NOTE(review): train_test_split shuffles rows by default, so validation rows
# are interleaved in time with training rows; consider a date-based holdout
# if the score should reflect forecasting future dates.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the Random Forest model (fixed seed for reproducibility).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the validation set.
y_val_pred = rf_model.predict(X_val)

# Evaluate the model on the validation set. RMSE is taken as the square root
# of the MSE rather than via mean_squared_error(..., squared=False): the
# `squared` argument is deprecated and absent from recent scikit-learn releases.
rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
print("Validation RMSE:", rmse)

In [None]:
# Load the test set and parse its date column.
file_path = "/kaggle/input/playground-series-s3e19/test.csv"
test_data = pd.read_csv(file_path)
test_data['date'] = pd.to_datetime(test_data['date'])

# Create the same time-based features as for the training set, using
# vectorized pandas operations (no row-wise DataFrame.apply).
test_dates = test_data['date']

# Calendar year of each row.
test_data['year'] = test_dates.dt.year

# Zero-based day of year with leap days ignored: in a leap year every day
# after Feb 28 (raw day-of-year > 59) is shifted back by one, so Feb 29 shares
# the index of Feb 28 and Mar 1 onward line up with non-leap years (0..364).
# dt.is_leap_year applies the full Gregorian rule; it agrees with the simpler
# "year % 4 == 0" test for every year from 1901 through 2099.
raw_day_of_year = test_dates.dt.dayofyear
after_feb28_in_leap_year = test_dates.dt.is_leap_year & (raw_day_of_year > 59)
test_data['day_of_year'] = raw_day_of_year - 1 - after_feb28_in_leap_year.astype(int)

# Take out the COVID effect: in 2020, adjusted days 51..199 are all pinned
# to day 50; every other row keeps its day_of_year unchanged.
in_covid_window = (
    (test_data['year'] == 2020)
    & (test_data['day_of_year'] > 50)
    & (test_data['day_of_year'] < 200)
)
test_data['day_of_year_covid_adjusted'] = test_data['day_of_year'].mask(in_covid_window, 50)

# Daily sales by product has a different pattern on odd and even years.
# NOTE: despite its name this flag is 1 for ODD years and 0 for even years.
test_data['year_even'] = test_data['year'] % 2

# Not adding holidays to this version

# Weekday name, numeric weekday (Monday=0) and month of each date.
test_data['weekday'] = test_dates.dt.day_name()
test_data['day_of_week'] = test_dates.dt.dayofweek
test_data['month'] = test_dates.dt.month

# Extract features for the test set; the column order matches the order the
# model was fitted with.
X_test = test_data[['day_of_week', 'month', 'year', 'day_of_year_covid_adjusted', 'year_even']]

# Make predictions on the test set with the fitted random forest.
test_data['num_sold'] = rf_model.predict(X_test)

# Plot the predicted values against the date.
plt.figure(figsize=(10, 6))
plt.plot(test_data['date'], test_data['num_sold'], label='Predicted', color='blue')
plt.xlabel('Date')
plt.ylabel('Predicted Number of Units Sold')
plt.title('Predicted Values for the Test Set')
plt.legend()
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Read the sample submission file, which provides the expected output layout.
sample_submission = pd.read_csv('/kaggle/input/playground-series-s3e19/sample_submission.csv')

# The assignment below aligns on the row index, so a row-count mismatch would
# silently drop predictions or leave NaN targets. Fail loudly instead.
# NOTE(review): assumes sample_submission rows are in the same order as
# test.csv — confirm against the competition files.
if len(sample_submission) != len(test_data):
    raise ValueError(
        f"sample_submission has {len(sample_submission)} rows but "
        f"test_data has {len(test_data)} rows"
    )

# Replace the 'num_sold' column in the sample submission with the predicted
# values from 'test_data'.
sample_submission['num_sold'] = test_data['num_sold']

# Guard against index misalignment leaving unfilled targets.
if sample_submission['num_sold'].isna().any():
    raise ValueError("submission contains missing num_sold values after assignment")

# Save the updated sample_submission DataFrame to a new CSV file for submission.
sample_submission.to_csv('submission.csv', index=False)