In [1]:
import pandas as pd

from datetime import datetime, timedelta

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the dataset
file_path = './train_data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()

Unnamed: 0,date,item_name,total_amount_sold,avg_bottle_price,total_volume_sold_liters,total_sale_dollars,avg_bottle_profit,day_of_week,week_of_year,month,year,ma7_total_amount_sold,ma7_avg_bottle_price,ma7_total_sale_dollars,ma30_total_amount_sold,ma30_avg_bottle_price,ma30_total_sale_dollars
0,2021-01-04,1800 ANEJO,13,33.62,9.75,437.06,11.21,2,1,1,2021,13.0,33.62,437.06,13.0,33.62,437.06
1,2021-01-05,1800 ANEJO,4,33.62,3.0,134.48,11.21,3,1,1,2021,8.5,33.62,285.77,8.5,33.62,285.77
2,2021-01-06,1800 ANEJO,2,33.62,1.5,67.24,11.21,4,1,1,2021,6.333333,33.62,212.926667,6.333333,33.62,212.926667
3,2021-01-11,1800 ANEJO,3,33.62,2.25,100.86,11.21,2,2,1,2021,5.5,33.62,184.91,5.5,33.62,184.91
4,2021-01-12,1800 ANEJO,7,33.62,5.25,235.34,11.21,3,2,1,2021,5.8,33.62,194.996,5.8,33.62,194.996


In [3]:
# Handle missing values
data.fillna(method='ffill', inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
data['item_name'] = label_encoder.fit_transform(data['item_name'])

# Features and target variable
target = data['total_amount_sold']
features = data.drop(columns=['date', 'total_amount_sold'])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

# Create a pipeline with scaling and the regressor
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Validate the model
y_pred = pipeline.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)

mae

8.089479456013873

In [None]:
# Create a new dataset for the next 7 days
last_date = pd.to_datetime(data['date'].max())
next_dates = [last_date + timedelta(days=i) for i in range(1, 8)]

# Use the most recent data for the other features
latest_data = data.iloc[-1:].copy()
forecast_data = pd.DataFrame()

for date in next_dates:
    new_data = latest_data.copy()
    new_data['date'] = date
    new_data['day_of_week'] = date.dayofweek
    new_data['week_of_year'] = date.isocalendar()[1]
    new_data['month'] = date.month
    new_data['year'] = date.year
    forecast_data = pd.concat([forecast_data, new_data], ignore_index=True)

# Drop the target variable
forecast_features = forecast_data.drop(['date', 'total_amount_sold'], axis=1)

# Predict the total amount sold for the next 7 days
forecast_predictions = pipeline.predict(forecast_features)

# Add the predictions to the forecast data
forecast_data['total_amount_sold'] = forecast_predictions

forecast_data[['date', 'total_amount_sold']]