In [3]:
import pandas as pd

# Load the training data
df = pd.read_csv('../data/train.csv')

# Display the first 5 rows
df.head()

# Get information about the data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   date    913000 non-null  object
 1   store   913000 non-null  int64 
 2   item    913000 non-null  int64 
 3   sales   913000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 27.9+ MB


In [4]:
# Convert 'date' column to datetime objects
df['date'] = pd.to_datetime(df['date'])

# Check the info again to confirm the change
df.info()

# Let's see the date range
print(f"Data ranges from {df['date'].min()} to {df['date'].max()}")# Convert 'date' column to datetime objects
df['date'] = pd.to_datetime(df['date'])

# Check the info again to confirm the change
df.info()

# Let's see the date range
print(f"Data ranges from {df['date'].min()} to {df['date'].max()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   date    913000 non-null  datetime64[ns]
 1   store   913000 non-null  int64         
 2   item    913000 non-null  int64         
 3   sales   913000 non-null  int64         
dtypes: datetime64[ns](1), int64(3)
memory usage: 27.9 MB
Data ranges from 2013-01-01 00:00:00 to 2017-12-31 00:00:00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   date    913000 non-null  datetime64[ns]
 1   store   913000 non-null  int64         
 2   item    913000 non-null  int64         
 3   sales   913000 non-null  int64         
dtypes: datetime64[ns](1), int64(3)
memory usage: 27.9 MB
Data ranges from 2013-01-01 00:00:00 to 2017-12-31 00:00:00


In [5]:
# Create time-based features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['dayofweek'] = df['date'].dt.dayofweek # Monday=0, Sunday=6

# Let's look at the new features
df.head()

Unnamed: 0,date,store,item,sales,year,month,day,dayofweek
0,2013-01-01,1,1,13,2013,1,1,1
1,2013-01-02,1,1,11,2013,1,2,2
2,2013-01-03,1,1,14,2013,1,3,3
3,2013-01-04,1,1,13,2013,1,4,4
4,2013-01-05,1,1,10,2013,1,5,5


In [6]:
# --- Advanced Feature Engineering ---

print("Starting advanced feature engineering...")

# It's crucial to sort by date to ensure lags/rolling windows are correct
df.sort_values(by=['store', 'item', 'date'], axis=0, inplace=True)

# Lag Features (sales from previous time periods)
def create_lags(df, lags):
    for lag in lags:
        df[f'sales_lag_{lag}'] = df.groupby(['store', 'item'])['sales'].shift(lag)
    return df

# Let's create lags for the past week, two weeks, month, and year
df = create_lags(df, lags=[7, 14, 28, 365])
print("Lag features created.")

# Rolling Window Features (stats over recent time periods)
def create_rolling_features(df, windows):
    for window in windows:
        # We use .transform() to keep the original DataFrame shape
        df[f'sales_rolling_mean_{window}'] = df.groupby(['store', 'item'])['sales'].transform(
            lambda s: s.shift(1).rolling(window, min_periods=1).mean() # shift(1) to avoid using current day's sales
        )
        df[f'sales_rolling_std_{window}'] = df.groupby(['store', 'item'])['sales'].transform(
            lambda s: s.shift(1).rolling(window, min_periods=1).std()
        )
    return df

# Create rolling stats for the past week and month
df = create_rolling_features(df, windows=[7, 28])
print("Rolling features created.")

# Handle the NaN values created by lags and rolling windows
# For tree-based models, filling with 0 is a reasonable strategy
df.fillna(0, inplace=True)
print("NaN values filled.")

# Display the new features
df.head(10)

Starting advanced feature engineering...
Lag features created.
Rolling features created.
NaN values filled.


Unnamed: 0,date,store,item,sales,year,month,day,dayofweek,sales_lag_7,sales_lag_14,sales_lag_28,sales_lag_365,sales_rolling_mean_7,sales_rolling_std_7,sales_rolling_mean_28,sales_rolling_std_28
0,2013-01-01,1,1,13,2013,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2013-01-02,1,1,11,2013,1,2,2,0.0,0.0,0.0,0.0,13.0,0.0,13.0,0.0
2,2013-01-03,1,1,14,2013,1,3,3,0.0,0.0,0.0,0.0,12.0,1.414214,12.0,1.414214
3,2013-01-04,1,1,13,2013,1,4,4,0.0,0.0,0.0,0.0,12.666667,1.527525,12.666667,1.527525
4,2013-01-05,1,1,10,2013,1,5,5,0.0,0.0,0.0,0.0,12.75,1.258306,12.75,1.258306
5,2013-01-06,1,1,12,2013,1,6,6,0.0,0.0,0.0,0.0,12.2,1.643168,12.2,1.643168
6,2013-01-07,1,1,10,2013,1,7,0,0.0,0.0,0.0,0.0,12.166667,1.47196,12.166667,1.47196
7,2013-01-08,1,1,9,2013,1,8,1,13.0,0.0,0.0,0.0,11.857143,1.573592,11.857143,1.573592
8,2013-01-09,1,1,12,2013,1,9,2,11.0,0.0,0.0,0.0,11.285714,1.799471,11.5,1.772811
9,2013-01-10,1,1,9,2013,1,10,3,14.0,0.0,0.0,0.0,11.428571,1.812654,11.555556,1.666667


In [7]:
# Create train and test sets based on time
train = df[df['date'] < '2017-01-01']
test = df[df['date'] >= '2017-01-01']

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (730500, 16)
Test shape: (182500, 16)


In [8]:
# Define which columns are our features (X) and which is our target (y)
features = [
    'store',
    'item',
    'year',
    'month',
    'day',
    'dayofweek',
    'sales_lag_7',
    'sales_lag_14',
    'sales_lag_28',
    'sales_lag_365',
    'sales_rolling_mean_7',
    'sales_rolling_std_7',
    'sales_rolling_mean_28',
    'sales_rolling_std_28'
]
target = 'sales'

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

In [9]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Create the LightGBM model
# We use some standard parameters for good performance
model = lgb.LGBMRegressor(
    objective='regression_l1', # MAE
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# Train the model
model.fit(X_train, y_train,
          eval_set=[(X_test, y_test)],
          eval_metric='mae',
          callbacks=[lgb.early_stopping(100, verbose=True)])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007919 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1858
[LightGBM] [Info] Number of data points in the train set: 730500, number of used features: 14
[LightGBM] [Info] Start training from score 45.000000
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[997]	valid_0's l1: 6.09614


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,1000
,subsample_for_bin,200000
,objective,'regression_l1'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [10]:
import joblib

# Save the model to a file
joblib.dump(model, '../lgbm_model.joblib')
print("Model saved successfully!")

Model saved successfully!
