# Modelling

In this notebook, the hypothesis obtained: "rating, item_promotion_discount, ad_cost_eur", provides significant contribution to the success of a product is tested by ML models. 
1. The data is aggregated on a monthly level for each product. An ARIMA model is created to forecast the sales_rank, for the last month available for each product, only based on the historic sales_rank data from the previous months. This will act as a baseline model. 
2. Then a Random Forest model is trained with the chosen features data from the past to predict the sales_rank of the last_month. 
3. The performance of the models are compared based on the MAPE to find the outcome of the hypothesis.

In [215]:
# Import packages
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

from statsmodels.tsa.arima.model import ARIMA

from math import sqrt

from sklearn.ensemble import RandomForestRegressor

In [166]:
# Imported the data
sales_rank_rating_df = pd.read_csv('Data/product_dataset.csv')

In [168]:
sales_rank_rating_df.head()

Unnamed: 0.1,Unnamed: 0,date,asin,sales_rank,rating,ad_cost_eur,item_promotion_discount
0,0,2020-08-01,018cd764d4b819f3ab1bbfd3562796f0,-1,4.0,13.432,0.0
1,1,2020-08-21,018cd764d4b819f3ab1bbfd3562796f0,-1,4.0,0.0,0.0
2,2,2020-08-30,018cd764d4b819f3ab1bbfd3562796f0,6472,4.0,0.0,0.0
3,3,2020-08-31,018cd764d4b819f3ab1bbfd3562796f0,12946,4.0,0.0,0.0
4,4,2020-09-01,018cd764d4b819f3ab1bbfd3562796f0,14498,4.0,9.198,0.0


In [169]:
# Dropping rows where sales_rank = -1
sales_rank_rating_df = sales_rank_rating_df[sales_rank_rating_df.sales_rank != -1]

In [170]:
sales_rank_rating_df.head()

Unnamed: 0.1,Unnamed: 0,date,asin,sales_rank,rating,ad_cost_eur,item_promotion_discount
2,2,2020-08-30,018cd764d4b819f3ab1bbfd3562796f0,6472,4.0,0.0,0.0
3,3,2020-08-31,018cd764d4b819f3ab1bbfd3562796f0,12946,4.0,0.0,0.0
4,4,2020-09-01,018cd764d4b819f3ab1bbfd3562796f0,14498,4.0,9.198,0.0
5,5,2020-09-06,018cd764d4b819f3ab1bbfd3562796f0,16051,4.0,0.0,0.0
6,6,2020-09-07,018cd764d4b819f3ab1bbfd3562796f0,17604,4.0,0.0,0.0


In [171]:
# Convert the date as index
sales_rank_rating_df.date = pd.DatetimeIndex(sales_rank_rating_df.date)

In [173]:
# Define helper function
def aggregate(group):
    per = group.date.dt.to_period("M") 
    g = group.groupby(per)
    
    return g.mean()


# Group by asin and aggreagte values on a month level
monthly_sales_rank_rating_df = sales_rank_rating_df.groupby(['asin']).apply(lambda group: aggregate(group))

# Reset index
monthly_sales_rank_rating_df.reset_index(drop=False, inplace=True)

In [175]:
monthly_sales_rank_rating_df.head()

Unnamed: 0.1,asin,date,Unnamed: 0,sales_rank,rating,ad_cost_eur,item_promotion_discount
0,018cd764d4b819f3ab1bbfd3562796f0,2020-08,2.5,9709.0,4.0,0.0,0.0
1,018cd764d4b819f3ab1bbfd3562796f0,2020-09,8.5,41124.2,4.0,0.9198,0.0
2,018cd764d4b819f3ab1bbfd3562796f0,2020-10,18.0,64523.111111,4.0,0.270911,0.0
3,018cd764d4b819f3ab1bbfd3562796f0,2020-11,30.5,92202.875,4.0,0.303862,0.0
4,018cd764d4b819f3ab1bbfd3562796f0,2020-12,47.0,49602.176471,3.601961,0.115941,0.0


In [176]:
# Shape of the data frame after aggregation
monthly_sales_rank_rating_df.shape

(5209, 7)

In [177]:
# Convert sales_rank as int type
monthly_sales_rank_rating_df['sales_rank'] = monthly_sales_rank_rating_df['sales_rank'].astype(int)

### ARIMA
This model acts as a baseline for our hypothesis. Here, one model is trained for each asin using the data from all the months and the sales_rank for the last month is forecasted.

In [179]:
# Forecasted dataframe initialized
forecasted_df = pd.DataFrame()

# Define helper function
def forecasting(group):
    
    """
    This function trains a model for the given grouped data (each asin) from the data from the first N-1 months and predicts the
    sales_rank of the last (N) month using ARIMA. The default parameters are used since the goal is to test a hypothesis.
    """
    
    # Change date type
    group['date'] = group['date'].astype(str)
    group['date'] = pd.to_datetime(group['date'])
    
    # Find the last month
    last_month = group['date'].iloc[-1]
    
    # Train-test split
    train = group[group['date'] != last_month]
    test = group[group['date'] == last_month]
    
    # Reset the index 
    group.reset_index(drop=False, inplace=True)
    
    # Set date as index and Get the sales rank series
    sales_rank_ds = train.set_index('date')['sales_rank']
    
    # If length of the series is less than 2, return the same series or fit the model
    if sales_rank_ds.size == 0:
        
        # Append the forecasted series to the test set
        test['forecasted_sales_rank'] = 0 
        return test
    
    if sales_rank_ds.size == 1 :
        
        # Append the forecasted series to the test set
        test['forecasted_sales_rank'] = train['sales_rank']
        return test
    
    # Creating and fitting the model
    model = ARIMA(sales_rank_ds)
    model_fit = model.fit()    
    
    # Forecast the value
    forecasted_value = model_fit.forecast()
    
    # Append the forecasted series to the test set
    test['forecasted_sales_rank'] = forecasted_value.to_list()
    
    return test
    

# Group by asin and predict the sales_rank of the last_month
forecasted_sales_rank_df = forecasted_df.append(monthly_sales_rank_rating_df.groupby(['asin']).apply(lambda group: forecasting(group)))

# Reset index
forecasted_sales_rank_df.reset_index(drop=True, inplace=True)

In [181]:
# Drop rows which contains forecasted_sales_rank = 0
forecasted_sales_rank_df = forecasted_sales_rank_df[forecasted_sales_rank_df['forecasted_sales_rank'] != 0]

In [182]:
# Calculate the mean_average_percentage error for the forecasted and actual sales_rank for each asin
forecasted_sales_rank_df['MAPE'] = 100 * abs(
    (forecasted_sales_rank_df['sales_rank'] - forecasted_sales_rank_df['forecasted_sales_rank']) / forecasted_sales_rank_df['sales_rank'])

In [183]:
forecasted_sales_rank_df.head()

Unnamed: 0.1,asin,date,Unnamed: 0,sales_rank,rating,ad_cost_eur,item_promotion_discount,forecasted_sales_rank,MAPE
0,018cd764d4b819f3ab1bbfd3562796f0,2021-12-01,172.0,66864,4.4,0.0,0.0,56844.75,14.984521
1,0245e67a746d2bf134c15331fe42c109,2021-12-01,735.5,94340,4.2,0.006844,0.0,97902.043478,3.775751
2,02ffd26ebce197557ba930151e94a4da,2021-12-01,930.5,154635,3.7,0.0,0.0,506614.066667,227.619275
4,0372f815d983c2cf73fb108a7c0f446b,2021-12-01,1519.0,88566,4.0,0.266137,0.068571,54022.887117,39.002679
5,03a53ff277b1dc6fb7a6981dcd7bcefc,2021-12-01,1858.5,1146737,3.5,0.0,0.0,693373.428571,39.535096


In [185]:
# The avergae MAPE for the whole forecast
mape = forecasted_sales_rank_df['MAPE'].sum() / forecasted_sales_rank_df.shape[0]
mape

74.76405541057832

In [186]:
# Save as a CSV file
forecasted_sales_rank_df.to_csv('Data/forescasted_sales_rank.csv')

## Random Forest 

This model is used to test if the features: 'rating, ad_cost_eur, item_promotion_discount' contributes to the prediction of sales_rank

In [219]:
# Change date type
monthly_sales_rank_rating_df['date'] = monthly_sales_rank_rating_df['date'].astype(str)
monthly_sales_rank_rating_df['date'] = pd.to_datetime(monthly_sales_rank_rating_df['date'])

In [220]:
# Make a copy of the data
df = monthly_sales_rank_rating_df.copy()

In [221]:
# Extract monthly data
df['date'] = pd.to_datetime(df['date'], format = '%Y%m%d')
df['year'] = pd.DatetimeIndex(df['date']).year
df['month'] = pd.DatetimeIndex(df['date']).month

In [222]:
# Create one-hot encoding for the month and year, so that the date information can be fed to the model
df = pd.get_dummies(df, columns=["month", "year"])

In [223]:
df.head()

Unnamed: 0.1,asin,date,Unnamed: 0,sales_rank,rating,ad_cost_eur,item_promotion_discount,month_1,month_2,month_3,...,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,year_2020,year_2021
0,018cd764d4b819f3ab1bbfd3562796f0,2020-08-01,2.5,9709,4.0,0.0,0.0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,018cd764d4b819f3ab1bbfd3562796f0,2020-09-01,8.5,41124,4.0,0.9198,0.0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,018cd764d4b819f3ab1bbfd3562796f0,2020-10-01,18.0,64523,4.0,0.270911,0.0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,018cd764d4b819f3ab1bbfd3562796f0,2020-11-01,30.5,92202,4.0,0.303862,0.0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,018cd764d4b819f3ab1bbfd3562796f0,2020-12-01,47.0,49602,3.601961,0.115941,0.0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [239]:
# Drop all na
df.dropna(inplace=True)

In [264]:
predicted_df = pd.DataFrame()

# Define helper function
def prediction(group):
    """
    This function trains a random_forest model for the given grouped data (each asin) with the features 
    from the data from the first N-1 months and predicts the sales_rank of the last (N) month. 
    The default parameters are used since the goal is to test a hypothesis.
    """
    
    # Change date type
    group['date'] = group['date'].astype(str)
    group['date'] = pd.to_datetime(group['date'])
    
    # Find the last month
    last_month = group['date'].iloc[-1]
    
    # Train-test split
    train = group[group['date'] != last_month]
    test = group[group['date'] == last_month]
    
    # Reset the index 
    train.reset_index(drop=False, inplace=True)
    test.reset_index(drop=False, inplace=True)
    
    # Drop unwanted columns
    train.drop(['Unnamed: 0', 'date'], axis=1, inplace=True)
    test.drop(['Unnamed: 0', 'date'], axis=1, inplace=True)
    
    # Target variable
    y_train = train['sales_rank']
    y_test = test[['asin', 'sales_rank']]
    
    # Features
    X_train = train.drop(['index', 'asin', 'sales_rank'], axis=1)
    X_test = test.drop(['index', 'asin', 'sales_rank'], axis=1)
    
    if y_train.size == 0:
        
        # Append the forecasted series to the test set
        y_test['predicted_sales_rank'] = 0 
        
        return y_test
    
    # Model and fit
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    
    # Predict the sales_rank
    y_test['predicted_sales_rank'] = model.predict(X_test)
    
    # Calculate the feature importance
    feature_importance = pd.DataFrame({
        "Feature": X_train.columns,
        "Importances": model.feature_importances_
    })
   
    # Unpivot and add the feature importance to the dataframe that is returned
    temp_df = feature_importance.pivot(columns='Feature', values='Importances')
    temp_df = pd.concat([temp_df[col].dropna().reset_index(drop=True) for col in temp_df], axis=1)
    temp_df['asin'] = y_test['asin']
    
    # Merge the dataframes
    y_test = y_test.merge(temp_df, on='asin', how="right")
    
    return y_test
    

# Group by asin and predict the last month sales_rank of the asin
predicted_sales_rank_df = predicted_df.append(df.groupby(['asin']).apply(lambda group: prediction(group)))

# Reset index
predicted_sales_rank_df.reset_index(drop=True, inplace=True)

In [267]:
predicted_sales_rank_df.head()

Unnamed: 0,asin,sales_rank,predicted_sales_rank,ad_cost_eur,item_promotion_discount,month_1,month_10,month_11,month_12,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,rating,year_2020,year_2021
0,018cd764d4b819f3ab1bbfd3562796f0,66864,63054.3,0.207166,0.014213,0.09217104,0.039768,0.06190142,0.004982794,0.009148,0.019992,0.013498,0.005392,0.002311,0.015407,0.129149,0.009626,0.270904,0.058282,0.046089
1,0245e67a746d2bf134c15331fe42c109,94340,108736.11,0.323278,0.079,0.05058648,0.00708,0.001853818,0.00331833,0.06251,0.005718,0.04099,0.125316,0.006018,0.018978,0.030593,0.011879,0.20506,0.015924,0.011899
2,02ffd26ebce197557ba930151e94a4da,154635,318847.83,0.209696,0.006827,9.323448e-08,0.00012,0.0002344201,9.852192e-07,4.2e-05,0.001759,4e-06,7.4e-05,4.4e-05,3.7e-05,4.5e-05,0.000869,0.779838,0.000207,0.000202
3,0372f815d983c2cf73fb108a7c0f446b,88566,76190.67,0.132155,0.354924,0.000335201,0.012066,0.06377164,2.212321e-05,0.000103,0.008861,0.000539,0.003074,0.002812,0.005416,0.001335,0.004504,0.409485,0.00018,0.000419
4,03a53ff277b1dc6fb7a6981dcd7bcefc,1146737,859768.09,0.033629,0.0,0.007956781,0.003734,6.602886e-07,0.0004587552,0.008083,0.002059,0.018376,0.008117,0.004627,0.002274,0.000655,0.001692,0.463234,0.235093,0.210012


In [268]:
# Calculate the MAPE
predicted_sales_rank_df['MAPE'] = 100 * abs(
    (predicted_sales_rank_df['sales_rank'] - predicted_sales_rank_df['predicted_sales_rank']) / predicted_sales_rank_df['sales_rank'])

In [269]:
predicted_sales_rank_df.head()

Unnamed: 0,asin,sales_rank,predicted_sales_rank,ad_cost_eur,item_promotion_discount,month_1,month_10,month_11,month_12,month_2,...,month_4,month_5,month_6,month_7,month_8,month_9,rating,year_2020,year_2021,MAPE
0,018cd764d4b819f3ab1bbfd3562796f0,66864,63054.3,0.207166,0.014213,0.09217104,0.039768,0.06190142,0.004982794,0.009148,...,0.013498,0.005392,0.002311,0.015407,0.129149,0.009626,0.270904,0.058282,0.046089,5.697685
1,0245e67a746d2bf134c15331fe42c109,94340,108736.11,0.323278,0.079,0.05058648,0.00708,0.001853818,0.00331833,0.06251,...,0.04099,0.125316,0.006018,0.018978,0.030593,0.011879,0.20506,0.015924,0.011899,15.259816
2,02ffd26ebce197557ba930151e94a4da,154635,318847.83,0.209696,0.006827,9.323448e-08,0.00012,0.0002344201,9.852192e-07,4.2e-05,...,4e-06,7.4e-05,4.4e-05,3.7e-05,4.5e-05,0.000869,0.779838,0.000207,0.000202,106.193831
3,0372f815d983c2cf73fb108a7c0f446b,88566,76190.67,0.132155,0.354924,0.000335201,0.012066,0.06377164,2.212321e-05,0.000103,...,0.000539,0.003074,0.002812,0.005416,0.001335,0.004504,0.409485,0.00018,0.000419,13.973003
4,03a53ff277b1dc6fb7a6981dcd7bcefc,1146737,859768.09,0.033629,0.0,0.007956781,0.003734,6.602886e-07,0.0004587552,0.008083,...,0.018376,0.008117,0.004627,0.002274,0.000655,0.001692,0.463234,0.235093,0.210012,25.024823


In [270]:
# Reordering the columns
predicted_sales_rank_df = predicted_sales_rank_df[['asin', 'sales_rank', 'predicted_sales_rank', 'MAPE', 'rating', 'ad_cost_eur', 'item_promotion_discount',
                                                   'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 
                                                   'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 
                                                   'month_11', 'month_12', 'year_2020', 'year_2021']]

In [271]:
predicted_sales_rank_df.head()

Unnamed: 0,asin,sales_rank,predicted_sales_rank,MAPE,rating,ad_cost_eur,item_promotion_discount,month_1,month_2,month_3,...,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,year_2020,year_2021
0,018cd764d4b819f3ab1bbfd3562796f0,66864,63054.3,5.697685,0.270904,0.207166,0.014213,0.09217104,0.009148,0.019992,...,0.005392,0.002311,0.015407,0.129149,0.009626,0.039768,0.06190142,0.004982794,0.058282,0.046089
1,0245e67a746d2bf134c15331fe42c109,94340,108736.11,15.259816,0.20506,0.323278,0.079,0.05058648,0.06251,0.005718,...,0.125316,0.006018,0.018978,0.030593,0.011879,0.00708,0.001853818,0.00331833,0.015924,0.011899
2,02ffd26ebce197557ba930151e94a4da,154635,318847.83,106.193831,0.779838,0.209696,0.006827,9.323448e-08,4.2e-05,0.001759,...,7.4e-05,4.4e-05,3.7e-05,4.5e-05,0.000869,0.00012,0.0002344201,9.852192e-07,0.000207,0.000202
3,0372f815d983c2cf73fb108a7c0f446b,88566,76190.67,13.973003,0.409485,0.132155,0.354924,0.000335201,0.000103,0.008861,...,0.003074,0.002812,0.005416,0.001335,0.004504,0.012066,0.06377164,2.212321e-05,0.00018,0.000419
4,03a53ff277b1dc6fb7a6981dcd7bcefc,1146737,859768.09,25.024823,0.463234,0.033629,0.0,0.007956781,0.008083,0.002059,...,0.008117,0.004627,0.002274,0.000655,0.001692,0.003734,6.602886e-07,0.0004587552,0.235093,0.210012


In [272]:
# Average MAPE
mape = predicted_sales_rank_df['MAPE'].sum() / predicted_sales_rank_df.shape[0]
mape

45.149741046373876

In [273]:
# Save as a CSV file
predicted_sales_rank_df.to_csv('Data/predicted_sales_rank.csv')