In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Read the semicolon-separated orders file into a DataFrame.
df = pd.read_csv('orders_train.txt', sep=';')

# Summarise the span of order dates (first and last row) and the frame size.
first_date = df['orderDate'].iloc[0]
last_date = df['orderDate'].iloc[-1]
n_records, n_columns = df.shape
print(f'Number of records between {first_date} to {last_date}: {n_records}')
# One column is subtracted from the column count before reporting it.
print(f'Number of attributes corresponding to a single record {n_columns - 1}:')
df.head()

Number of records between 2014-01-01 to 2015-09-30: 2325165
Number of attributes corresponding to a single record 14:


Unnamed: 0,orderID,orderDate,articleID,colorCode,sizeCode,productGroup,quantity,price,rrp,voucherID,voucherAmount,customerID,deviceID,paymentMethod,returnQuantity
0,a1000001,2014-01-01,i1000382,1972,44,3.0,1,10.0,29.99,0,0.0,c1010575,2,BPRG,0
1,a1000001,2014-01-01,i1000550,3854,44,3.0,1,20.0,39.99,0,0.0,c1010575,2,BPRG,0
2,a1000002,2014-01-01,i1001991,2974,38,8.0,1,35.0,49.99,0,0.0,c1045905,4,BPRG,0
3,a1000002,2014-01-01,i1001999,1992,38,8.0,1,49.99,49.99,0,0.0,c1045905,4,BPRG,1
4,a1000003,2014-01-01,i1001942,1968,42,8.0,1,10.0,35.99,0,0.0,c1089295,2,PAYPALVC,0


In [None]:
# Report the number of missing values in every column.
print(df.isna().sum())
# Rows with missing values are few, so they are simply discarded.
df = df.dropna()

# Inconsistent records: nothing ordered, a zero price, or more items
# returned than were ordered.
zero_quantity = df['quantity'].eq(0)
zero_price = df['price'].eq(0)
over_returned = df['returnQuantity'].gt(df['quantity'])
drop_idx = df.index[(zero_quantity | zero_price | over_returned).to_numpy()]
df = df.drop(index=drop_idx)
df.shape


orderID             0
orderDate           0
articleID           0
colorCode           0
sizeCode            0
productGroup      351
quantity            0
price               0
rrp               351
voucherID           6
voucherAmount       0
customerID          0
deviceID            0
paymentMethod       0
returnQuantity      0
dtype: int64


(2277968, 15)

In [None]:
# Total quantity over all rows that share an orderID.
df['total_quantity_per_order'] = df.groupby('orderID')['quantity'].transform('sum')

# Mean recommended retail price per article.
df['mean_rrp_per_article'] = df.groupby('articleID')['rrp'].transform('mean')

# Number of distinct orders placed by each customer.
df['total_orders_per_customer'] = df.groupby('customerID')['orderID'].transform('nunique')

# Decompose colorCode into its first four digits. The code is zero-padded to
# four characters first, so a shorter code yields leading zeros instead of an
# IndexError; codes that already have four or more digits give the same
# digits as plain string indexing.
color_digits = df['colorCode'].astype(str).str.zfill(4)
for position in range(4):
    df[f'colorCode_{position + 1}'] = color_digits.str[position].astype('int64')

# Customer likelihood of returning items: returned quantity / ordered quantity.
# WARNING: this is derived from returnQuantity (the prediction target) over
# every row in `df`, including the row itself. Used as a model input it leaks
# the target, so it must be recomputed from training labels only once the
# data has been split.
per_customer = df.groupby('customerID')
df['likelihood_of_returning'] = (
    per_customer['returnQuantity'].transform('sum')
    / per_customer['quantity'].transform('sum')
)

# Price level categorisation based on rrp: [0, 20], (20, 50], (50, 100], (100, inf).
bins = [0, 20, 50, 100, float('inf')]
labels = ['cheap', 'regular', 'expensive', 'luxury']
df['price_level'] = pd.cut(df['rrp'], bins=bins, labels=labels, include_lowest=True)


In [None]:
# Inspect column names and dtypes after the engineered features were added.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2277968 entries, 0 to 2325164
Data columns (total 24 columns):
 #   Column                     Dtype   
---  ------                     -----   
 0   orderID                    object  
 1   orderDate                  object  
 2   articleID                  object  
 3   colorCode                  int64   
 4   sizeCode                   object  
 5   productGroup               float64 
 6   quantity                   int64   
 7   price                      float64 
 8   rrp                        float64 
 9   voucherID                  object  
 10  voucherAmount              float64 
 11  customerID                 object  
 12  deviceID                   int64   
 13  paymentMethod              object  
 14  returnQuantity             int64   
 15  total_quantity_per_order   int64   
 16  mean_rrp_per_article       float64 
 17  total_orders_per_customer  int64   
 18  colorCode_1                int64   
 19  colorCode_2               

In [None]:
# productGroup and deviceID are numeric codes; store them as strings so they
# are handled as categories rather than magnitudes.
for code_column in ('productGroup', 'deviceID'):
    df[code_column] = df[code_column].astype(str)

In [None]:
# Convert the categorical produced by pd.cut into plain strings.
price_level_labels = df['price_level'].astype(str)
df['price_level'] = price_level_labels

In [None]:
# Define columns for one-hot encoding
OHE_feats = ['productGroup', 'deviceID', 'paymentMethod', 'price_level']

# Perform one-hot encoding, dropping the first level of every feature.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=False` keyword was deprecated in scikit-learn 1.2
# and removed in 1.4, whereas this form works on old and new releases alike.
encoder = OneHotEncoder(drop='first')
encoded_features = encoder.fit_transform(df[OHE_feats]).toarray()

# Create DataFrame with one-hot encoded features (fresh 0..n-1 index)
df_OHE = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(OHE_feats))

# Drop the original categorical columns from `df` and give it a 0..n-1 index
# as well, so its rows line up positionally with df_OHE.
df = df.drop(columns=OHE_feats).reset_index(drop=True)
assert len(df) == len(df_OHE), "row count mismatch between df and its encoded features"

# Concatenate the one-hot encoded columns to the remaining columns
data = pd.concat([df, df_OHE], axis=1)



In [None]:
# Count missing values per column of `df`. Note that the one-hot encoded
# columns were concatenated into `data`, not `df`, so they are not covered here.
df.isna().sum()

orderID                      0
orderDate                    0
articleID                    0
colorCode                    0
sizeCode                     0
quantity                     0
price                        0
rrp                          0
voucherID                    0
voucherAmount                0
customerID                   0
returnQuantity               0
total_quantity_per_order     0
mean_rrp_per_article         0
total_orders_per_customer    0
colorCode_1                  0
colorCode_2                  0
colorCode_3                  0
colorCode_4                  0
likelihood_of_returning      0
dtype: int64

In [None]:
# Inspect `df` after the categorical columns were dropped and the index reset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2277968 entries, 0 to 2277967
Data columns (total 20 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   orderID                    object 
 1   orderDate                  object 
 2   articleID                  object 
 3   colorCode                  int64  
 4   sizeCode                   object 
 5   quantity                   int64  
 6   price                      float64
 7   rrp                        float64
 8   voucherID                  object 
 9   voucherAmount              float64
 10  customerID                 object 
 11  returnQuantity             int64  
 12  total_quantity_per_order   int64  
 13  mean_rrp_per_article       float64
 14  total_orders_per_customer  int64  
 15  colorCode_1                int64  
 16  colorCode_2                int64  
 17  colorCode_3                int64  
 18  colorCode_4                int64  
 19  likelihood_of_returning    float64
dtypes:

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode 'sizeCode': each distinct value is replaced by its position
# in the encoder's sorted list of classes.
label_encoder = LabelEncoder()
data['sizeCode'] = label_encoder.fit_transform(data['sizeCode'])

# Show which integer each original category was mapped to.
print("Mapping of original categories to encoded labels:")
for encoded_label, original_category in enumerate(label_encoder.classes_):
    print(f"{original_category} -> {encoded_label}")

# Show the encoded column itself.
print("\nEncoded 'sizeCode' column:")
print(data['sizeCode'])

Mapping of original categories to encoded labels:
100 -> 0
24 -> 1
25 -> 2
26 -> 3
27 -> 4
28 -> 5
29 -> 6
30 -> 7
31 -> 8
32 -> 9
33 -> 10
34 -> 11
36 -> 12
38 -> 13
40 -> 14
42 -> 15
44 -> 16
75 -> 17
80 -> 18
85 -> 19
90 -> 20
95 -> 21
A -> 22
I -> 23
L -> 24
M -> 25
S -> 26
XL -> 27
XS -> 28

Encoded 'sizeCode' column:
0          16
1          16
2          13
3          13
4          15
           ..
2277963    14
2277964    15
2277965    13
2277966    13
2277967    13
Name: sizeCode, Length: 2277968, dtype: int64


In [None]:
# Preview the combined frame: remaining original columns plus one-hot columns.
data.head()

Unnamed: 0,orderID,orderDate,articleID,colorCode,sizeCode,quantity,price,rrp,voucherID,voucherAmount,...,paymentMethod_CBA,paymentMethod_KGRG,paymentMethod_KKE,paymentMethod_NN,paymentMethod_PAYPALVC,paymentMethod_RG,paymentMethod_VORAUS,price_level_expensive,price_level_luxury,price_level_regular
0,a1000001,2014-01-01,i1000382,1972,16,1,10.0,29.99,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,a1000001,2014-01-01,i1000550,3854,16,1,20.0,39.99,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,a1000002,2014-01-01,i1001991,2974,13,1,35.0,49.99,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,a1000002,2014-01-01,i1001999,1992,13,1,49.99,49.99,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,a1000003,2014-01-01,i1001942,1968,15,1,10.0,35.99,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Spot-check the three aggregate features on the first rows.
data[['total_quantity_per_order','mean_rrp_per_article','total_orders_per_customer']].head(20)

Unnamed: 0,total_quantity_per_order,mean_rrp_per_article,total_orders_per_customer
0,2,29.99,2
1,2,39.99,2
2,2,49.99,1
3,2,49.99,1
4,4,35.99,1
5,4,35.99,1
6,4,39.99,1
7,4,39.99,1
8,1,89.99,1
9,3,39.99,4


In [None]:
# Columns left out of the modelling frame: the ID columns, the raw order date,
# price, and the raw colorCode (its digits are kept as separate columns).
excluded_columns = ['orderID', 'voucherID', 'orderDate', 'customerID', 'price', 'articleID', 'colorCode']
final = data.drop(columns=excluded_columns)
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2277968 entries, 0 to 2277967
Data columns (total 43 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   sizeCode                   int64  
 1   quantity                   int64  
 2   rrp                        float64
 3   voucherAmount              float64
 4   returnQuantity             int64  
 5   total_quantity_per_order   int64  
 6   mean_rrp_per_article       float64
 7   total_orders_per_customer  int64  
 8   colorCode_1                int64  
 9   colorCode_2                int64  
 10  colorCode_3                int64  
 11  colorCode_4                int64  
 12  likelihood_of_returning    float64
 13  productGroup_13.0          float64
 14  productGroup_14.0          float64
 15  productGroup_15.0          float64
 16  productGroup_17.0          float64
 17  productGroup_2.0           float64
 18  productGroup_26.0          float64
 19  productGroup_3.0           float64
 20  pr

In [None]:
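# TODO: 'likelihood_of_returning' is computed from returnQuantity (the target); decide whether to exclude it,
# e.g. train_data = final.drop(columns="likelihood_of_returning")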

In [None]:
# Target and feature matrix. Selecting/dropping (rather than final.pop) leaves
# `final` untouched, so this cell can be re-run without a KeyError.
y = final['returnQuantity']
X = final.drop(columns='returnQuantity')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# Independent copies, so later column assignments modify these frames cleanly
# instead of triggering SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()

# 'likelihood_of_returning' was built from returnQuantity over ALL rows, so it
# carries each row's own label and the held-out labels. Rebuild it from
# training labels only:
#   * training rows get an out-of-fold rate (their own fold's labels excluded);
#   * test rows get the rate over the whole training split;
#   * customers without usable training history get the overall training rate.
train_stats = pd.DataFrame({
    'customerID': data.loc[X_train.index, 'customerID'],
    'returned': y_train,
    'ordered': X_train['quantity'],
})
prior_rate = train_stats['returned'].sum() / train_stats['ordered'].sum()

n_encoding_folds = 5
fold_of_row = np.random.default_rng(0).integers(0, n_encoding_folds, size=len(train_stats))
oof_rate = pd.Series(prior_rate, index=train_stats.index, dtype='float64')
for fold in range(n_encoding_folds):
    in_fold = fold_of_row == fold
    other_totals = train_stats[~in_fold].groupby('customerID')[['returned', 'ordered']].sum()
    other_rate = other_totals['returned'] / other_totals['ordered']
    oof_rate.loc[in_fold] = (
        train_stats.loc[in_fold, 'customerID'].map(other_rate).fillna(prior_rate).to_numpy()
    )
X_train['likelihood_of_returning'] = oof_rate

train_totals = train_stats.groupby('customerID')[['returned', 'ordered']].sum()
train_rate = train_totals['returned'] / train_totals['ordered']
X_test['likelihood_of_returning'] = (
    data.loc[X_test.index, 'customerID'].map(train_rate).fillna(prior_rate)
)

print( f"shape of X_train, y_train, X_test, y_test: {X_train.shape}, {y_train.shape},{X_test.shape},{y_test.shape}")

shape of X_train, y_train, X_test, y_test: (1708476, 42), (1708476,),(569492, 42),(569492,)


In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric features to standardise.
columns_to_normalize = [
    'quantity',
    'rrp',
    'voucherAmount',
    'total_quantity_per_order',
    'mean_rrp_per_article',
    'total_orders_per_customer',
]

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train[columns_to_normalize])
X_train[columns_to_normalize] = scaler.transform(X_train[columns_to_normalize])
X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])


In [None]:
# Linear-regression baseline; an intercept is fitted (the estimator's default).
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Predictions of the fitted model on the held-out split.
y_pred_test = model.predict(X_test)

# Squared and absolute error between held-out targets and predictions.
mse_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
mae_test = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

print(f'Testing MSE: {mse_test}')
print(f'Testing MAE: {mae_test}')


Testing MSE: 0.16548555317595007
Testing MAE: 0.33556987212502104


# Model according to the report: stacked ensemble


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error
from xgboost import XGBClassifier


# The code below takes predict_proba(...)[:, 1] and a binary log loss, i.e. it
# models "was anything returned?". returnQuantity is a count, so make that
# explicit; if the target is already 0/1 this is a no-op.
y_train_bin = (y_train > 0).astype(int)
y_test_bin = (y_test > 0).astype(int)

# Base models (seeded so the results are reproducible)
# Regularized Logistic Regression; max_iter raised because the default limit
# was reached before the solver converged.
lr = LogisticRegression(fit_intercept=True, max_iter=1000)

# Random Forest
rf = RandomForestClassifier(random_state=0)

# Gradient Boosting
gb = GradientBoostingClassifier(random_state=0)

# XGBoost
xgb = XGBClassifier(eval_metric='logloss', random_state=0)

base_models = [lr, rf, gb, xgb]

# Out-of-fold predictions on the TRAINING split: every training row is scored
# by models that never saw it. These are the meta-learner's training inputs,
# so the second layer is never fitted on test rows or test labels.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
stacked_features_train = np.column_stack([
    cross_val_predict(base_model, X_train, y_train_bin, cv=cv, method='predict_proba')[:, 1]
    for base_model in base_models
])

# Refit every base model on the full training split for test-time predictions.
for base_model in base_models:
    base_model.fit(X_train, y_train_bin)

# Predict probabilities on the held-out split
lr_probs = lr.predict_proba(X_test)[:, 1]
rf_probs = rf.predict_proba(X_test)[:, 1]
gb_probs = gb.predict_proba(X_test)[:, 1]
xgb_probs = xgb.predict_proba(X_test)[:, 1]

# Stack predictions (same column order as stacked_features_train)
stacked_features = np.column_stack((lr_probs, rf_probs, gb_probs, xgb_probs))

# Second layer model (using XGBoost for stacking), trained on the out-of-fold
# training predictions only. `use_label_encoder` is omitted, matching the base
# XGBoost model above; the argument is deprecated in current xgboost releases.
second_layer_model = XGBClassifier(eval_metric='logloss', random_state=0)
second_layer_model.fit(stacked_features_train, y_train_bin)

# Predict final output on the held-out split
final_predictions = second_layer_model.predict_proba(stacked_features)[:, 1]

# Held-out metrics
final_log_loss = log_loss(y_test_bin, final_predictions)
final_mae = mean_absolute_error(y_test_bin, final_predictions)
final_mse = mean_squared_error(y_test_bin, final_predictions)
print(f'Mean Absolute Error: {final_mae}')
print(f'Mean Squared Error: {final_mse}')
print(f'Final Log Loss: {final_log_loss}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
