In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Read the semicolon-delimited order lines into a DataFrame.
df = pd.read_csv('orders_train.txt', sep=';')

# The reported date span is taken from the first and last rows as they appear in the file.
first_order_date = df['orderDate'].iloc[0]
last_order_date = df['orderDate'].iloc[-1]
print('Number of records between {} to {}: {}'.format(first_order_date, last_order_date, len(df)))

# One column is left out of the attribute count (presumably the target column - confirm).
n_columns = df.shape[1]
print('Number of attributes corresponding to a single record {}:'.format(n_columns - 1))
df.head()

Number of records between 2014-01-01 to 2015-09-30: 2325165
Number of attributes corresponding to a single record 14:


Unnamed: 0,orderID,orderDate,articleID,colorCode,sizeCode,productGroup,quantity,price,rrp,voucherID,voucherAmount,customerID,deviceID,paymentMethod,returnQuantity
0,a1000001,2014-01-01,i1000382,1972,44,3.0,1,10.0,29.99,0,0.0,c1010575,2,BPRG,0
1,a1000001,2014-01-01,i1000550,3854,44,3.0,1,20.0,39.99,0,0.0,c1010575,2,BPRG,0
2,a1000002,2014-01-01,i1001991,2974,38,8.0,1,35.0,49.99,0,0.0,c1045905,4,BPRG,0
3,a1000002,2014-01-01,i1001999,1992,38,8.0,1,49.99,49.99,0,0.0,c1045905,4,BPRG,1
4,a1000003,2014-01-01,i1001942,1968,42,8.0,1,10.0,35.99,0,0.0,c1089295,2,PAYPALVC,0


In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Report how many missing values each column contains
na_counts = df.isna().sum()
print(na_counts)
# Rows with a missing value are few, so they are simply discarded
df = df.dropna()

## Inconsistent records: zero quantity, zero price, or more items returned than ordered
is_inconsistent = (
    (df['quantity'] == 0)
    | (df['price'] == 0)
    | (df['returnQuantity'] > df['quantity'])
)
df = df.drop(index=df.index[is_inconsistent.to_numpy()])
df.shape


orderID             0
orderDate           0
articleID           0
colorCode           0
sizeCode            0
productGroup      351
quantity            0
price               0
rrp               351
voucherID           6
voucherAmount       0
customerID          0
deviceID            0
paymentMethod       0
returnQuantity      0
dtype: int64


(2277968, 15)

In [None]:
# Aggregating total quantity per order
df['total_quantity_per_order'] = df.groupby('orderID')['quantity'].transform('sum')

# Mean recommended retail price per article
df['mean_rrp_per_article'] = df.groupby('articleID')['rrp'].transform('mean')

# Total number of orders per customer
df['total_orders_per_customer'] = df.groupby('customerID')['orderID'].transform('nunique')


# Decompose colorCode into its first four digits.
# Codes are left-padded with zeros so a code shorter than four digits no longer
# raises IndexError, and the vectorised .str accessor replaces four row-wise
# apply() passes over the whole frame.
color_digits = df['colorCode'].astype(str).str.zfill(4)
for position in range(4):
    df[f'colorCode_{position + 1}'] = color_digits.str[position].astype(int)

# Customer likelihood of returning items.
# returnQuantity is what the prediction target is derived from, so a rate that
# includes the row's own returnQuantity leaks the label into the feature.
# The rate is therefore computed leave-one-out: the current row is subtracted
# from the customer's totals. Customers with no other rows fall back to the
# overall return rate.
# NOTE(review): other rows of the same customer can still end up on the opposite
# side of a later train/test split; a fully leak-free version has to be fitted
# on the training rows only.
customer_groups = df.groupby('customerID')
other_returned = customer_groups['returnQuantity'].transform('sum') - df['returnQuantity']
other_ordered = customer_groups['quantity'].transform('sum') - df['quantity']
overall_return_rate = df['returnQuantity'].sum() / df['quantity'].sum()
df['likelihood_of_returning'] = (
    other_returned / other_ordered.where(other_ordered > 0)
).fillna(overall_return_rate)

# Price level categorization based on rrp
bins = [0, 20, 50, 100, float('inf')]  # Define bins for price levels
labels = ['cheap', 'regular', 'expensive', 'luxury']
df['price_level'] = pd.cut(df['rrp'], bins=bins, labels=labels, include_lowest=True)


In [None]:
# Convert the product-group and device codes to strings, one column at a time.
for categorical_col in ('productGroup', 'deviceID'):
    df[categorical_col] = df[categorical_col].astype(str)

In [None]:
# Convert the price_level column to plain strings.
df['price_level'] = df['price_level'].astype(str)

In [None]:
# Define columns for one-hot encoding
OHE_feats = ['productGroup', 'deviceID', 'paymentMethod', 'price_level']

# Perform one-hot encoding (dense output, first level of every feature dropped).
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so the new name is tried first with a fallback for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')
encoded_features = encoder.fit_transform(df[OHE_feats])

# Create DataFrame with one-hot encoded features (it gets a default RangeIndex)
df_OHE = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(OHE_feats))

# Concatenate the remaining columns with the one-hot columns.
# The categorical columns are dropped and the index is reset on a copy, so the
# two parts line up row-for-row while `df` itself stays untouched. Mutating
# `df` in place made this cell fail with KeyError when it was run a second time.
data = pd.concat(
    [df.drop(columns=OHE_feats).reset_index(drop=True), df_OHE],
    axis=1,
)



In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every distinct sizeCode value with an integer label
label_encoder = LabelEncoder()
data['sizeCode'] = label_encoder.fit_transform(data['sizeCode'])

# classes_ holds the original categories in label order, so a category's
# position in that array is its encoded label
print("Mapping of original categories to encoded labels:")
for encoded_label, original_category in enumerate(label_encoder.classes_):
    print(f"{original_category} -> {encoded_label}")

# Show the encoded column
print("\nEncoded 'sizeCode' column:")
print(data['sizeCode'])

Mapping of original categories to encoded labels:
100 -> 0
24 -> 1
25 -> 2
26 -> 3
27 -> 4
28 -> 5
29 -> 6
30 -> 7
31 -> 8
32 -> 9
33 -> 10
34 -> 11
36 -> 12
38 -> 13
40 -> 14
42 -> 15
44 -> 16
75 -> 17
80 -> 18
85 -> 19
90 -> 20
95 -> 21
A -> 22
I -> 23
L -> 24
M -> 25
S -> 26
XL -> 27
XS -> 28

Encoded 'sizeCode' column:
0          16
1          16
2          13
3          13
4          15
           ..
2277963    14
2277964    15
2277965    13
2277966    13
2277967    13
Name: sizeCode, Length: 2277968, dtype: int64


In [None]:
# Columns that are not passed on to the model input
unused_columns = [
    'orderID',
    'voucherID',
    'orderDate',
    'customerID',
    'price',
    'articleID',
    'colorCode',
]
final = data.drop(columns=unused_columns)
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2277968 entries, 0 to 2277967
Data columns (total 43 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   sizeCode                   int64  
 1   quantity                   int64  
 2   rrp                        float64
 3   voucherAmount              float64
 4   returnQuantity             int64  
 5   total_quantity_per_order   int64  
 6   mean_rrp_per_article       float64
 7   total_orders_per_customer  int64  
 8   colorCode_1                int64  
 9   colorCode_2                int64  
 10  colorCode_3                int64  
 11  colorCode_4                int64  
 12  likelihood_of_returning    float64
 13  productGroup_13.0          float64
 14  productGroup_14.0          float64
 15  productGroup_15.0          float64
 16  productGroup_17.0          float64
 17  productGroup_2.0           float64
 18  productGroup_26.0          float64
 19  productGroup_3.0           float64
 20  pr

In [None]:
# Binary target: 1 when returnQuantity is positive, otherwise 0
final['returnLabel'] = final['returnQuantity'].gt(0).astype(int)

In [None]:
# Target vector and feature matrix.
# Both are derived without mutating `final`: the previous pop() / in-place drop
# removed the columns from `final`, so running the cell a second time raised
# KeyError. returnQuantity is excluded from the features because returnLabel is
# computed directly from it.
y = final['returnLabel']
X = final.drop(columns=['returnLabel', 'returnQuantity'])

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print( f"shape of X_train, y_train, X_test, y_test: {X_train.shape}, {y_train.shape},{X_test.shape},{y_test.shape}")

shape of X_train, y_train, X_test, y_test: (1708476, 42), (1708476,),(569492, 42),(569492,)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column.
# The scaler learns its statistics from the training split only; the same
# transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier with scikit-learn's default settings, fitted on the training split
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report

# Class predictions of the baseline model on the held-out split
y_pred_test = model.predict(X_test)

# Compute all metrics first, then report them.
# NOTE(review): with 0/1 labels, squared and absolute errors coincide, so MSE
# and MAE are both the share of misclassified rows.
mse_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
mae_test = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
report = classification_report(y_true=y_test, y_pred=y_pred_test)

print('Testing MSE:', mse_test)
print('Testing MAE:', mae_test)
print('Classification Report:\n', report)


Testing MSE: 0.2508937790170889
Testing MAE: 0.2508937790170889
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.68      0.72    268388
           1       0.74      0.81      0.77    301104

    accuracy                           0.75    569492
   macro avg       0.75      0.75      0.75    569492
weighted avg       0.75      0.75      0.75    569492



In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training split. The scorer returns the
# negated MSE, so the sign is flipped before reporting the average.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
average_mse = -scores.mean()
print("Average MSE:", average_mse)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


# Base estimator: the saga solver accepts both penalties searched below;
# max_iter is raised to give it room to converge.
model = LogisticRegression(solver='saga', max_iter=1000, random_state=42)

# Search space: regularisation strength on a logarithmic scale, L1 and L2 penalties
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    penalty=['l1', 'l2'],
)

# Exhaustive 5-fold search scored by accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and the corresponding estimator
print("Best parameters:", grid_search.best_params_)
print("Best estimator:", grid_search.best_estimator_)


Fitting 5 folds for each of 14 candidates, totalling 70 fits
Best parameters: {'C': 0.001, 'penalty': 'l2'}
Best estimator: LogisticRegression(C=0.001, max_iter=1000, random_state=42, solver='saga')


In [None]:
# Predict the held-out split with the best estimator found by the grid search
tuned_model = grid_search.best_estimator_
y_pred = tuned_model.predict(X_test)

In [None]:
# Compute all metrics for the tuned model first, then report them.
# NOTE(review): with 0/1 labels, squared and absolute errors coincide, so MSE
# and MAE are both the share of misclassified rows.
mse_test = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae_test = mean_absolute_error(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print('Testing MSE:', mse_test)
print('Testing MAE:', mae_test)
print('Classification Report:\n', report)

Testing MSE: 0.25086743975332404
Testing MAE: 0.25086743975332404
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.68      0.72    268388
           1       0.74      0.81      0.77    301104

    accuracy                           0.75    569492
   macro avg       0.75      0.75      0.75    569492
weighted avg       0.75      0.75      0.75    569492

