# Model Training

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Import Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
import joblib

In [2]:
# Apply settings
pd.set_option('Display.max_columns', None)

In [3]:
data = pd.read_csv(r'Amazon_cleaned_data.csv')
data.head()

Unnamed: 0,Product,Category,Price,Quantity,Total Sales,Customer Location,Payment Method,Status
0,Running Shoes,Footwear,60,3,180,New York,Debit Card,Cancelled
1,Headphones,Electronics,100,4,400,San Francisco,Debit Card,Pending
2,Running Shoes,Footwear,60,2,120,Denver,Amazon Pay,Cancelled
3,Running Shoes,Footwear,60,3,180,Dallas,Credit Card,Pending
4,Smartwatch,Electronics,150,3,450,New York,Debit Card,Pending


In [15]:
# Backing up the data
df = data.copy()

In [16]:
df.tail()

Unnamed: 0,Product,Category,Price,Quantity,Total Sales,Customer Location,Payment Method,Status
245,T-Shirt,Clothing,20,2,40,Miami,Debit Card,Cancelled
246,Jeans,Clothing,40,1,40,Dallas,Debit Card,Cancelled
247,T-Shirt,Clothing,20,2,40,Denver,Debit Card,Cancelled
248,Smartwatch,Electronics,150,3,450,New York,Debit Card,Cancelled
249,Smartphone,Electronics,500,4,2000,Seattle,Amazon Pay,Completed


In [17]:
# Encode Target columns
df['Status'] = df['Status'].map({'Cancelled': 0, 'Pending':1, 'Completed': 2})
df['Status'].value_counts()

Status
2    88
1    85
0    77
Name: count, dtype: int64

In [18]:
# Seperate Features and targets
X = df.drop(columns='Status', axis=1)
y = df['Status']

In [19]:
# Numeric and Categoric Columns
num_cols = X.select_dtypes(include='number').columns.to_list()
cat_cols = X.select_dtypes(include='object').columns.to_list()

print('The Numeric Columns are: \n', num_cols)
print('The Categoric Columns are: \n', cat_cols)

The Numeric Columns are: 
 ['Price', 'Quantity', 'Total Sales']
The Categoric Columns are: 
 ['Product', 'Category', 'Customer Location', 'Payment Method']


In [20]:
# Split Data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [21]:
# Proccessing the data
num_transformer = Pipeline(steps= [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps= [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output= False))  # sparse_ouput = False is to get dense matrix due to Machine Learning
])

# Combine Steps
preproccessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

# Training on different models

In [22]:
# Models
model_dict= {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVM': SVC(probability=True),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder= False, eval_metric= 'logloss')
}

# Hyperparameter
# Generic Search Space
search_space= {
    'C': [0.1,1,10],
    'kernal': ['linear', 'rgf'],
    'n_estimators': [50,100,200],
    'max_depth': [None,5,10],
    'learning_rate': [0.5,1]
}

# Function to filter hyperparameter
def filter_hyperparameter(model,space):
    valid_keys= model.get_params().keys()
    return {k:v for k, v in space.items() if k in valid_keys}

In [23]:
# Grid Search for each model
result = []
best_pipeline = {}

for name, model in model_dict.items():
    print(f'Tuning {model}...')
    pipe = Pipeline(steps = [
        ('proccessor', preproccessor),
        ('model', model)
    ])
    hyperparameter = filter_hyperparameter(model, search_space)
    # Prefix model name
    param_grid = {f'model__{k}':v for k,v in hyperparameter.items()}
    grid = GridSearchCV(estimator= pipe, param_grid=param_grid, cv =5, scoring= 'accuracy', n_jobs=-1)
    grid.fit(X_train,y_train)

    y_pred = grid.predict(X_test)
    report = metrics.classification_report(y_test,y_pred, output_dict= True) # output_dict = True because the result will come in dictionary
    
    result.append({
        'model_name': name, 
        'best_parameters': param_grid,
        'accuracy': round(metrics.accuracy_score(y_test,y_pred), 4),
        'f1-score': round(report['weighted avg']['f1-score'], 4)
    })

    best_pipeline[name] = grid.best_estimator_

Tuning LogisticRegression(max_iter=1000)...
Tuning SVC(probability=True)...
Tuning RandomForestClassifier()...
Tuning AdaBoostClassifier()...
Tuning XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, ...)...


In [24]:
print(result)

[{'model_name': 'LogisticRegression', 'best_parameters': {'model__C': [0.1, 1, 10]}, 'accuracy': 0.36, 'f1-score': 0.3593}, {'model_name': 'SVM', 'best_parameters': {'model__C': [0.1, 1, 10]}, 'accuracy': 0.3, 'f1-score': 0.301}, {'model_name': 'RandomForestClassifier', 'best_parameters': {'model__n_estimators': [50, 100, 200], 'model__max_depth': [None, 5, 10]}, 'accuracy': 0.38, 'f1-score': 0.377}, {'model_name': 'AdaBoostClassifier', 'best_parameters': {'model__n_estimators': [50, 100, 200], 'model__learning_rate': [0.5, 1]}, 'accuracy': 0.42, 'f1-score': 0.4157}, {'model_name': 'XGBoost', 'best_parameters': {'model__n_estimators': [50, 100, 200], 'model__max_depth': [None, 5, 10], 'model__learning_rate': [0.5, 1]}, 'accuracy': 0.38, 'f1-score': 0.3813}]


In [25]:
# Compare the models
result_df = pd.DataFrame(result)
sorted_result_df = result_df.sort_values(by = 'accuracy', ascending = False)
print('\nModel Comparison: \n', sorted_result_df)

# Best Model
best_row = sorted_result_df.iloc[0]
best_model = best_row['model_name']
print('\nBest Model:', best_model)
print('\nBest Hyperparameter:', best_row['best_parameters'])


Model Comparison: 
                model_name                                    best_parameters  \
3      AdaBoostClassifier  {'model__n_estimators': [50, 100, 200], 'model...   
4                 XGBoost  {'model__n_estimators': [50, 100, 200], 'model...   
2  RandomForestClassifier  {'model__n_estimators': [50, 100, 200], 'model...   
0      LogisticRegression                         {'model__C': [0.1, 1, 10]}   
1                     SVM                         {'model__C': [0.1, 1, 10]}   

   accuracy  f1-score  
3      0.42    0.4157  
4      0.38    0.3813  
2      0.38    0.3770  
0      0.36    0.3593  
1      0.30    0.3010  

Best Model: AdaBoostClassifier

Best Hyperparameter: {'model__n_estimators': [50, 100, 200], 'model__learning_rate': [0.5, 1]}


In [26]:
# Retraining best model on full dataset
final_pipeline = best_pipeline[best_model]
final_pipeline.fit(X,y)

# Save Pipeline
joblib.dump(final_pipeline, 'Status_pipeline.pkl')
print('The Status_pipeline.pkl is saved successfully')

The Status_pipeline.pkl is saved successfully
