# Backorder Prediction

## Load libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold,train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import time

In [None]:
# Load the Kaggle training and test CSV files. low_memory=False makes pandas
# read each file in a single pass rather than in chunks.
train_df, test_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "./dataset/Kaggle_Training_Dataset_v2.csv",
        "./dataset/Kaggle_Test_Dataset_v2.csv",
    )
)

## Preprocessing

In [None]:
def process(df):
    """Clean and encode the raw backorder data frame, modifying it in place.

    Steps, in order:

    1. Drop the ``sku`` column and every row whose values are all missing.
    2. Replace NaN values in ``lead_time`` with the column median.
    3. Replace the ``-99`` placeholder in ``perf_6_month_avg`` and
       ``perf_12_month_avg`` with the median of the remaining values.
    4. Encode every object-typed column as 1 where the value equals
       ``'Yes'`` and 0 otherwise.
    5. Rescale the remaining numeric columns row by row with
       ``sklearn.preprocessing.normalize`` (``axis=1``).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing a ``sku`` column. The frame is mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after processing.
    """
    df.drop('sku', axis=1, inplace=True)
    df.dropna(how='all', inplace=True)

    # Column kinds are decided once, before any column is rewritten.
    numerical_cols = [col for col in df.columns if df[col].dtype != 'object']
    categorical_cols = [col for col in df.columns if col not in numerical_cols]

    placeholder_cols = ['perf_6_month_avg', 'perf_12_month_avg']

    for col in df.columns:
        # Exact comparison: ``in`` against a string is a substring test and
        # would also match column names such as 'lead' or 'time'.
        if col == 'lead_time':
            imputer = SimpleImputer(missing_values=np.nan, strategy='median')
            # ravel() turns the (n, 1) imputer output into a plain 1-D column.
            df[col] = imputer.fit_transform(df[col].values.reshape(-1, 1)).ravel()
        elif col in placeholder_cols:
            imputer = SimpleImputer(missing_values=-99, strategy='median')
            df[col] = imputer.fit_transform(df[col].values.reshape(-1, 1)).ravel()
        elif col in categorical_cols:
            df[col] = (df[col] == 'Yes').astype(int)

    # Normalize attributes related to quantities; the imputed columns above
    # are left on their own scale.
    norm_cols = [col for col in numerical_cols
                 if col not in ['lead_time'] + placeholder_cols]

    # normalize() rejects an input with zero feature columns, so skip the
    # step when nothing is left to rescale.
    if norm_cols:
        df[norm_cols] = normalize(df[norm_cols], axis=1)

    return df
    

In [None]:
# Stack the training and test frames (original row labels are kept) and clean
# them together. pd.concat is used because DataFrame.append no longer exists
# in pandas 2.0 and later.
data = process(pd.concat([train_df, test_df]))

In [None]:
# Display the first rows of the processed frame as a quick sanity check.
data.head()

In [None]:
# Display the (rows, columns) size of the processed frame.
data.shape

## Visualization

In [None]:
def plot(X, y, title=''):
    """Scatter-plot the first two principal components of ``X``, split by label.

    ``X`` is standardized with ``StandardScaler`` and projected onto two
    components with ``PCA``. Points whose label equals 0 are drawn as hollow
    blue circles ("Negative"); points whose label equals 1 are drawn as
    orange stars ("Positive"). The figure is shown immediately.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Labels aligned with the rows of ``X``; only the values 0 and 1 are
        plotted.
    title : str, optional
        Extra heading placed on its own line above the explained-variance
        text. When empty (the default) only the explained-variance text is
        shown.
    """
    # Boolean masks below need element-wise comparison, which a plain list
    # would not provide.
    y = np.asarray(y)

    X_std = StandardScaler().fit_transform(X)
    dec = PCA(n_components=2)
    X_reduced = dec.fit_transform(X_std)

    negatives = y == 0
    positives = y == 1

    f, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(X_reduced[negatives, 0], X_reduced[negatives, 1], alpha=0.5,
               facecolors='none', edgecolors='cornflowerblue', label="Negative")
    ax.scatter(X_reduced[positives, 0], X_reduced[positives, 1], c='darkorange',
               marker='*', label='Positive')

    # Share of total variance captured by the two plotted components.
    variance_text = "Explained Variance ratio: %.2f%%" % (100*dec.explained_variance_ratio_.sum())
    ax.set_title("%s\n%s" % (title, variance_text) if title else variance_text)

    ax.legend(loc='lower left')
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    plt.show()

In [None]:
# Visualize a reproducible 6000-row subsample: every column but the last is
# treated as a feature, the last column as the label.
sample = data.sample(n=6000, random_state=56)
X_sample, y_sample = sample.iloc[:, :-1].values, sample.iloc[:, -1].values
plot(X_sample, y_sample)

## Split into train and test data

In [None]:
# Feature matrix = all columns except the last; label vector = last column.
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

## Standardize training set

In [None]:
# Fit the scaler on the training features only and standardize them in the
# same step; `scaler` stays available for transforming other splits.
scaler = StandardScaler()
X_std = scaler.fit_transform(X_train)

## Baseline model

In [None]:
# Baseline: logistic regression with library defaults, trained on the
# standardized training features.
lg = LogisticRegression()
lg.fit(X=X_std, y=y_train)

In [None]:
# The baseline was fitted on standardized features, so the test features must
# pass through the same fitted scaler before prediction.
preds = lg.predict(scaler.transform(X_test))
accuracy_score(y_test, preds)

## Model Selection

In [None]:
# Number of repeated train/test splits per estimator.
n_runs = 5
# Average-precision scorer driven by continuous decision scores.
# NOTE(review): `needs_threshold` was superseded by `response_method` in newer
# scikit-learn releases -- confirm against the installed version.
scorer = make_scorer(average_precision_score, needs_threshold=True, average='micro')

# Hyper-parameters shared by the candidate estimators below.
min_samples_leaf = 5
n_estimators = 10
criterion = 'entropy'
# Tree depths explored by every grid search.
max_depth = [3, 4, 5, 7, 10, 15, 20, 30, 50]
# Number of cross-validation folds inside each grid search.
n_folds = 5

# Each entry is (short name, estimator or pipeline, parameter grid).
estimators = [
#     ("lgst", LogisticRegression(), {'C': np.logspace(0, 3, 4), 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}),
    ('cart',
     DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, criterion=criterion),
     {'max_depth': max_depth, 'criterion': ['gini', 'entropy']}),
    ('rus',
     Pipeline([('res', RandomUnderSampler()),
               ('tree', DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, criterion=criterion))]),
     {'tree__max_depth': max_depth}),
    ('smt',
     Pipeline([('res', SMOTE()),
               ('tree', DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, criterion=criterion))]),
     {'tree__max_depth': max_depth}),
    ('rf',
     RandomForestClassifier(n_estimators=n_estimators, min_samples_leaf=min_samples_leaf, criterion=criterion),
     {'max_depth': max_depth}),
    ('gb',
     GradientBoostingClassifier(n_estimators=n_estimators, min_samples_leaf=min_samples_leaf),
     {'max_depth': max_depth}),
]

In [None]:
# Tune and evaluate every candidate estimator over `n_runs` independent
# stratified train/test splits, building one result record per run.
for est_name, est, params in estimators:
    # Result records for the current estimator only (reset on every outer iteration).
    matrix = []
    # Start time for this estimator; not read again in the lines shown here.
    t0 = time.time()
    
    for run in range(n_runs):
        # Fresh stratified 85/15 split for each run, seeded by the run index.
        X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, stratify=y, test_size=0.15, random_state=run)
#         scaler1 = StandardScaler().fit(X_train1)
#         X_std1 = scaler1.transform(X_train1)
        # Shuffled stratified folds for the grid search, seeded from the run index.
        kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=int(run*9))
        gs = GridSearchCV(est, params, cv=kf, scoring=scorer, verbose=0, n_jobs=1)
        
        t1 = time.time()
        gs.fit(X_train1, y_train1)

        # Column 1 of predict_proba (the positive class when labels are 0/1).
        # y_prob0 (training split) is computed but not used in the lines shown here.
        y_prob0 = gs.best_estimator_.predict_proba(X_train1)[:, 1]
        y_prob = gs.best_estimator_.predict_proba(X_test1)[:, 1]
        
        # Hold-out ROC AUC and average precision of the best estimator found.
        roc = roc_auc_score(y_test1, y_prob)
        pr = average_precision_score(y_test1, y_prob)
        
        # Elapsed time covers the grid-search fit, both predict_proba calls and
        # the two metrics; avg_time divides it by the number of CV splits.
        run_time = time.time() - t1
        avg_time = run_time/gs.n_splits_
        
        # Columns: run, estimator name, best CV score, hold-out ROC AUC,
        # hold-out average precision, avg_time, best parameters.
        print ("%i\t%s\t%.4f\t%.4f\t%.4f\t%.2f\t%s" % (run, est_name, 
            gs.best_score_, roc, pr, avg_time, gs.best_params_))

        
        # Read the per-feature weights from whichever attribute this
        # estimator type exposes.
        imp = []
        model = gs.best_estimator_
    
        if est_name in ['rus','smt']:
            # Resampling pipelines: importances live on the step named 'tree'.
            imp = model.named_steps['tree'].feature_importances_
        elif est_name == 'lgst':
            # 'lgst' entry: flattened coefficient array.
            imp = model.coef_.ravel()
        else:
            imp = model.feature_importances_
        
        # One record per (estimator, run).
        matrix.append(
        {   'run'           : run,
            'estimator'     : est_name,         
            'roc'           : roc,
            'pr'            : pr,
            'best_params'   : gs.best_params_, 
            'avg_time'      : avg_time,
            'importance'    : imp,
        })