In [None]:
%run insta_feature_engineering.py
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, f1_score, classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Silence FutureWarning noise so the cell outputs stay readable.
warnings.simplefilter('ignore', FutureWarning)
# Make sure automatic garbage collection is switched on; later cells also call
# gc.collect() explicitly right after deleting large frames.
gc.enable()

In [None]:
# Mean (target) encoding: the average reorder rate per aisle and per department.
# NOTE(review): the rates are computed over all of df_train, so every row's own
# label contributes to its encoded feature — confirm this leakage is acceptable.
aisle_mean = (
    df_train.groupby('aisle_id')['reordered']
    .mean()
    .rename('aisle')
    .reset_index()
)
department_mean = (
    df_train.groupby('department_id')['reordered']
    .mean()
    .rename('department')
    .reset_index()
)

In [None]:
# Attach the aisle/department reorder rates to the test rows (left join keeps
# every test row even when an id has no rate).
df_test = (
    df_test
    .merge(aisle_mean, how='left', on='aisle_id')
    .merge(department_mean, how='left', on='department_id')
)

In [None]:
# Attach the same aisle/department reorder rates to the training rows.
df_train = (
    df_train
    .merge(aisle_mean, how='left', on='aisle_id')
    .merge(department_mean, how='left', on='department_id')
)

In [None]:
# The raw aisle/department ids are now represented by their encoded rates, so
# drop them and key both frames by the (user_id, product_id) pair.
df_test = (
    df_test
    .drop(columns=['aisle_id', 'department_id'])
    .set_index(['user_id', 'product_id'])
)
df_train = (
    df_train
    .drop(columns=['aisle_id', 'department_id'])
    .set_index(['user_id', 'product_id'])
)

In [None]:
# Column counts should match, except that train carries one extra column
# ('reordered', the target).
tuple(frame.shape for frame in (df_test, df_train))

In [None]:
# The lookup tables have been merged into both frames; release them.
del aisle_mean
del department_mean
gc.collect()

In [None]:
# Visual check for correlated features on the first 10,000 training rows.
# last_five_up and ratio_last_five_up look collinear with other features, so
# the models are run both with and without them.
sns.pairplot(df_train.iloc[:10_000])

# X and y Variables for models

In [None]:
# Base feature matrix and target reused throughout the rest of the notebook.
y = df_train['reordered']
X = df_train.drop(columns='reordered')

In [None]:
# X and y were derived from df_train in the previous cell; drop the original
# name and trigger a collection so unreferenced memory can be reclaimed.
del df_train
gc.collect()

# Balancing the target class with SMOTE

In [None]:
# Oversample the minority class with SMOTE so both target classes are equally
# represented before modelling.
smote = SMOTE(random_state = 13)
# fit_resample is the current imbalanced-learn API; the older fit_sample alias
# was deprecated and later removed.
smote_X, smote_y = smote.fit_resample(X, y)
# Re-wrap as DataFrames so the column names are available whether the sampler
# returned NumPy arrays or pandas objects.
smote_X = pd.DataFrame(smote_X, columns = X.columns)
smote_y = pd.DataFrame(smote_y, columns = ['reordered'])

# Check the class counts and proportions after resampling (counted once and
# reused for all of the printed figures).
n_rows = len(smote_X)
n_not_reordered = int((smote_y['reordered'] == 0).sum())
n_reordered = int((smote_y['reordered'] == 1).sum())
print("length is", n_rows)
print("Number of  not reordered", n_not_reordered)
print("Number of reordered", n_reordered)
print("Proportion of not reordered ", n_not_reordered / n_rows)
print("Proportion of reordered", n_reordered / n_rows)
# the classes should now be balanced with equal proportions

# Logistic Regression

In [None]:
# Hold out 20% of the resampled data for validation.
# NOTE(review): the split happens after SMOTE, so synthetic rows can land in
# the hold-out part — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    smote_X,
    smote_y,
    test_size=.2,
    random_state=13,
)

In [None]:
# StandardScaler instance shared by the modelling cells below; it is fitted on
# the training split only and then applied to the hold-out split.
scaler = StandardScaler()

In [None]:
# Standardise the features (helps the solvers converge): learn the scaling on
# the training split, then apply the same scaling to the hold-out split.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit a statsmodels logit on the scaled training split to inspect coefficients
# and p-values (a second look at feature importance).
# NOTE(review): X_train is passed as-is and no constant column is added here,
# so unless one is already among the features the model has no intercept —
# confirm that is intended.
import statsmodels.api as sm
logit_model=sm.Logit(y_train, X_train)
result=logit_model.fit(maxiter = 200)
print(result.summary2())
# Author's observations from the run:
# - statsmodels failed to converge, attributed to the covariance between the two
#   last-five features; this supports removing them.
# - the summary shows last_five and last_five ratio with p-values above 5%,
#   i.e. not significant.
# - most_hour just meets the 5% threshold, so it is retained.

In [None]:
# Base estimator for the grid search.
# The search grid includes an 'l1' penalty, which scikit-learn's default
# 'lbfgs' solver does not support; 'liblinear' handles both 'l1' and 'l2', so
# every candidate in the grid can actually be fitted and scored.
lr1 = LogisticRegression(random_state = 13, solver = 'liblinear')

In [None]:
# Hyperparameter grid for the search: inverse regularisation strength and
# penalty type.
params = dict(
    C=[100_000, 1_000_000, 10_000_000],
    penalty=['l1', 'l2'],
)

In [None]:
# 5-fold cross-validated grid search over `params`, with progress output.
lr1_cv = GridSearchCV(
    estimator=lr1,
    param_grid=params,
    cv=5,
    verbose=1,
)

In [None]:
# Run the search to find the best hyperparameters; the target frame is
# flattened to the 1-D array scikit-learn expects.
lr1_cv.fit(X_train, y_train.values.ravel())

In [None]:
# Report the winning hyperparameter combination and its mean cross-validated
# score (no `scoring` was passed to GridSearchCV, so this is the estimator's
# default score, i.e. accuracy). The tuned model below is configured from these.
print("tuned hpyerparameters :(best parameters) ", lr1_cv.best_params_)
print("accuracy :", lr1_cv.best_score_)

# Run tuned Logit

In [None]:
# Remove the two features judged insignificant and collinear in the checks above.
important_X = smote_X.drop(columns=['last_five_up', 'ratio_last_five_up'])

In [None]:
# Same 80/20 split and seed as before, now on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    important_X,
    smote_y,
    test_size=.2,
    random_state=13,
)

In [None]:
# Re-fit the shared scaler on the reduced training split (this replaces the
# scaling learned earlier) and apply it to the hold-out split.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Logistic regression configured with a fixed C and balanced class weights.
# NOTE(review): C is hard-coded rather than read from lr1_cv.best_params_ —
# presumably copied from the grid-search result; confirm.
tune_lr = LogisticRegression(
    C=1_000_000,
    class_weight='balanced',
    random_state=13,
)

In [None]:
# Fit the tuned model on the scaled training split (target flattened to 1-D).
tune_lr.fit(X_train, y_train.values.ravel())

In [None]:
# Hold-out predictions used for validation: hard class labels and the
# per-class probabilities.
tune_pred = tune_lr.predict(X_test)
tune_prob = tune_lr.predict_proba(X_test)

In [None]:
# Sanity-check the spread of the predicted reorder probability.
# Only the positive-class column is summarised: each row of predict_proba sums
# to 1, so statistics taken over both columns together are uninformative (the
# mean is always 0.5 and the minimum is just 1 - maximum).
tune_prob[:, 1].min(), tune_prob[:, 1].max(), tune_prob[:, 1].mean()

In [None]:
# Histogram of the predicted class probabilities (these are probabilities from
# predict_proba, not log-odds).
# NOTE(review): tune_prob is passed whole, so if it holds one column per class
# two mirrored series are drawn — confirm whether only the positive class was
# intended.
plt.hist(tune_prob);
plt.title('Probability Distribution')
plt.xlabel('Probability')
plt.ylabel('Frequency')

In [None]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Reds):
    """Draw a confusion matrix as a colour-coded image on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are plotted as true labels and columns as
        predicted labels.
    title : str, optional
        Figure title.
    cmap : matplotlib colormap, optional
        Colormap for the cells (red shades by default).

    Returns
    -------
    None
        The plot is drawn on the current matplotlib figure.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # One tick per row/column so the axes show whole indices instead of
    # fractional image coordinates.
    positions = range(len(cm))
    plt.xticks(positions)
    plt.yticks(positions)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # tight_layout only accounts for artists that already exist, so it must run
    # after the labels are added or they can end up clipped.
    plt.tight_layout()


In [None]:
# Confusion matrix on the hold-out split, plus precision and recall for the
# positive class derived from its cells.
cm1 = confusion_matrix(y_test, tune_pred)

print(cm1)
print('\n')
true_pos, false_pos, false_neg = cm1[1, 1], cm1[0, 1], cm1[1, 0]
print("Precision: %0.2f" % (true_pos / (true_pos + false_pos)))
print("Recall:    %0.2f" % (true_pos / (true_pos + false_neg)))

# Same matrix with the label order pinned to [0, 1], used for the heat-map.
cm2 = confusion_matrix(y_test, tune_pred, labels=[0, 1])

plt.figure()
plot_confusion_matrix(cm2)

In [None]:
# Classification report for the tuned model on the hold-out split — another
# view of the metrics computed from the confusion matrix.
print(classification_report(y_test, tune_pred))

In [None]:
# ROC curve and AUC for the tuned model on the hold-out split.
# roc_curve needs a continuous score to sweep thresholds over; hard 0/1
# predictions yield only a single operating point, so the positive-class
# probability is used instead.
fpr, tpr, thresholds = roc_curve(y_test, tune_prob[:, 1])
roc_auc = auc(fpr, tpr)
plt.figure()
# Diagonal = performance of a random classifier, for reference.
plt.plot([0, 1], [0, 1], linestyle='--', color='black')
plt.plot(fpr, tpr, color='green', label='ROC curve (AUC = %0.3f)' % roc_auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc='lower right')
plt.gca().set_aspect('equal', adjustable='box')

# Creating the submission file

In [None]:
# Prepare the test features exactly as the training features were prepared:
# 1. drop the two features the tuned model was not trained on,
# 2. put the columns in the same order as the training matrix,
# 3. apply the scaler fitted on the training split — tune_lr was trained on
#    standardised inputs, so feeding it raw values gives meaningless
#    probabilities.
log_test = df_test.drop(['last_five_up', 'ratio_last_five_up'], axis = 1)
log_test = log_test[important_X.columns]
log_test = scaler.transform(log_test)

In [None]:
# Hard 0/1 predictions on the test set (predict uses the default 0.5
# probability cut-off).
# NOTE(review): tune_lr was fitted on StandardScaler output, so log_test has to
# be on that same scale when it reaches this call.
logit_final_pred = tune_lr.predict(log_test)

In [None]:
# Per-class probabilities from the model (predict_proba returns probabilities,
# not log-odds).
logit_final_prob = tune_lr.predict_proba(log_test)

In [None]:
# Store the results on df_test so they stay attached to its
# (user_id, product_id) index.
df_test['logit_predicted'] = logit_final_pred.astype('uint8')
# Column 1 of predict_proba is kept — presumably the probability of class 1
# ('reordered'); confirm against tune_lr.classes_.
df_test['logit_probability'] = logit_final_prob[:,1]

In [None]:
# Flatten the index back into columns and keep only the identifiers together
# with the predicted label and probability.
lr_fin = df_test.reset_index().loc[
    :, ['user_id', 'product_id', 'logit_predicted', 'logit_probability']
]
lr_fin.info()

In [None]:
# Look up the order_id of each user's test-set order.
orders_test = orders.loc[orders['eval_set'].eq('test'), ['user_id', 'order_id']]

In [None]:
# Add order_id to every prediction row; this is the frame the submission file
# is built from.
lr_fin = pd.merge(lr_fin, orders_test, how='left', on='user_id')
lr_fin.head()

In [None]:
# Build the submission mapping: order_id -> space-separated product ids whose
# predicted reorder probability clears the decision threshold.
# The threshold was tuned by hand (raised to .7 and lowered below 0.5);
# 0.58 delivered the best F1 score.
THRESHOLD = 0.58

# Collect products per order in lists and join once at the end, rather than
# growing strings and relying on a bare `except:` to detect first occurrences.
picked = {}
for order_id, product_id, probability in zip(
    lr_fin.order_id, lr_fin.product_id, lr_fin.logit_probability
):
    if probability >= THRESHOLD:
        picked.setdefault(order_id, []).append(str(product_id))

d = {order_id: ' '.join(products) for order_id, products in picked.items()}

# Orders with no product above the threshold must still appear in the
# submission, marked with the literal string 'None'.
for order_id in lr_fin.order_id.unique():
    d.setdefault(order_id, 'None')

# Inspect a few entries instead of dumping the whole dictionary.
dict(list(d.items())[:5])

In [None]:
# Turn the order_id -> products mapping into a two-column submission frame:
# the dictionary keys become 'order_id', the values become 'products'.
sub = (
    pd.Series(d, name='products')
    .rename_axis('order_id')
    .reset_index()
)

sub.head()

In [None]:
# Write the submission to CSV with a header row and without the DataFrame index.
sub.to_csv('logitfinal_submission.csv', index=False, header=True)