# Cross Sale Analysis - Predicting New Products 

## 1. Importing Required Libraries

In [None]:
import pandas as pd # For Data Manipulation
import numpy as np # For Array Operations
import matplotlib.pyplot as plt # For Visualization Using Matplot
import seaborn as sns # For Visualization Using Seaborn
from sklearn.model_selection import train_test_split # For Spliiting Dataset Into Test And Training
from sklearn.preprocessing import StandardScaler # For Scaling
from sklearn.naive_bayes import MultinomialNB # For creating a multinomial naive bayes model
from sklearn.naive_bayes import ComplementNB # For creating a compliment naive bayes model
from sklearn.naive_bayes import GaussianNB # For creating a gaussian naive bayes model
from sklearn.naive_bayes import BernoulliNB # For creating a bernoulli naive bayes model
from sklearn.linear_model import LogisticRegression, SGDClassifier # For creating a logistic regression and a stochastic gradient descriptionent model
from sklearn.svm import LinearSVC # For creating a svc model
from sklearn.tree import DecisionTreeClassifier # For creating a decision tree model
from sklearn.ensemble import RandomForestClassifier # For creating a random forest tree model
from sklearn.neighbors import KNeighborsClassifier # For knn
from sklearn.preprocessing import MinMaxScaler # For scaling
from sklearn.pipeline import Pipeline # For creating pipeline
from sklearn.feature_selection import RFE # For feature selection
from sklearn import metrics # For performance evaluation
from sklearn.tree import DecisionTreeClassifier # For Decision Tree
from sklearn.ensemble import RandomForestClassifier # For Random Forest
from sklearn.model_selection import GridSearchCV # For Cross Validation
from sklearn.metrics import accuracy_score, f1_score, auc # For Accuracy Score
from sklearn.metrics import confusion_matrix # For True and False Positive Values
from datetime import datetime # For Datetime Conversion
from sklearn.preprocessing import OneHotEncoder # For handling categorical variables using one hot encoder
from xgboost import XGBClassifier # For xg boost model
from numpy import set_printoptions # For numpy array operations
from sklearn.feature_selection import SelectKBest # For feature selection
from sklearn.feature_selection import f_classif # For feature classification
import pyodbc # For Making A SQL Connection
import heapq # For selecting maximum from a list
import warnings # For Disabling Warnings
warnings.filterwarnings('ignore')

## 2. Data Wrangling

In [None]:
#Extracting the data based on the columns that we need
# index_col=False tells pandas not to use the first CSV column as the row index.
data = pd.read_csv("cp_data.csv", index_col=False)
#data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
# Bare expression: shows the loaded frame when this runs as a notebook cell.
data

In [None]:
# Remove the 'index' column. The keyword form is required: the positional
# axis argument (`drop('index', 1)`) was deprecated in pandas 1.3 and removed in 2.0.
data = data.drop(columns='index')

In [None]:
# Checking for null values
# Per-column count of missing entries.
data.isna().sum()

In [None]:
# Number of columns currently in the frame.
len(data.columns)

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so every column assignment made through full_data is also visible through `data`.
full_data = data

In [None]:
# Bar chart of how many records fall into each target class.
# Series.value_counts replaces the top-level pd.value_counts helper, which is
# deprecated in recent pandas releases.
count_classes = data['target'].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0)
plt.title("Target Class Distribution")
plt.xlabel("Class")
plt.ylabel("Records")

In [None]:
import statsmodels.api as sm
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE

In [None]:
categorical_data = full_data.select_dtypes(exclude=["number","bool"])
print("Numerical Columns: ", len(full_data.columns)-len(categorical_data.columns))
numerical = len(full_data.columns)-len(categorical_data.columns)
print("Categorical Columns: ", len(full_data.columns)-numerical)

In [None]:
categorical_data.columns

In [None]:
# Frequency-encode the categorical columns: every value is replaced by the
# share of rows in which that value occurs. The list order matches the
# historical fe_1 ... fe_37 numbering.
FREQUENCY_ENCODED_COLUMNS = [
    'cp__risk_emp_name',
    'cp__occup_pos_description',
    'cp__eco_sector_description',
    'cp__sub_eco_sector_description',
    'cp__customer_id_type_name',
    'cp__priority_code_description',
    'cp__language',
    'cp__language_name',
    'cp__age_bracket',
    'cp__branch_description',
    'cp__customer_id_credit_level__pf_past_12_month_level',
    'cp__customer_id_credit_level__cc_past_12_month_level',
    'cp__customer_id_credit_level__past_12_month_level',
    'cp_act__min_other_cp_act_date_open',
    'cp_act__max_other_cp_act_date_open',
    'remaining__cp_bank_other',
    'remaining__cp_capital_investment_fund',
    'remaining__cp_capital_total',
    'cp__all_group_description_debit',
    'cp__all_group_description_credit',
    'cp__retiree_group_description_debit',
    'cp__retiree_group_description_credit',
    'cp__all_debit_count_consumption_sector',
    'cp__dc_count_preference_sector',
    'cp_ibmb__eop_month',
    'cp_ibmb__past_visit_device',
    'cp_ibmb__login_in_past_90_days_past_login_date',
    'cp_cc__item_take_l1',
    'cp_pc__item_take_l1',
    'cp_pf__item_take_l1',
    'cp_fd__item_take_l1',
    'cp_sa__item_take_l1',
    'cp_cc__item_take_l2',
    'cp_pc__item_take_l2',
    'cp_pf__item_take_l2',
    'cp_fd__item_take_l2',
    'cp_sa__item_take_l2',
]


def frequency_encode(frame, column):
    """Return ``(frequencies, encoded)`` for one column of ``frame``.

    ``frequencies`` is a Series indexed by the distinct values of
    ``frame[column]`` holding ``group size / len(frame)``.
    ``encoded`` is ``frame[column]`` with each value replaced by that share.
    Values without an entry in ``frequencies`` (groupby skips missing keys)
    come out as NaN.
    """
    frequencies = frame.groupby(column).size() / len(frame)
    return frequencies, frame[column].map(frequencies)


# Per-column frequency tables, keyed by column name.
frequency_tables = {}
for position, column in enumerate(FREQUENCY_ENCODED_COLUMNS, start=1):
    frequencies, encoded = frequency_encode(full_data, column)
    frequency_tables[column] = frequencies
    # Plain column assignment replaces the column and lets it take the float
    # dtype. `full_data.loc[:, column] = ...` writes in place on pandas >= 2.0
    # and can leave the column as object dtype, so it would still be counted
    # as categorical afterwards.
    full_data[column] = encoded
    # Keep the original fe_1 ... fe_37 names for any cell that still uses them.
    globals()["fe_%d" % position] = frequencies

# Show all encoded columns at once (replaces the 37 per-column displays).
full_data[FREQUENCY_ENCODED_COLUMNS]

In [None]:
categorical_data = full_data.select_dtypes(exclude=["number","bool"])
print("Numerical Columns: ", len(full_data.columns)-len(categorical_data.columns))
numerical = len(full_data.columns)-len(categorical_data.columns)
print("Categorical Columns: ", len(full_data.columns)-numerical)

In [None]:
full_data

In [None]:
correlation_data = full_data.loc[:,full_data.columns !='customer_id']
correlation_matrix = correlation_data.corr().abs()

In [None]:
upper_triangular = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape),k=1).astype(np.bool))
upper_triangular

In [None]:
# Flag a column when its absolute correlation with any earlier column is
# above 0.45, or is exactly zero (values are absolute, so `<= 0.0` can only match 0).
columns_to_drop_great = [column for column in upper_triangular.columns if (upper_triangular[column] > 0.45).any()]
columns_to_drop_less = [column for column in upper_triangular.columns if (upper_triangular[column] <= 0.0).any()]
# A column can meet both conditions, so merge without duplicates (order kept);
# otherwise the counts printed below are wrong. The label is never dropped:
# a feature that correlates strongly with 'target' would otherwise remove
# 'target' itself and break the feature/label split that follows.
columns_to_drop = [
    column
    for column in dict.fromkeys(columns_to_drop_great + columns_to_drop_less)
    if column != 'target'
]
columns_to_drop_data = pd.DataFrame()
columns_to_drop_data['column'] = columns_to_drop
columns_to_drop_data.to_csv("columns_to_drop_correlation_matrix.csv")
print("Total columns to drop: ", len(columns_to_drop))
print("Total columns remaining", len(full_data.columns)-len(columns_to_drop))

In [None]:
columns_to_keep = []
for i in full_data.columns:
    if i not in columns_to_drop:
        columns_to_keep.append(i)
print("Total columns kept (including id): ", len(columns_to_keep))

In [None]:
reserved_data = full_data[columns_to_keep]
reserved_data

In [None]:
train = reserved_data.loc[:,reserved_data.columns !='target']
test = reserved_data.loc[:,reserved_data.columns =='target']

In [None]:
train

In [None]:
near_miss = NearMiss()
x_near,y_near = near_miss.fit_sample(train, test)

In [None]:
smote = SMOTE()
x_smote, y_smote = smote.fit_resample(train, test)

In [None]:
# Regress the target on the NearMiss-balanced features.
# statsmodels takes OLS(endog, exog): the response comes first, then the
# design matrix. The original call had them swapped.
ols = sm.OLS(y_near, x_near).fit()
# `summary()` is the results method; `sumonthary` does not exist.
print(ols.summary())

In [None]:
# Score every column of `train` against the label with f_classif and keep the 13 best.
# NOTE(review): `train` includes customer_id here, so the identifier can be
# picked as one of the 13 features. Confirm whether it should be excluded.
select_features = SelectKBest(score_func=f_classif, k=13)
fit = select_features.fit(train, test)
# summarize scores
set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(train)
# summarize selected features
print(features[0:5,:])

In [None]:
cols = select_features.get_support(indices=True)
features_names = reserved_data.iloc[:,cols]
features_names

In [None]:
selected_data = reserved_data[features_names.columns]
selected_data['customer_id'] = reserved_data['customer_id']
selected_data['target'] = reserved_data['target']
selected_data

In [None]:
train = selected_data.loc[:,selected_data.columns !='target']
train = selected_data.loc[:,selected_data.columns !='customer_id']
test = selected_data.loc[:,selected_data.columns =='target']

In [None]:
selected_data

In [None]:
len(test)

In [None]:
train

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
# Rank real predictors only: the label must never be an input, and the
# identifier carries no signal. errors='ignore' keeps this working whichever
# of the two columns `train` happens to contain.
rfecv_features = train.drop(columns=['customer_id', 'target'], errors='ignore')
# scikit-learn expects a 1-D label array, not a single-column frame.
rfecv_labels = test.values.ravel()
dtree = DecisionTreeClassifier()
# "accuracy" is a classification metric; the previous "neg_mean_squared_error"
# is a regression metric.
rfecv = RFECV(estimator=dtree, step=1, scoring="accuracy", cv=4, verbose=1, n_jobs=4)
rfecv.fit(rfecv_features, rfecv_labels)
# The former `rfecv.transform(train)` call was removed: its result was discarded.
print(rfecv)
print(rfecv.n_features_)

In [None]:
test_size = 0.8
X_train, X_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size=test_size, random_state=0)

In [None]:
train_id = X_test['customer_id']
X_train = X_train.loc[:,X_train.columns !='customer_id']
X_test = X_test.loc[:,X_test.columns !='customer_id']

In [None]:
#rfe = RFE(estimator=LogisticRegression(), step=1)
#rfe = rfe.fit(X_train, y_train)

In [None]:
#selected_rfe_features = pd.DataFrame({'column':list(X_train.columns), 'rank':list(rfe.ranking_)})
#selected_rfe_features.sort_values(by='rank')

In [None]:
#extract_rfe_features = selected_rfe_features.loc[selected_rfe_features['rank'] == 1]
#extract_rfe_features

In [None]:
#x_train_rfe = rfe.transform(X_train)
#x_test_rfe = rfe.transform(X_test)

In [None]:
# Fit a liblinear logistic regression and score it on the held-out rows.
logistic_regression_model = LogisticRegression(solver='liblinear', random_state=0)
logistic_regression_model.fit(X_train, y_train)
logistic_regression_predicted = logistic_regression_model.predict(X_test)
logistic_regression_probability = logistic_regression_model.predict_proba(X_test)
logistic_regression_accuracy = accuracy_score(y_test, logistic_regression_predicted)
logistic_regression_model_accuracy = logistic_regression_model.score(X_test, y_test)
# Scale to percent before rounding: `round(x, 2)*100` yields float artifacts
# such as 56.99999999999999.
logistic_regression_model_accuracy = round(logistic_regression_model_accuracy * 100, 2)
print("Logistic Regression Accuracy: %.2f%%" % (logistic_regression_accuracy * 100.0))
print("Model accuracy:", logistic_regression_model_accuracy, "%")

In [None]:
logistic_regression_probability_data = pd.DataFrame(logistic_regression_probability)
logistic_regression_probability_data

In [None]:
logistic_regression_data = pd.concat([X_test, y_test], axis=1)
logistic_regression_data = logistic_regression_data.reset_index(drop=True)
logistic_regression_data["predicted"] = logistic_regression_predicted
logistic_regression_data["probability_0"] = logistic_regression_probability_data[0]
logistic_regression_data["probability_1"] = logistic_regression_probability_data[1]
logistic_regression_data['decile_rank'] = pd.cut(logistic_regression_data['probability_0'], 10, labels=False)
logistic_regression_data['customer_id'] = len(list(train_id.values))
logistic_regression_data.to_csv("logistic_regression_data.csv")

In [None]:
logistic_regression_data.decile_rank.value_counts()

In [None]:
len(logistic_regression_data)

In [None]:
predicted_count = pd.DataFrame(logistic_regression_predicted.flatten())
predicted_count[0].value_counts()

In [None]:
# Scatter the true class of each held-out row against the predicted
# probability of class 1. The original cell used `true_value` and
# `predicted_value` without defining them.
# NOTE(review): assumes 'target' is numeric (0/1). Confirm.
true_value = np.ravel(y_test)
predicted_value = logistic_regression_probability[:, 1]

plt.figure(figsize=(10,10))
plt.scatter(true_value, predicted_value, c='crimson')
# Axes stay linear: labels and probabilities include 0, which a log scale cannot show.

# Diagonal reference line spanning the full value range.
p1 = max(max(predicted_value), max(true_value))
p2 = min(min(predicted_value), min(true_value))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()

In [None]:
from sklearn.feature_selection import RFE
# Recursive feature elimination down to 20 features, followed by a logistic regression.
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=20)
model = LogisticRegression()
pipeline = Pipeline(steps=[('s',rfe),('m',model)])
pipeline.fit(X_train, y_train)
pipeline_predicted = pipeline.predict(X_test)
pipeline_probability = pipeline.predict_proba(X_test)
pipeline_accuracy = accuracy_score(y_test, pipeline_predicted)
pipeline_model_accuracy = pipeline.score(X_test, y_test)
# Scale to percent before rounding to avoid float artifacts from `round(x, 2)*100`.
pipeline_model_accuracy = round(pipeline_model_accuracy * 100, 2)
print("Logistic Regression Accuracy: %.2f%%" % (pipeline_accuracy * 100.0))
print("Model accuracy:", pipeline_model_accuracy, "%")

In [None]:
from sklearn.feature_selection import RFE
# Recursive feature elimination down to 20 features, followed by an XGBoost classifier.
rfe = RFE(estimator=XGBClassifier(), n_features_to_select=20)
model = XGBClassifier()
pipeline = Pipeline(steps=[('s',rfe),('m',model)])
pipeline.fit(X_train, y_train)
pipeline_predicted = pipeline.predict(X_test)
pipeline_probability = pipeline.predict_proba(X_test)
pipeline_accuracy = accuracy_score(y_test, pipeline_predicted)
pipeline_model_accuracy = pipeline.score(X_test, y_test)
# Scale to percent before rounding to avoid float artifacts from `round(x, 2)*100`.
pipeline_model_accuracy = round(pipeline_model_accuracy * 100, 2)
# The label was copy-pasted from the logistic-regression cell; this pipeline is XGBoost.
print("XGBoost (RFE) Accuracy: %.2f%%" % (pipeline_accuracy * 100.0))
print("Model accuracy:", pipeline_model_accuracy, "%")

In [None]:
reserved_data.target.value_counts()

In [None]:
predicted_count = pd.DataFrame(pipeline_predicted.flatten())
predicted_count[0].value_counts()

In [None]:
pipeline_probability_data = pd.DataFrame(pipeline_probability)
pipeline_probability_data

In [None]:
pipeline_data = pd.concat([X_test, y_test], axis=1)
pipeline_data = pipeline_data.reset_index(drop=True)
pipeline_data["predicted"] = pipeline_predicted
pipeline_data["probability_0"] = pipeline_probability_data[0]
pipeline_data["probability_1"] = pipeline_probability_data[1]
pipeline_data['decile_rank'] = pd.cut(pipeline_data['probability_0'], 10, labels=False)
pipeline_data['customer_id'] = len(list(train_id.values))
pipeline_data.to_csv("logistic_regression_data.csv")

In [None]:
pipeline_data.decile_rank.value_counts()

In [None]:
test_size = 0.8
X_train, X_test, y_train, y_test = train_test_split(train, test, test_size=test_size, random_state=0)

In [None]:
train_id = X_test['customer_id']
X_train = X_train.loc[:,X_train.columns !='customer_id']
X_test = X_test.loc[:,X_test.columns !='customer_id']

In [None]:
# Fit a default XGBoost classifier and score it on the held-out rows.
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
xgb_predicted = xgb_model.predict(X_test)
xgb_probability = xgb_model.predict_proba(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_predicted)
xgb_model_accuracy = xgb_model.score(X_test, y_test)
# Scale to percent before rounding to avoid float artifacts from `round(x, 2)*100`.
xgb_model_accuracy = round(xgb_model_accuracy * 100, 2)
# Same reporting convention as the logistic-regression cell: the first line
# uses accuracy_score (previously computed but never printed), the second the
# model's own score, both in percent.
print("XG Boost Accuracy: %.2f%%" % (xgb_accuracy * 100.0))
print("Model accuracy:", xgb_model_accuracy, "%")

In [None]:
xgb_probability_data = pd.DataFrame(xgb_probability)
xgb_probability_data

In [None]:
xgb_data = pd.concat([X_test, y_test], axis=1)
xgb_data = xgb_data.reset_index(drop=True)
xgb_data["predicted"] = xgb_predicted
xgb_data["probability_0"] = xgb_probability_data[0]
xgb_data["probability_1"] = xgb_probability_data[1]
xgb_data['decile_rank'] = pd.qcut(xgb_data['probability_0'], 10, labels=False)
xgb_data['customer_id'] = len(list(train_id.values))
xgb_data.to_csv("xgb_data.csv")

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale the features into [0, 1] (MultinomialNB rejects negative
# inputs), then fit a multinomial naive Bayes classifier.
naive_bayes_model = Pipeline([('Normalizing',MinMaxScaler()),('MultinomialNB',MultinomialNB())])
naive_bayes_model.fit(X_train, y_train)
naive_bayes_predicted = naive_bayes_model.predict(X_test)
naive_bayes_probability = naive_bayes_model.predict_proba(X_test)
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_predicted)
naive_bayes_model_accuracy = naive_bayes_model.score(X_test, y_test)
# Scale to percent before rounding to avoid float artifacts from `round(x, 2)*100`.
naive_bayes_model_accuracy = round(naive_bayes_model_accuracy * 100, 2)
# Both lines now print percentages; the original second line printed the raw
# 0-1 fraction followed by a "%" sign.
print("Naive Bayes Accuracy: %.2f%%" % (naive_bayes_accuracy * 100.0))
print("Model accuracy:", naive_bayes_model_accuracy, "%")

In [None]:
naive_bayes_probability_data = pd.DataFrame(naive_bayes_probability)
naive_bayes_probability_data

In [None]:
naive_bayes_data = pd.concat([X_test, y_test], axis=1)
naive_bayes_data = naive_bayes_data.reset_index(drop=True)
naive_bayes_data["predicted"] = naive_bayes_predicted
naive_bayes_data["probability_0"] = naive_bayes_probability_data[0]
naive_bayes_data["probability_1"] = naive_bayes_probability_data[1]
naive_bayes_data['decile_rank'] = pd.qcut(naive_bayes_data['probability_0'], 10, labels=False)
naive_bayes_data['customer_id'] = len(list(train_id.values))
naive_bayes_data.to_csv("naive_bayes_data.csv")

In [None]:
# Min-max scale the features, then fit a default k-nearest-neighbours classifier.
k_nearest_neighbor_model = Pipeline([('Normalizing',MinMaxScaler()),('KNeighborsClassifier',KNeighborsClassifier())])
k_nearest_neighbor_model.fit(X_train, y_train)
k_nearest_neighbor_predicted = k_nearest_neighbor_model.predict(X_test)
k_nearest_neighbor_probability = k_nearest_neighbor_model.predict_proba(X_test)
k_nearest_neighbor_accuracy = accuracy_score(y_test, k_nearest_neighbor_predicted)
k_nearest_neighbor_model_accuracy = k_nearest_neighbor_model.score(X_test, y_test)
# Scale to percent before rounding to avoid float artifacts from `round(x, 2)*100`.
k_nearest_neighbor_model_accuracy = round(k_nearest_neighbor_model_accuracy * 100, 2)
# Both lines now print percentages; the original second line printed the raw
# 0-1 fraction followed by a "%" sign.
print("K Nearest Neighbor Accuracy: %.2f%%" % (k_nearest_neighbor_accuracy * 100.0))
print("Model accuracy:", k_nearest_neighbor_model_accuracy, "%")

In [None]:
k_nearest_neighbor_probability_data = pd.DataFrame(k_nearest_neighbor_probability)
k_nearest_neighbor_probability_data

In [None]:
k_nearest_neighbor_data = pd.concat([X_test, y_test], axis=1)
k_nearest_neighbor_data = k_nearest_neighbor_data.reset_index(drop=True)
k_nearest_neighbor_data["predicted"] = k_nearest_neighbor_predicted
k_nearest_neighbor_data["probability_0"] = k_nearest_neighbor_probability_data[0]
k_nearest_neighbor_data["probability_1"] = k_nearest_neighbor_probability_data[1]
k_nearest_neighbor_data['decile_rank'] = pd.cut(k_nearest_neighbor_data['probability_0'], 10, labels=False)
k_nearest_neighbor_data['customer_id'] = len(list(train_id.values))
k_nearest_neighbor_data.to_csv("k_nearest_neighbor_data.csv")