In [None]:
# %% libraries
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from io import StringIO 
from IPython.display import Image 
from pydot import graph_from_dot_data


In [None]:
# Location of the pricing/promo extract. Set the PRICING_PROMO_CSV environment
# variable to read the file from another location; when it is unset the
# original local path is used, so existing runs are unaffected.
DATA_PATH = os.environ.get(
    "PRICING_PROMO_CSV",
    'C:\\Users\\DKici\\Documents\\PricingPromo\\data\\pricing_promo_2019_2021_all.csv',
)
data = pd.read_csv(DATA_PATH)
# "Unnamed: 0" is presumably a saved index column (confirm against the export
# step); errors="ignore" keeps the cell working when the file does not have it.
data = data.drop(columns="Unnamed: 0", errors="ignore")
data.head(2)

In [None]:
# Print every column together with its positional index.
for position, column_name in enumerate(data.columns):
    print(position, column_name)

In [None]:
# EXCLUDE COVID DATA
# data = data.drop(columns = data.columns[80:-4], axis = 1)
# data.head()

In [None]:
data.isna().sum().sum()

In [None]:
data.info()

In [None]:
feature = "Traffic"
# print(data[feature])

In [None]:
# Convert the target column to a numeric dtype; entries that cannot be parsed
# become NaN because of errors='coerce'.
numeric_feature = pd.to_numeric(data[feature], errors='coerce')
data[feature] = numeric_feature
data[feature]

In [None]:
# Bin edges for turning the numeric target into 7 ordinal classes.
# pd.cut uses right-closed intervals, so class 1 is (-1, 40000], class 2 is
# (40000, 55000], and so on. The last bin is open-ended so that a value above
# any fixed ceiling still lands in the top class instead of becoming NaN.
bins = [-1, 40000, 55000, 70000, 85000, 100000, 130000, float("inf")]
# Class labels, one per interval (len(names) == len(bins) - 1).
names = [1, 2, 3, 4, 5, 6, 7]

In [None]:
data[feature].max(), data[feature].max()+1000

In [None]:
# Bucket the target column into the labelled intervals and store the result
# as a categorical "Range" column.
binned_feature = pd.cut(data[feature], bins=bins, labels=names)
data["Range"] = binned_feature

data["Range"] = pd.Categorical(data["Range"])
print(data["Range"].unique())

In [None]:
# Count the rows in each class and draw the distribution as a bar chart.
unique, counts = np.unique(data["Range"], return_counts=True)

ax = plt.gca()
ax.bar(unique, counts)
ax.set_title('Class Frequency')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')

plt.show()

In [None]:
data.isna().sum().sum()

In [None]:
# data = data.dropna()
# data.isna().sum().sum()

In [None]:
data[["Traffic", "Range"]]

In [None]:
print(data["Range"].unique())

In [None]:
data[feature].max(),data["Range"].max()

# Train - Test Split

In [None]:
# Columns left out of the predictor matrix: the date, the raw target
# ("Traffic"), its binned label ("Range") and three further columns the
# original analysis excluded.
NON_FEATURE_COLUMNS = ["Date", "Traffic", "Margin", "WrittenSales", "FinancedAmount", "Range"]

# Rows without a class label cannot be used for training. "Range" is NaN when
# Traffic failed numeric coercion or fell outside the bins, and casting such a
# column to int raises. With no missing labels this is a no-op.
labelled = data.dropna(subset=["Range"])

y = labelled["Range"].astype(int).values

# Keep the feature frame (column names preserved) and derive the plain array
# from it so both always describe the same rows and columns.
X_df = labelled.drop(columns=NON_FEATURE_COLUMNS)

X = X_df.values

In [None]:
y.shape,X.shape

In [None]:
# # %% train test split
from sklearn.model_selection import train_test_split
from collections import Counter

# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100
)

# Per-class counts in each split.
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

# Shapes of the four resulting arrays.
for split_name, split_array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"Number transactions {split_name} dataset: ", split_array.shape)


## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Oversampling

In [None]:
# from imblearn.over_sampling import SMOTE

# print("Before OverSampling, counts of labels': {}".format(Counter(y_train)))

# oversample = SMOTE()
# X_train_res, y_train_res = oversample.fit_resample(X_train, y_train.ravel())

In [None]:
# print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
# print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

# print("After OverSampling, counts of label '1': {}".format(Counter(y_train_res)))

## Applying PCA

In [None]:
from sklearn.decomposition import PCA
# Reduce the scaled features to 2 principal components.
pca = PCA(n_components = 2)
# The projection is fitted on the training split only ...
X_train = pca.fit_transform(X_train)
# ... and the test split is mapped with that same fitted projection.
# NOTE: X_train / X_test are overwritten in place, so re-running this cell
# without re-running the cells above would apply PCA to already-reduced data.
X_test = pca.transform(X_test)

# Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report

from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [None]:
# Baseline model: k-nearest neighbours with a fixed k, scored on the test split.
baseline_k = 3
knn = KNeighborsClassifier(n_neighbors=baseline_k)  # n_neighbors = k
knn.fit(X_train, y_train)
prediction = knn.predict(X_test)
baseline_accuracy = knn.score(X_test, y_test)
print(" {} nn score: {} ".format(baseline_k, baseline_accuracy))

In [None]:
knn.get_params().keys()

In [None]:
# Hyper-parameter grid for the KNN search.
# `leaf_size` is intentionally not searched: per the scikit-learn docs it only
# affects the speed and memory of the underlying BallTree/KDTree, not which
# neighbours are returned. Sweeping it multiplied the number of candidates by
# 49 without changing any score.
param_dict = {
    "n_neighbors": list(range(1, 300)),  # number of neighbours k
    "p": [1, 2],                         # Minkowski power: 1 = Manhattan, 2 = Euclidean
}

In [None]:
# Exhaustive search over param_dict with 5-fold cross-validation, using the
# baseline knn estimator as the template.
clf = GridSearchCV(estimator=knn, param_grid=param_dict, cv=5)

clf.fit(X_train, y_train.ravel())


In [None]:
clf.best_params_

In [None]:
clf.best_estimator_

In [None]:
# Build a fresh classifier from the parameters the search selected and fit it
# on the full training split.
best_knn_params = clf.best_params_
knn1 = KNeighborsClassifier(**best_knn_params)

knn1.fit(X_train, y_train.ravel())

In [None]:
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are true labels, columns are predicted labels.
    classes : sequence
        Tick labels for both axes, in the same order as the rows/columns of
        `cm`.
    normalize : bool, default False
        When True, each row is divided by its sum so cells show the fraction
        of that true class. Rows that sum to zero are shown as 0.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing so the colours and the printed numbers describe
    # the same quantity.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Leave zero-support rows at 0 instead of dividing by zero.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Ratios are shown with two decimals; raw counts are printed as-is.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so the axis labels are included in the layout.
    plt.tight_layout()

In [None]:
# X_train = X_train_res.T
# X_test = X_test.T
# y_train = y_train_res.T
# y_test = y_test.T

# Sanity-check the shapes of the arrays that go into the model.
for array_name, array in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{array_name}: ", array.shape)

In [None]:
# Predictions of the tuned model on the data it was trained on.
y_train_pre = knn1.predict(X_train)

# Axis labels must be exactly the classes that occur, in sorted order, so the
# ticks line up with the matrix rows/columns.
class_names = np.unique(np.concatenate([y_train, y_train_pre]))
# Compare the true labels with the PREDICTIONS (comparing y_train with itself
# always yields a perfect diagonal and says nothing about the model).
cnf_matrix_tra = confusion_matrix(y_train, y_train_pre, labels=class_names)
plt.figure()
plot_confusion_matrix(cnf_matrix_tra , classes=class_names, title='Confusion matrix')
plt.show()

In [None]:
# Per-class precision / recall / F1 of the tuned model on the training split.
train_report = classification_report(y_train, knn1.predict(X_train))
print(train_report)

In [None]:
# Predictions of the tuned model on the held-out test split.
y_pre = knn1.predict(X_test)

# Use the classes that actually occur (sorted) as axis labels; a fixed 0..7
# list does not match the 1..7 class labels and shifts every tick.
class_names = np.unique(np.concatenate([y_test, y_pre]))
cnf_matrix = confusion_matrix(y_test, y_pre, labels=class_names)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix , classes=class_names, title='Confusion matrix')
plt.show()

In [None]:
# Per-class precision / recall / F1 of the tuned model on the test split.
test_report = classification_report(y_test, knn1.predict(X_test))
print(test_report)

In [None]:
# Test accuracy as a function of k, holding the other tuned parameters fixed.
k_values = range(1, 15)

# Everything the search selected except k itself, which is what we vary here.
fixed_params = {
    name: value
    for name, value in clf.best_params_.items()
    if name != "n_neighbors"
}

score_list = []
for each in k_values:
    # A separate name is used so the tuned model in `knn1` is not overwritten.
    knn_k = KNeighborsClassifier(n_neighbors=each, **fixed_params)
    knn_k.fit(X_train, y_train.ravel())
    score_list.append(knn_k.score(X_test, y_test))

plt.plot(k_values, score_list)
plt.xlabel("k values")
plt.ylabel("accuracy")
plt.show()

In [None]:
# !pip install mlxtend

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets, neighbors


In [None]:
from mlxtend.plotting import plot_decision_regions

# Plotting decision region of the fitted grid search (it predicts with the
# best estimator found) over the two PCA components of the training split.
plot_decision_regions(X_train, y_train, clf=clf, legend=2)
# Adding axes annotations
plt.xlabel('X')
plt.ylabel('Y')
# Report the k that the search actually selected rather than a fixed number.
plt.title('Knn with K='+ str(clf.best_params_["n_neighbors"]))
plt.show()