# Product Recommendations

In [1]:
%matplotlib ipympl

import warnings
import numpy as np
import pandas as pd
from sklearn import preprocessing, model_selection, metrics
from sklearn.neighbors import KNeighborsClassifier 
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

In [3]:
# Load data
df = pd.read_excel('Customers.xlsx')
df = pd.get_dummies(df, columns=['Gender', 'Married'])
df = df.drop(['ID', 'Gender_Male', 'Married_No'], axis=1)

y = df['Product'].to_numpy()
X = df.drop('Product', axis=1).to_numpy()

In [4]:
# Prepare data.
def prepare(d):
    return(np.hstack([preprocessing.normalize(d[:, 0:2], axis=0),
                      d[:, 2:4]]))


with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    X = prepare(X)

In [5]:
# Split training and test data
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)

In [13]:
# Fit the model. Tune the number of neighbors, K
neighbors = list(range(10, 100, 10))

# Cross-validate using 10-fold approach.
# (For each possible value of kTest the model 10 times, randomly leaving out
# 1/10 of training data to evaluate performance)
percent_correct = []
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = model_selection.cross_val_score(knn, X_train, y_train, cv=10,
                                             scoring='accuracy')
    percent_correct.append(100 * scores.mean())


In [14]:
# Determine best k.
k_best = neighbors[np.argmax(percent_correct)]
k_best

30

In [15]:
# Plot misclassification error vs. K
plt.plot(neighbors, percent_correct)
plt.xlabel('K')
plt.ylabel('Percent Correct')
plt.show()

In [16]:
# Fit model.
knn = KNeighborsClassifier(n_neighbors=k_best)
knn.fit(X_train, y_train)

# Predict outcomes.
predictions = knn.predict(X_test)

# Get accuracy on test data.
'{}%'.format(100 * metrics.accuracy_score(y_test, predictions))

'73.11999999999999%'

In [17]:
# Potential customers.
Xnew = [[25, 90000, 1, 0],
        [55, 30000, 1, 0],
        [50, 70000, 1, 1],
        [25, 75000, 0, 0],
        [58, 45000, 0, 0],
        [28, 70000, 0, 1]]

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    Xnew = prepare(np.array(Xnew))

In [18]:
# Predict products.
knn.predict(Xnew)

array(['A', 'A', 'A', 'C', 'C', 'D'], dtype=object)

In [19]:
# Calculate probabilities.
knn.predict_proba(Xnew)

array([[0.9       , 0.1       , 0.        , 0.        ],
       [0.56666667, 0.43333333, 0.        , 0.        ],
       [0.96666667, 0.03333333, 0.        , 0.        ],
       [0.        , 0.        , 0.9       , 0.1       ],
       [0.        , 0.        , 0.8       , 0.2       ],
       [0.        , 0.        , 0.23333333, 0.76666667]])