## Import libraries

In [1]:
import pandas as pd
import numpy as np
from statistics import variance
import requests
from sklearn.model_selection import train_test_split

## Reading the CSV file

In [2]:
df = pd.read_csv('DataGP1.csv')
print(len(df),)
df.head()

544


Unnamed: 0.1,Unnamed: 0,product_name,allergies,p_price,verifications,rating,n_rating
0,0,AH Tomatenblokjes gesneden,[],0.55,1823,0.635412,486
1,1,AH Tomatenpuree,[],0.35,449,1.03813,701
2,2,AH Zonnebloemolie,[],2.99,1279,0.416256,638
3,3,AH Tomatenblokjes gesneden 4-pack,[],,536,0.808832,71
4,4,AH Tarwe bloem,[],0.65,804,3.078208,836


# Cleaning the Dataset

In [3]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,product_name,allergies,p_price,verifications,rating,n_rating
0,AH Tomatenblokjes gesneden,[],0.55,1823,0.635412,486
1,AH Tomatenpuree,[],0.35,449,1.03813,701
2,AH Zonnebloemolie,[],2.99,1279,0.416256,638
3,AH Tomatenblokjes gesneden 4-pack,[],,536,0.808832,71
4,AH Tarwe bloem,[],0.65,804,3.078208,836


In [4]:
df = df.dropna()

In [5]:
df['rating'].round(decimals=2)
df.head(5)

Unnamed: 0,product_name,allergies,p_price,verifications,rating,n_rating
0,AH Tomatenblokjes gesneden,[],0.55,1823,0.635412,486
1,AH Tomatenpuree,[],0.35,449,1.03813,701
2,AH Zonnebloemolie,[],2.99,1279,0.416256,638
4,AH Tarwe bloem,[],0.65,804,3.078208,836
6,AH Tomatenpuree,[],0.49,1864,4.101798,243


In [6]:
df['containsgluten'] = np.where(df['allergies']!= '[]', True, False)
df.head(18)

Unnamed: 0,product_name,allergies,p_price,verifications,rating,n_rating,containsgluten
0,AH Tomatenblokjes gesneden,[],0.55,1823,0.635412,486,False
1,AH Tomatenpuree,[],0.35,449,1.03813,701,False
2,AH Zonnebloemolie,[],2.99,1279,0.416256,638,False
4,AH Tarwe bloem,[],0.65,804,3.078208,836,False
6,AH Tomatenpuree,[],0.49,1864,4.101798,243,False
7,AH Kristalsuiker,[],0.65,1992,1.593696,16,False
8,AH Tomaten gezeefd passata,[],0.59,1760,2.580782,589,False
9,AH Tortilla naturel wraps,[],0.92,174,1.863368,184,False
10,Yum Yum Chicken flavour instant noodles,[],0.55,1846,4.753379,57,False
11,AH Rinse appelstroop,[],0.79,1574,1.956899,272,False


# Developing the k-NN algorithm

# Iteration 2

## First attempt: failed.
The k-NN score is 0.0, which means that the algorithm cannot find any 'neighbors'.

In [7]:
variable_list = ['p_price', 'verifications', 'rating', 'n_rating']

In [8]:
from sklearn.preprocessing import normalize
X = df[variable_list]
X = normalize(X)
y = df['product_name']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [9]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5) #here I am creating a k-NN classifier, using 5 neigbors. This is a default setting.
knn = knn.fit(X_train, y_train) 
knn.score(X_test, y_test) #here I am calculating the fit that was made on the test data

0.0

In [10]:
# Take 1 % data as sample - ik snap niet hoe ik dit met de hele dataset kan proberen
# Had er zelf 0.99 van gemaakt
rec_sample = df.sample(frac=.99, random_state=1) 

# Shape of the sample data
df.shape

(524, 7)

## Second attempt:
Trying with one variable.

In [11]:
# # Create Item-user matrix using pivot_table()
# rec_pivot = rec_sample.pivot_table(index='product_name', values='rating').fillna(0)

# # Show top-5 records
# rec_pivot.head()

In [12]:
# from sklearn.neighbors import NearestNeighbors
# # Build NearestNeighbors Object
# model_nn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=7, n_jobs=-1)

# # Fit the NearestNeighbor
# model_nn.fit(rec_pivot)

In [13]:
# # Get top 3 nearest neighbors 
# indices=model_nn.kneighbors(rec_pivot.loc[['AH Olijfolie']], 3, return_distance=False)

# # Print the recommended products
# print("Recommended product:")
# print("-------------------")
# for index, value in enumerate(rec_pivot.index[indices][0]):
#     print((index+1),". ",value)

## Second attempt: trying with multiple variable: partially succeeded.
The algorithm provides recommendations! However, they were not very accurate. Most importantly, the recommendations provided contain gluten. In addition, the product that is scanned, is in the recommendations. Moreover, the recommendations are very random. 

In [14]:
# Create Item-user matrix using pivot_table()
rec_pivot = rec_sample.pivot_table(index='product_name', values=variable_list).fillna(0)

# Show top-5 records
rec_pivot.head()

Unnamed: 0_level_0,n_rating,p_price,rating,verifications
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AH Aardappelpuree,421.0,1.89,4.326369,1363.0
AH Amandelschaafsel,716.0,1.99,2.77295,2061.0
AH Basis pastasaus basillicum,978.0,1.09,0.38645,1653.0
AH Basmati rijst,10.0,2.59,0.717909,2033.0
AH Basmatirijst,550.0,1.99,3.625458,397.0


In [15]:
from sklearn.neighbors import NearestNeighbors
# Build NearestNeighbors Object
model_nn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=7, n_jobs=-1)

# Fit the NearestNeighbor
model_nn.fit(rec_pivot)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=7)

#### The next cell provided recommendations:

In [24]:
# # Get top 3 nearest neighbors 
# indices=model_nn.kneighbors(rec_pivot.loc[['AH Olijfolie']], 3, return_distance=False)

# # Print the recommended products

# print("Recommended product:")
# for index, value in enumerate(rec_pivot.index[indices][0]):
#     print((index+1),". ", value)

Recommended product:
1 .  AH Olijfolie
2 .  AH Sauzen BBQ pakket
3 .  AH Quiche Mediterrane groente


  for index, value in enumerate(rec_pivot.index[indices][0]):


# Iteration 3: improved in comparison to the second iteration.
Now iteration 2 partly succeeded, I'm going to take a look at how to improve the recommendations.

In [53]:
print(df.loc[df['product_name']== 'No Fairytales Bieten tortilla', 'containsgluten'].item())

False


In the cell above, I tried to find out how to only get the 'containsgluten' variable. Then I used that insight to write the following cell:

In [54]:
empty_list = [None, None, None]
rec_zonder_gluten = 0
product = 'AH Macaroni'
numberofproducts = 3

indices=model_nn.kneighbors(rec_pivot.loc[[product]], 20, return_distance=False)

if df.loc[df['product_name'] == product,'containsgluten'].item() == False:
    print('This product is gluten-free!')
else: 
    for index, value in enumerate(rec_pivot.index[indices][0]):
        if df.loc[df['product_name'] == value,'containsgluten'].item() == False:
            print(value)
            empty_list[rec_zonder_gluten] = value
            rec_zonder_gluten += 1
            if rec_zonder_gluten == numberofproducts:
                break

No Fairytales Bieten tortilla
Patak's Mini naan plain
Lassie Basmatirijst extra vezels


  for index, value in enumerate(rec_pivot.index[indices][0]):


In line 3 you can insert a product out of the CSV file and it will give three recommendations that are gluten-free.

### How iteration 3 improved:
(1) Most importantly: the recommendations provided do not contain gluten. (2) When a product does not contain gluten, it will not provide recommendations anymore.
(3) The recommendations are more accurate. (4) In addition, the scanned product is not in the list of recommendations anymore.