In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import random

data = pd.read_csv("./preprocessed_data.csv")
food = pd.read_csv("./food_data.csv")

In [None]:
print(data.columns)
print(data.shape)


In [None]:
print(food.columns)
print(food.shape)

In [None]:
# Index both frames by their `id` column so rows can be matched across them.
# `data.index` is reused further down as the index of the PCA output and as
# the row/column labels of the pairwise distance matrix.
# NOTE(review): the `id` column itself is kept in `data`, so it is also passed
# to the scaler/PCA as a feature in the next code cell - confirm that is intended.
food.index = food.id
data.index = data.id

## Process Step by Step

1. PCA for dimensionality reduction
    - decide what components to keep
2. Use elbow method to determine how many clusters to use
3. Kmeans clustering
4. Evaluate the k-means clustering with the silhouette score
5. Take the three other points closest to the prediction as our recommendations
    - map them to id

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA

# Standardise every column to zero mean / unit variance before PCA so that no
# single feature dominates the principal components.
s = StandardScaler()
scaled_data = s.fit_transform(data)

# Perform PCA first for dimensionality reduction (keep the first 2 components)
pca = PCA(2)
pca_dat = pca.fit_transform(scaled_data)
pca_result = pd.DataFrame(pca_dat, columns=["PC-1", "PC-2"], index=data.index)

# Run k-means for k = 1 .. K-1 and record the inertia (within-cluster sum of
# squared distances) of each fit; the "elbow" of this curve suggests how many
# clusters to use.
K = 9
intertia_list = []
for k in range(1, K):
    # A fixed random_state makes the curve reproducible across kernel restarts;
    # an explicit n_init keeps the result independent of the sklearn default.
    kmeans = KMeans(n_clusters=k, init="k-means++", n_init=10, random_state=42)
    kmeans.fit(pca_result)
    intertia_list.append(kmeans.inertia_)

plt.plot(range(1, K), intertia_list, 'purple')
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('The Elbow Method')
plt.show()


The elbow of the curve falls at k = 3, so we proceed with three clusters as the optimal value.

In [None]:
from sklearn.cluster import KMeans

# Fit the final 3-cluster model exactly once. Fitting separately for the
# centres and for the labels (as two independent, unseeded runs) can yield
# centres and labels that belong to different clusterings.
kmeans_m1 = KMeans(n_clusters=3, init="k-means++", n_init=10, random_state=42)
label = kmeans_m1.fit_predict(pca_result)
kmeans_centers = np.array(kmeans_m1.cluster_centers_)

# Points coloured by assigned cluster, with the cluster centres in red.
plt.scatter(pca_result["PC-1"],
            pca_result["PC-2"],
            c=label, cmap="Pastel2")
plt.scatter(kmeans_centers[:, 0],
            kmeans_centers[:, 1],
            marker="o", c="red")
plt.xlabel("PCA_1")
plt.ylabel("PCA_2")
plt.title("kmeans_plot")
plt.show()

# `label` is a plain array, so this assignment is positional.
# NOTE(review): this assumes `food` and `data` have the same rows in the same
# order - confirm, otherwise cluster ids land on the wrong rows.
food['cluster_id'] = label

In [None]:
from scipy.spatial import distance_matrix

# Pairwise distances between every pair of rows in PCA space, labelled on both
# axes with the index of `pca_result`. Despite the name, these are distances
# (scipy's distance_matrix, default Minkowski p=2): smaller means more similar.
sim_matrix = pd.DataFrame(
    data=distance_matrix(pca_result.values, pca_result.values),
    index=pca_result.index,
    columns=pca_result.index,
)
sim_matrix

In [None]:
# Example of a free-text query a user could type into the system: a
# space-separated string of words. Swap in one of the commented alternatives
# below to try a different search.

#user_input = "greek romaine lettuce cheese"
#user_input = "japanese noodles egg pork"
user_input = "greek romaine lettuce feta"
# search variables
def food_recommender(user_input):
    """Recommend the three foods closest to the best match for a text query.

    Reads the module-level ``food`` frame (columns ``id``, ``cuisine``,
    ``ingredients``) and the module-level ``sim_matrix`` (pairwise distances,
    labelled by id on both axes). ``food`` is not modified.

    Steps:
      1. If a cuisine name from ``food.cuisine`` occurs in the query, restrict
         the search to that cuisine (the last matching cuisine wins).
      2. Score every food by how many query words occur in its ingredients.
      3. Pick one food at random among those with the highest score.
      4. Look up that food's column in ``sim_matrix`` and take the three
         *other* foods with the smallest distance.

    Parameters
    ----------
    user_input : str
        Whitespace-separated search words, e.g. "greek romaine lettuce feta".

    Returns
    -------
    pandas.DataFrame
        The recommended rows of ``food`` without the ``cluster_id`` and
        ``count`` helper columns. Empty when nothing matches the query.
    """
    # Detect a cuisine mentioned in the query (substring match on the query).
    cuisine_type = None
    for cuisine in food.cuisine.dropna().unique():
        if cuisine in user_input:
            cuisine_type = cuisine

    # split() without an argument drops empty tokens caused by repeated
    # spaces; an empty string would otherwise "match" every ingredient list.
    query_words = user_input.split()

    # Number of query words found in each food's ingredients.
    relevancy = food["ingredients"].apply(
        lambda ingredients: sum(word in ingredients for word in query_words)
    )

    # Work on a copy so the shared `food` frame is not polluted with a
    # per-query "count" column.
    filtered_food = food.assign(count=relevancy)
    filtered_food = filtered_food[filtered_food["count"] > 0]

    if cuisine_type:
        filtered_food = filtered_food[filtered_food["cuisine"] == cuisine_type]

    if filtered_food.empty:
        # Nothing matched: return an empty frame with the usual columns
        # instead of failing on random.choice([]) further down.
        print("Similar results gathered from query: 0")
        return food.iloc[0:0].drop(columns=["cluster_id", "count"], errors="ignore")

    filtered_food = filtered_food[filtered_food["count"] == filtered_food["count"].max()]

    print("Similar results gathered from query: " + str(filtered_food.shape[0]))

    # Several foods can tie for the best score; pick one of them at random.
    base_id = random.choice(list(filtered_food["id"]))
    base_rec = filtered_food[filtered_food["id"] == base_id]
    food_det = base_rec["id"]

    print("Food identified: " + food_det.to_string(index=False))
    print("Relevant key words: " + str(base_rec['count'].to_string(index=False)))
    print("Ingredients list :" + str(base_rec.ingredients.to_string(index=False)))

    print('\n')
    print('[Recommendations]')

    # Distances from the chosen food to every food in the matrix.
    distances = sim_matrix[base_id]
    if isinstance(distances, pd.DataFrame):
        # Duplicate column labels yield a frame; use its first column.
        distances = distances.iloc[:, 0]

    # The chosen food has distance 0 to itself; drop it so that all three
    # slots go to *other* foods.
    nearest = distances.drop(labels=[base_id], errors="ignore")
    similar_food = list(nearest.sort_values(ascending=True).head(3).index)

    recommend_df = food.loc[food['id'].isin(similar_food)]
    recommend_df = recommend_df.drop(columns=["cluster_id", "count"], errors="ignore")

    print(recommend_df)

    return recommend_df
    
    
    
final_results = food_recommender(user_input)
        

In [None]:
from googlesearch import search
import requests 
from bs4 import BeautifulSoup 
from IPython.display import Image
from IPython.core.display import HTML 
from IPython.core.display import Image, display
from matplotlib import pyplot as plt
import matplotlib.image as mpimg

def rec_viz(final_results):
    """Show a web page title and one image for every recommended row.

    For each row of ``final_results`` the first column is printed as the food
    id and the second column is used as a web-search query. The first search
    hit is downloaded, its ``<title>`` text is printed, and the first
    ``<img>`` whose ``src`` contains ``//`` is displayed inline.

    Parameters
    ----------
    final_results : pandas.DataFrame
        Recommendations to visualise. Columns are read by position.
        NOTE(review): assumes column 0 is the id and column 1 is a useful
        search string - confirm against the layout of ``food``.

    Returns
    -------
    None
        Output is printed / displayed as a side effect.
    """
    for row in final_results.values:
        print("food_id: " + str(row[0]))
        query = str(row[1])

        # num=1 / stop=1 limit the search to a single result URL.
        for url in search(query, tld="co.in", num=1, stop=1, pause=0):
            try:
                # A timeout keeps one unresponsive site from hanging the cell.
                response = requests.get(url, timeout=10)
            except requests.RequestException as exc:
                print("Could not fetch " + str(url) + ": " + str(exc))
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Read the title text from the parsed tag rather than stripping
            # markup out of its string form, which also cut titles at ", ".
            title_tag = soup.find('title')
            print(title_tag.get_text(strip=True) if title_tag is not None else "")

            # Show only the first image with an absolute or protocol-relative
            # URL. Tags without a src attribute (e.g. lazy-loaded images) are
            # skipped.
            for item in soup.find_all('img'):
                src = item.get('src')
                if src and '//' in src:
                    display(Image(url=src, width=300, height=300, unconfined=True))
                    break
                    
rec_viz(final_results)