In [45]:
# Import Libraries
import pandas as pd
import numpy as np
import databricks.koalas as ks

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [46]:
# Load the combined-features table that every later cell works from.
# NOTE: the whole CSV is read here; no row limit or sampling is applied.
features_csv_path = 'data/df_combined_features.csv'
df_combined_features = pd.read_csv(features_csv_path)
# Helper function to get the title from the index
def get_title_from_index(index, df=None):
    """Return the ``beer_name`` of the row whose DataFrame index label equals ``index``.

    Parameters
    ----------
    index : hashable
        Row label to look up (compared against ``df.index``).
    df : pandas.DataFrame, optional
        Table to search; it must contain a ``beer_name`` column. When omitted,
        the notebook-level ``df_combined_features`` is used.

    Returns
    -------
    The ``beer_name`` value of the first row carrying that label.

    Raises
    ------
    IndexError
        If no row carries that index label.
    """
    if df is None:
        # Looked up at call time so the helper follows any rebinding of the global.
        df = df_combined_features
    # Single .loc call (row mask + column) instead of chained indexing.
    matches = df.loc[df.index == index, "beer_name"]
    if matches.empty:
        raise IndexError(f"no row with index label {index!r}")
    return matches.values[0]

# Helper function to get the index from the title
def get_index_from_title(beer_name, df=None):
    """Return the ``index`` column value of the first row named ``beer_name``.

    Parameters
    ----------
    beer_name : str
        Exact beer name to look up (compared with ``==``, so the match is
        case- and whitespace-sensitive).
    df : pandas.DataFrame, optional
        Table to search; it must contain ``beer_name`` and ``index`` columns.
        When omitted, the notebook-level ``df_combined_features`` is used.

    Returns
    -------
    The value stored in the ``index`` column of the first matching row.

    Raises
    ------
    IndexError
        If no row has that beer name.
    """
    if df is None:
        # Looked up at call time so the helper follows any rebinding of the global.
        df = df_combined_features
    # Single .loc call (row mask + column) instead of chained indexing.
    matches = df.loc[df["beer_name"] == beer_name, "index"]
    if matches.empty:
        raise IndexError(f"no beer named {beer_name!r} in the table")
    return matches.values[0]

In [47]:
# Drop the columns that no later cell in this notebook reads.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell re-runnable: on a second run the columns are already gone and
# the drop is a no-op rather than a KeyError.
unused_columns = [
    'Unnamed: 0', 'review_time', 'review_overall', 'review_aroma',
    'review_appearance', 'review_palate', 'review_taste',
    'beer_beerid', 'brewery_id',
]
df_combined_features = df_combined_features.drop(columns=unused_columns, errors="ignore")
df_combined_features.head(3)

Unnamed: 0,index,brewery_name,review_profilename,beer_style,beer_name,beer_abv,combined_features
0,0,Vecchio Birraio,stcules,Hefeweizen,Sausa Weizen,5.0,Hefeweizen Vecchio Birraio stcules
1,1,Vecchio Birraio,stcules,English Strong Ale,Red Moon,6.2,English Strong Ale Vecchio Birraio stcules
2,2,Vecchio Birraio,stcules,Foreign / Export Stout,Black Horse Black Beer,6.5,Foreign / Export Stout Vecchio Birraio stcules


In [4]:
# Turn the "combined_features" text column into a matrix of token counts
# (one row per input string, one column per token in the fitted vocabulary).
token_counter = CountVectorizer()
count_matrix = token_counter.fit_transform(df_combined_features["combined_features"])

In [6]:
# Get the cosine similarity matrix from the count matrix.
# Every row of count_matrix is compared with every other row, so the result is
# square and its size grows quadratically with the number of rows.
# NOTE(review): with the default arguments this looks like a dense array, which
# can be very large in memory for tens of thousands of rows; confirm it fits.
cosine_sim = cosine_similarity(count_matrix)

In [7]:
# print(cosine_sim)

In [8]:
# Get the number of rows and columns in cosine_sim
# (expected to be square: one row and one column per row of count_matrix).
cosine_sim.shape

(44075, 44075)

In [32]:
# Sanity check: for one row label, show the stored "index" column value next to
# the beer_name so the pairing between the two can be eyeballed.
probe_label = 245
print(f'Index : {df_combined_features.loc[probe_label, "index"]}')
print(f'Beer Name: {df_combined_features.loc[probe_label, "beer_name"]}')

Index : 245
Beer Name: Brown Ale


In [33]:
# Get the title of the beer that the user likes.
# The text is used exactly as typed, so it must match a beer_name verbatim.
beer_user_likes = input("Beer Name: ")

# Find that beer's index (the "index" column value of the first row with that name)
beer_index = get_index_from_title(beer_user_likes)

Beer Name: Brown Ale


In [34]:
# Beer location: the value returned by get_index_from_title for the chosen beer;
# later cells use it to pick this beer's row in cosine_sim.
beer_index

245

In [35]:
# Take the liked beer's row of cosine_sim and pair each score with its position,
# producing a list of (beer_index, similarity score) tuples.
similar_beers = [
    (position, score) for position, score in enumerate(cosine_sim[beer_index])
]

In [36]:
# similar_beers

In [37]:
# Sort the list of similar beers according to the similarity scores in descending order.
# The liked beer is excluded by its own index rather than by dropping the first
# sorted element: if another row ties with it (identical features) or float
# rounding puts a different row on top, "[1:]" would discard the wrong entry and
# leave the liked beer in its own recommendations.
sorted_similar_beers = sorted(
    (pair for pair in similar_beers if pair[0] != beer_index),
    key=lambda pair: pair[1],
    reverse=True,
)

In [38]:
# sorted_similar_beers

In [39]:
# Print the names of the most similar beers, using the helper functions.
# Slicing replaces the manual counter/break, and one variable drives both the
# headline and the number of rows printed so the two cannot drift apart.
top_n = 5
print(f"Top {top_n} beers similar to '{beer_user_likes}' are: ")
print("\n------------------------------------------------------------")
for beer_position, _score in sorted_similar_beers[:top_n]:
    print(f"\n* Beer Name: {get_title_from_index(beer_position)}")

print("\n------------------------------------------------------------")

Top 5 beers similar to 'Brown Ale' are: 

------------------------------------------------------------

* Beer Name: IPA

* Beer Name: Boatswain H.L.V. Ale (Heavy Lift Vessel)

* Beer Name: Duck-Rabbit Brown Ale

* Beer Name: Ipswich Dark Ale

* Beer Name: Farmington River Brown Ale

------------------------------------------------------------


In [40]:
# Print beer names together with their similarity scores.
# Each (beer_index, score) pair is unpacked directly from a slice; the previous
# version reassigned the for-loop variable inside the loop body, which only
# stopped after five rows by accident of how the counter and range interacted.
top_n = 5
print(f"Top {top_n} beers similar to '{beer_user_likes}' are: ")
print("\n------------------------------------------------------------")
for beer_position, score in sorted_similar_beers[:top_n]:
    print('\n* Beer Name:', get_title_from_index(beer_position), '\n* Similarity Score: ', score)

print("\n------------------------------------------------------------")

Top 5 beers similar to 'Brown Ale' are: 

------------------------------------------------------------

* Beer Name: IPA 
* Similarity Score:  0.7715167498104596

* Beer Name: Boatswain H.L.V. Ale (Heavy Lift Vessel) 
* Similarity Score:  0.7142857142857141

* Beer Name: Duck-Rabbit Brown Ale 
* Similarity Score:  0.6299407883487119

* Beer Name: Ipswich Dark Ale 
* Similarity Score:  0.6299407883487119

* Beer Name: Farmington River Brown Ale 
* Similarity Score:  0.6299407883487119

------------------------------------------------------------
