In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import databricks.koalas as ks

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the combined-features CSV and rename "Beer Names" -> "beer_name", "Index" -> "index"
df_combined_features = (
    pd.read_csv('combined_features_by_reviews_city.csv')
    .rename(columns={"Beer Names": "beer_name", "Index": "index"})
)

In [3]:
# Change values below where needed based on your CSV and what you want to find
# Helper function to get the title from the index
def get_title_from_index(index, df=None):
    """Return the beer name stored at row position ``index``.

    The values passed in by this notebook come from enumerating a row of the
    cosine-similarity matrix, i.e. they are 0-based row positions of the frame
    the matrix was built from. The lookup is therefore positional (``iloc``)
    instead of a match on index labels: labels stop agreeing with row positions
    as soon as rows are removed (for example by ``drop_duplicates``).

    Parameters
    ----------
    index : int
        0-based row position.
    df : pandas.DataFrame, optional
        Frame holding a ``beer_name`` column. When omitted, the notebook-level
        ``df_combined_features`` is used.

    Returns
    -------
    The ``beer_name`` value of that row.

    Raises
    ------
    IndexError
        If ``index`` lies beyond the last row.
    """
    frame = df_combined_features if df is None else df
    return frame["beer_name"].iloc[index]

# Helper function to get the index from the title
def get_index_from_title(beer_name, df=None):
    """Return the 0-based row position of the first beer named ``beer_name``.

    The result is used by this notebook to select a row of the cosine-similarity
    matrix, so it has to be the row *position* inside the frame. The value kept
    in the CSV's ``index`` column only equals that position while no rows have
    been removed, so the position is computed directly from the ``beer_name``
    column instead.

    Parameters
    ----------
    beer_name : str
        Exact beer name to look up (the comparison is an exact equality match).
    df : pandas.DataFrame, optional
        Frame holding a ``beer_name`` column. When omitted, the notebook-level
        ``df_combined_features`` is used.

    Returns
    -------
    int
        Row position of the first matching beer.

    Raises
    ------
    IndexError
        If no row carries that name (same exception type the previous
        ``.values[0]`` lookup raised on an empty match).
    """
    frame = df_combined_features if df is None else df
    hits = np.flatnonzero(frame["beer_name"].eq(beer_name).to_numpy())
    if hits.size == 0:
        raise IndexError(f"No beer named {beer_name!r} was found")
    return int(hits[0])

In [4]:
# Clean, Preview new dataframe here
# (to discard a leftover CSV column: df_combined_features.drop(columns=['Unnamed: 0']))
#
# The similarity matrix built further down is addressed by row position, so after
# removing duplicate beer names the row labels are renumbered 0..n-1 and the
# "index" column is refreshed to the same values. Without this, labels and the
# "index" column keep gaps where duplicates were dropped and no longer line up
# with the matrix rows.
df_combined_features = (
    df_combined_features
    .drop_duplicates('beer_name')
    .reset_index(drop=True)
    .assign(index=lambda frame: frame.index)
)
df_combined_features.head()

Unnamed: 0.1,Unnamed: 0,index,beer_name,Locations,Reviews,Image,City,combined_features
0,0,0,Cloudwater A Scabrous Edge Of The Sky,"Manchester, Greater Manchester","Light fresh fruit, light floral note, hazy gol...",https://res.cloudinary.com/ratebeer/image/uplo...,Manchester,"Light fresh fruit, light floral note, hazy gol..."
1,1,1,Temperance Might Meets Right: Hot Cocoa,"Evanston, Illinois",Aromas of chocolate bourbon oak toffee cocoa t...,https://res.cloudinary.com/ratebeer/image/uplo...,Evanston,Aromas of chocolate bourbon oak toffee cocoa t...
2,2,2,Pipeworks Barrel Aged The Brown and Stirred,"Chicago, Illinois","Pours opaque cinnamon, no head. Aroma features...",https://res.cloudinary.com/ratebeer/image/uplo...,Chicago,"Pours opaque cinnamon, no head. Aroma features..."
3,3,3,Camba Bavaria 4 Sessions,"Seeon, Bavaria","Schaum: fein, gut.\nFarbe: orange-Gold, leicht...",https://res.cloudinary.com/ratebeer/image/uplo...,Seeon,"Schaum: fein, gut.\nFarbe: orange-Gold, leicht..."
4,4,4,Rock & Roll Bramble On,"Birmingham - Hockley, West Midlands",Cask @ the brewhouse. Nice head with good dura...,https://res.cloudinary.com/ratebeer/image/uplo...,Birmingham - Hockley,Cask @ the brewhouse. Nice head with good dura...


In [5]:
# Turn the "combined_features" text of every row into a matrix of token counts
count_matrix = (
    CountVectorizer()
    .fit_transform(df_combined_features.loc[:, "combined_features"])
)

In [6]:
# Pairwise cosine similarity between all rows of the token-count matrix
cosine_sim = cosine_similarity(X=count_matrix)

In [7]:
# print(cosine_sim)

In [8]:
# Show (rows, columns) of the similarity matrix
np.shape(cosine_sim)

(4917, 4917)

In [9]:
# Spot check: print the "index" value and the beer name stored under row label 245
print(f'Index : {df_combined_features.at[245, "index"]}')
print(f'Beer Name: {df_combined_features.at[245, "beer_name"]}')

Index : 245
Beer Name: Stonewood Anno 1136


In [10]:
# Ask the user for the name of a beer they like
beer_user_likes = input("Beer Name: ")

# Look up that beer's index
beer_index = get_index_from_title(beer_name=beer_user_likes)

Beer Name: Temperance Might Meets Right: Hot Cocoa


In [11]:
# Show the index returned for the chosen beer (used below to pick its row in cosine_sim)
beer_index

1

In [12]:
# Pair every similarity score in the chosen beer's row of cosine_sim with its
# position in that row.
# NOTE: the result is a list of tuples in the form (beer_index, similarity score)

similar_beers = [
    (position, score) for position, score in enumerate(cosine_sim[beer_index])
]

In [13]:
# similar_beers

In [14]:
# Sort the list of similar beers according to the similarity scores in descending
# order, leaving out the beer the user picked.
# The picked beer is removed by its position rather than by slicing off the first
# sorted entry: when another beer scores at least as high (e.g. identical review
# text), the first entry is not necessarily the picked beer, and slicing would
# drop a real match while keeping the picked beer in its own recommendations.
sorted_similar_beers = sorted(
    (pair for pair in similar_beers if pair[0] != beer_index),
    key=lambda pair: pair[1],
    reverse=True,
)

In [15]:
# sorted_similar_beers

In [16]:
# Print the names of the five most similar beers, using the title helper.
# Slicing the already-sorted list replaces the manual counter-and-break loop;
# a list with fewer than five entries is simply printed in full.
print(f"Top 5 beers similar to '{beer_user_likes}' are: ")
print("\n------------------------------------------------------------")
for beer_position, _score in sorted_similar_beers[:5]:
    print(f"\n* Beer Name: {get_title_from_index(beer_position)}")

print("\n------------------------------------------------------------")

Top 5 beers similar to 'Temperance Might Meets Right: Hot Cocoa' are: 

------------------------------------------------------------

* Beer Name: Cascade Lakes RDM DIPA

* Beer Name: Squatters Hop Rising Tropical Double IPA

* Beer Name: Hogs Back Englands Glory

* Beer Name: Wylam The Economic Growth Of The Geek

* Beer Name: Evil Twin Watermelon Spritzer

------------------------------------------------------------


In [17]:
# Print the names and similarity scores of the five most similar beers.
# The (position, score) tuples are unpacked directly from a slice of the sorted
# list instead of reassigning the range loop variable as a counter.
print(f"Top 5 beers similar to '{beer_user_likes}' are: ")
print("\n------------------------------------------------------------")
for beer_position, score in sorted_similar_beers[:5]:
    print('\n* Beer Name:', get_title_from_index(beer_position), '\n* Similarity Score: ', score)

print("\n------------------------------------------------------------")

Top 5 beers similar to 'Temperance Might Meets Right: Hot Cocoa' are: 

------------------------------------------------------------

* Beer Name: Cascade Lakes RDM DIPA 
* Similarity Score:  0.5930469674071855

* Beer Name: Squatters Hop Rising Tropical Double IPA 
* Similarity Score:  0.5689255871169124

* Beer Name: Hogs Back Englands Glory 
* Similarity Score:  0.5245305283129622

* Beer Name: Wylam The Economic Growth Of The Geek 
* Similarity Score:  0.5135525910130955

* Beer Name: Evil Twin Watermelon Spritzer 
* Similarity Score:  0.4111364906901773

------------------------------------------------------------
