In [1]:
# Import Libraries
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Load the Data
df = pd.read_csv("beer_reviews.csv").dropna().drop_duplicates("beer_name")

In [8]:
# Print the First 3 Rows
df

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586583,14359,The Defiant Brewing Company,1167357151,4.5,4.0,3.0,TongoRad,Extra Special / Strong Bitter (ESB),4.0,4.5,Big Thumper Ale,5.8,34215
1586588,14359,The Defiant Brewing Company,1295399777,2.0,2.5,3.0,Buddha22,Belgian IPA,2.0,2.5,Bear Mountain Ale,8.0,62147
1586594,14359,The Defiant Brewing Company,1186979266,3.0,3.5,4.0,maddogruss,American Porter,4.0,4.0,Highland Porter,6.0,38160
1586595,14359,The Defiant Brewing Company,1311205969,4.0,3.5,4.0,JerzDevl2000,Hefeweizen,4.0,4.0,Baron Von Weizen,5.0,71234


In [9]:
# Get a count of the number of rows and columns
df.shape

(44075, 13)

In [10]:
# Create a list of important columns to keep
features = ["beer_style", "brewery_name", "review_profilename"]
# features = ["review_aroma", "review_appearance", "review_palate", "review_taste"]
df[features].head(3)

Unnamed: 0,beer_style,brewery_name,review_profilename
0,Hefeweizen,Vecchio Birraio,stcules
1,English Strong Ale,Vecchio Birraio,stcules
2,Foreign / Export Stout,Vecchio Birraio,stcules


In [11]:
# Clean and Process the data
for feature in features:
    df[feature] = df[feature].fillna(0) # Fill any missing values with the empty string

In [13]:
# df[feature]

In [14]:
# Create a function to combine the values of the important columns into a single string
# def combine_features(row):
#     return str(row["review_aroma"]) +" "+ str(row["review_appearance"]) +" "+ str(row["review_palate"]) +" "+ str(row["review_taste"])

def combine_features(row):
    return row["beer_style"] + " " + row["brewery_name"] + " " + row["review_profilename"]


In [15]:
# Apply the function to each row in the data set to store the combined
# strings into a new column called combine_features
df["combined_features"] = df.apply(combine_features, axis = 1)

In [16]:
# Print the dataframe
df.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,combined_features
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986,Hefeweizen Vecchio Birraio stcules
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213,English Strong Ale Vecchio Birraio stcules
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215,Foreign / Export Stout Vecchio Birraio stcules
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969,German Pilsener Vecchio Birraio stcules
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883,American Double / Imperial IPA Caldera Brewing...


In [17]:
# Sort by highest to lowest using review_overall
# sorted_values = df.sort_values(by="review_overall", ascending=False).drop_duplicates("beer_name")
# sorted_values.head()

In [18]:
# Print only 5000 rows for smaller sample
df_5000 = df
df_5000.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,combined_features
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986,Hefeweizen Vecchio Birraio stcules
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213,English Strong Ale Vecchio Birraio stcules
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215,Foreign / Export Stout Vecchio Birraio stcules
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969,German Pilsener Vecchio Birraio stcules
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883,American Double / Imperial IPA Caldera Brewing...


In [19]:
df_5000.insert(loc=0, column='index', value=np.arange(len(df_5000)))

In [20]:
df_5000

Unnamed: 0,index,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,combined_features
0,0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986,Hefeweizen Vecchio Birraio stcules
1,1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213,English Strong Ale Vecchio Birraio stcules
2,2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215,Foreign / Export Stout Vecchio Birraio stcules
3,3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969,German Pilsener Vecchio Birraio stcules
4,4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883,American Double / Imperial IPA Caldera Brewing...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586583,44070,14359,The Defiant Brewing Company,1167357151,4.5,4.0,3.0,TongoRad,Extra Special / Strong Bitter (ESB),4.0,4.5,Big Thumper Ale,5.8,34215,Extra Special / Strong Bitter (ESB) The Defian...
1586588,44071,14359,The Defiant Brewing Company,1295399777,2.0,2.5,3.0,Buddha22,Belgian IPA,2.0,2.5,Bear Mountain Ale,8.0,62147,Belgian IPA The Defiant Brewing Company Buddha22
1586594,44072,14359,The Defiant Brewing Company,1186979266,3.0,3.5,4.0,maddogruss,American Porter,4.0,4.0,Highland Porter,6.0,38160,American Porter The Defiant Brewing Company ma...
1586595,44073,14359,The Defiant Brewing Company,1311205969,4.0,3.5,4.0,JerzDevl2000,Hefeweizen,4.0,4.0,Baron Von Weizen,5.0,71234,Hefeweizen The Defiant Brewing Company JerzDev...


In [21]:
# Convert a collection of text to a matrix of token counts

count_matrix = CountVectorizer().fit_transform(df_5000["combined_features"])

In [22]:
# Get the cosine similarity matrix from the count matrix
cosine_sim = cosine_similarity(count_matrix)
# print(cosine_sim)

In [23]:
print(cosine_sim)

[[1.         0.61237244 0.61237244 ... 0.         0.20412415 0.        ]
 [0.61237244 1.         0.5        ... 0.         0.         0.15430335]
 [0.61237244 0.5        1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.6172134  0.57142857]
 [0.20412415 0.         0.         ... 0.6172134  1.         0.6172134 ]
 [0.         0.15430335 0.         ... 0.57142857 0.6172134  1.        ]]


In [24]:
# Get the number of rows and columns in cosine_sim
cosine_sim.shape

(44075, 44075)

In [25]:
df_5000["index"]

0              0
1              1
2              2
3              3
4              4
           ...  
1586583    44070
1586588    44071
1586594    44072
1586595    44073
1586600    44074
Name: index, Length: 44075, dtype: int64

In [114]:
# Helper function to get the title from the index
def get_title_from_index (index):
    return df_5000[df_5000.index == index]["beer_name"].values[0]

# Helper function to get the index from the title
def get_index_from_title(beer_name):
    return df_5000[df_5000.beer_name == beer_name]["index"].values[0]

In [122]:
df_5000["index"][1]

1

In [123]:
df_5000["beer_name"][1]

'Red Moon'

In [138]:
# Get the title of the beer that the user likes
# beer_user_likes = input("Beer Name: ")
beer_user_likes = input("Beer Name: ")

# Find that movies index
beer_index = get_index_from_title(beer_user_likes)

Beer Name: Red Moon


In [139]:
beer_index

1

In [126]:
# Enumerate through all the similarity scores of the beer_user_likes to make
# a tuple of "movie_index" and similarity scores
# NOTE: we will return a list of tuples in the form (movie_index, similarity score)

similar_beers = list( enumerate(cosine_sim[beer_index]) )

In [127]:
similar_beers

[(0, 0.6123724356957946),
 (1, 1.0000000000000002),
 (2, 0.5000000000000001),
 (3, 0.5477225575051662),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.14433756729740646),
 (11, 0.0),
 (12, 0.2886751345948129),
 (13, 0.0),
 (14, 0.0),
 (15, 0.14433756729740646),
 (16, 0.3086066999241838),
 (17, 0.0),
 (18, 0.0),
 (19, 0.1543033499620919),
 (20, 0.0),
 (21, 0.2886751345948129),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.14433756729740646),
 (27, 0.0),
 (28, 0.14433756729740646),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.3086066999241838),
 (35, 0.14433756729740646),
 (36, 0.14433756729740646),
 (37, 0.0),
 (38, 0.1666666666666667),
 (39, 0.1666666666666667),
 (40, 0.1666666666666667),
 (41, 0.1666666666666667),
 (42, 0.1666666666666667),
 (43, 0.0),
 (44, 0.0),
 (45, 0.2886751345948129),
 (46, 0.0),
 (47, 0.4330127018922194),
 (48, 0.14433756729740646),
 (49, 0.0),
 (50, 0.0),
 (51, 0.0),
 (52, 0.1543033499620919),
 (53, 0

In [128]:
# Sort the list of silimar movies sccording to the similarity scores in decending order
sorted_similar_beers = sorted(similar_beers,key = lambda x:x[1], reverse = True)[1:]


In [129]:
sorted_similar_beers

[(3378, 0.6666666666666669),
 (6409, 0.6666666666666669),
 (22040, 0.6666666666666669),
 (30054, 0.6666666666666669),
 (38356, 0.6666666666666669),
 (34909, 0.6172133998483676),
 (0, 0.6123724356957946),
 (3, 0.5477225575051662),
 (9470, 0.5477225575051662),
 (17811, 0.5477225575051662),
 (19599, 0.5477225575051662),
 (25989, 0.5477225575051662),
 (27723, 0.5477225575051662),
 (39487, 0.5477225575051662),
 (2038, 0.5163977794943223),
 (29838, 0.5163977794943223),
 (29854, 0.5163977794943223),
 (29880, 0.5163977794943223),
 (41540, 0.5163977794943223),
 (2, 0.5000000000000001),
 (1569, 0.5000000000000001),
 (2907, 0.5000000000000001),
 (3376, 0.5000000000000001),
 (3694, 0.5000000000000001),
 (4506, 0.5000000000000001),
 (5726, 0.5000000000000001),
 (6019, 0.5000000000000001),
 (7922, 0.5000000000000001),
 (9587, 0.5000000000000001),
 (9591, 0.5000000000000001),
 (11955, 0.5000000000000001),
 (13573, 0.5000000000000001),
 (13694, 0.5000000000000001),
 (13696, 0.5000000000000001),
 (1438

In [130]:
converted = [list(x) for x in sorted_similar_beers]
converted[1]

[6409, 0.6666666666666669]

In [132]:
# tuplex = converted
# tuplex = sorted_similar_beers
# tuplex = tuplex[:1] + tuplex[2:] 
# tuplex

In [137]:
i=0
print(f"The top 5 beers similar to {beer_user_likes} are: ")
for element in sorted_similar_beers:
#     print(get_title_from_index(element[0]))
    print((element[0]))
    i=i+1
    if i>=5:
        break

The top 5 beers similar to Red Moon are: 
3378
6409
22040
30054
38356
