In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the beer review dataset from disk into a DataFrame
DATA_FILE = "beer_reviews.csv"
df = pd.read_csv(DATA_FILE)

In [3]:
# Preview the first three rows of the loaded data
df.head(n=3)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,2009-02-16 20:57:03,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,2009-03-01 13:44:57,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,2009-03-01 14:10:04,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215


In [4]:
# Dimensions of the DataFrame as a (rows, columns) tuple
df.shape

(1586614, 13)

In [5]:
# Names of the columns that are later merged into each row's combined text
features = [
    "beer_style",
    "review_overall",
    "review_aroma",
    "review_appearance",
    "review_palate",
    "review_taste",
    "beer_abv",
]
# Preview just those columns
df.loc[:, features].head(3)

Unnamed: 0,beer_style,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv
0,Hefeweizen,1.5,2.0,2.5,1.5,1.5,5.0
1,English Strong Ale,3.0,2.5,3.0,3.0,3.0,6.2
2,Foreign / Export Stout,3.0,2.5,3.0,3.0,3.0,6.5


In [6]:
# Clean the data: replace missing values in the selected columns with the
# empty string, so that turning them into text later never produces "nan"
df[features] = df[features].fillna("")

In [7]:
# Build the text representation of one row from its important columns
def combine_features(row):
    """Return the row's selected column values as one space-separated string.

    `row` is anything indexable by column name (e.g. a DataFrame row); each
    value is converted with str() before joining.
    """
    columns = (
        "beer_style",
        "review_overall",
        "review_aroma",
        "review_appearance",
        "review_palate",
        "review_taste",
        "beer_abv",
    )
    return " ".join(str(row[column]) for column in columns)


In [8]:
# Run combine_features over every row of the data set and keep the
# resulting strings in a new column named combined_features
combined_text = df.apply(combine_features, axis=1)
df["combined_features"] = combined_text

In [9]:
# Show the first five rows, which now include the combined_features column
df.head(5)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,combined_features
0,10325,Vecchio Birraio,2009-02-16 20:57:03,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986,Hefeweizen 1.5 2.0 2.5 1.5 1.5 5.0
1,10325,Vecchio Birraio,2009-03-01 13:44:57,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213,English Strong Ale 3.0 2.5 3.0 3.0 3.0 6.2
2,10325,Vecchio Birraio,2009-03-01 14:10:04,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215,Foreign / Export Stout 3.0 2.5 3.0 3.0 3.0 6.5
3,10325,Vecchio Birraio,2009-02-15 19:12:25,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969,German Pilsener 3.0 3.0 3.5 2.5 3.0 5.0
4,1075,Caldera Brewing Company,2010-12-30 18:53:26,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883,American Double / Imperial IPA 4.0 4.5 4.0 4.0...


In [None]:
# Display the DataFrame
df

In [None]:
# NOTE(review): same expression as the previous cell; looks like a leftover
# duplicate — confirm and remove
df

In [None]:
# Convert the combined strings into a matrix of token counts.
# CountVectorizer's default token_pattern (r"(?u)\b\w\w+\b") treats "." as a
# separator and drops one-character tokens, so a score such as "1.5" is split
# into "1" and "5" and then discarded entirely. The pattern below keeps
# decimal numbers whole (and keeps one-character tokens), so the review scores
# and ABV actually contribute to the counts instead of being ignored.
# NOTE(review): a number is counted as the same token no matter which column
# it came from (e.g. "3.0" for aroma and "3.0" for taste are indistinguishable).
count_matrix = CountVectorizer(token_pattern=r"(?u)\b\w[\w.]*\b").fit_transform(df["combined_features"])

In [None]:
# Get the cosine similarity matrix from the count matrix
# NOTE(review): called with a single argument, cosine_similarity compares every
# row with every other row and, by default, returns a dense array. df.shape
# above reports 1,586,614 rows, so the result would be 1,586,614 x 1,586,614
# (roughly 20 TB as float64) and will not fit in memory. Consider computing
# one row on demand, e.g. cosine_similarity(count_matrix[i], count_matrix),
# or working on a subset of the reviews.
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

In [None]:
# Get the number of rows and columns in cosine_sim
