In [1]:
import pandas as pd
import numpy as np
import warnings
# Suppress every warning category for the rest of the session. This also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw review data.
df = pd.read_csv('reviews.csv')
# Missing reviews are read as NaN. Blank them out before casting so they do
# not become the literal string 'nan' and get analysed as review text.
df['review'] = df['review'].fillna('').astype(str)

In [3]:
# Drop the leftover CSV index column. Rebinding with errors='ignore' (instead
# of inplace=True) keeps this cell safe to re-run once the column is gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,product_name,review,user_rating
0,Kentucky Brunch Brand Stout,"2016 Silver Wax. Aroma has whiskey, maple, tof...",4.8
1,Kentucky Brunch Brand Stout,The beer pours Pitch Black with a frothy tan h...,4.74
2,Kentucky Brunch Brand Stout,Probably the smoothest beer I have ever had. S...,4.68
3,Kentucky Brunch Brand Stout,"Dark black, very thick, a little bit of tan he...",5.0
4,Kentucky Brunch Brand Stout,Poured black as ink with thin ruby edges at 58...,4.97


In [5]:
# Independent snapshot of the frame as it stands here, taken before the
# derived columns below are added to `df`.
final_df = df.copy(deep=True)

### Part B

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
# Tokenizer that keeps runs of word characters, and the English stop-word set.
tokenizer = RegexpTokenizer(r'\w+')
stops = set(stopwords.words('english'))


def clean(s):
    """Lowercase `s`, tokenize it, and return the non-stop-word tokens as a list.

    Tokenization uses the module-level `tokenizer`; a token is discarded when
    it appears in the module-level `stops` set. Token order is preserved.
    """
    kept = []
    for token in tokenizer.tokenize(s.lower()):
        if token in stops:
            continue
        kept.append(token)
    return kept

In [7]:
# Token list (lowercased, stop words removed) for every review.
df['review_tokenized'] = df['review'].map(clean)

In [8]:
from collections import Counter

def get_counts(s):
    """Return a `Counter` giving how many times each element of `s` occurs."""
    tally = Counter()
    tally.update(s)
    return tally

In [9]:
# Per-review token frequencies.
df['word_counts'] = df['review_tokenized'].map(get_counts)

In [10]:
# Running tally per attribute word. Each call to `count_attributes` adds at
# most one to each entry.
attribute_counts = dict.fromkeys(
    ['aggressive', 'balanced', 'complex', 'crisp', 'fruity', 'hoppy', 'malty', 'robust'],
    0,
)


def count_attributes(s):
    """Increment `attribute_counts` once for each tracked attribute that is a key of `s`.

    `s` is a mapping (e.g. a per-review `Counter`). Only the presence of a key
    matters, not its count. Mutates the module-level `attribute_counts` and
    returns None.
    """
    mentioned = [word for word in s.keys() if word in attribute_counts]
    for word in mentioned:
        attribute_counts[word] += 1

In [11]:
# Reset the tally first so re-running this cell does not double-count.
for attribute in attribute_counts:
    attribute_counts[attribute] = 0

# `count_attributes` works purely by side effect, so loop explicitly instead
# of building a throwaway Series of None values with `.apply`.
for review_counts in df['word_counts']:
    count_attributes(review_counts)

print(attribute_counts)

{'aggressive': 79, 'balanced': 764, 'complex': 544, 'crisp': 285, 'fruity': 442, 'hoppy': 228, 'malty': 164, 'robust': 103}


In [12]:
# The three attributes carried forward into the similarity step.
attribute_list = [
    'balanced',
    'complex',
    'fruity',
]

### Part C

In [13]:
import spacy
# Load the English pipeline by package name. The "en" shortcut link was
# removed in spaCy v3, while the package name works in both v2 and v3.
# NOTE(review): assumes "en" pointed at en_core_web_sm (the default download) — confirm.
nlp = spacy.load("en_core_web_sm")

In [14]:
# The attribute query never changes, so parse it once here instead of on
# every call.
_attribute_query_doc = nlp("fruity complex balanced")


def get_sim_score(s):
    """Return the spaCy similarity between the text `s` and the attribute query.

    `s` is parsed with the module-level `nlp` pipeline and compared, via
    `Doc.similarity`, against the pre-parsed "fruity complex balanced" document.
    """
    return nlp(s).similarity(_attribute_query_doc)

In [15]:
# Similarity of every raw review to the attribute query. This parses each
# review with spaCy.
temp = df['review'].map(get_sim_score)

In [16]:
# Store the per-review similarity scores computed in the previous cell.
df['similarity'] = temp

In [17]:
# The 300 reviews most similar to the attribute query.
# NOTE(review): `df_top_300` is not referenced again in the cells visible
# here — confirm whether the later aggregation was meant to use it.
by_similarity = df.sort_values(by='similarity', ascending=False)
df_top_300 = by_similarity.iloc[:300]

### Part D

In [18]:
# Lowercase every word and collapse runs of whitespace to single spaces.
df['clean_review'] = df['review'].apply(lambda text: " ".join(word.lower() for word in text.split()))
# Strip punctuation, then digits. The patterns are raw strings (so `\w`, `\s`
# and `\d` are not invalid escapes) and regex=True is explicit, because
# pandas >= 2.0 treats the pattern as a literal string by default and would
# otherwise leave the text unchanged.
df['clean_review'] = df['clean_review'].str.replace(r'[^\w\s]', '', regex=True)
df['clean_review'] = df['clean_review'].str.replace(r'[\d]', '', regex=True)
# Remove English stop words using the module-level `stops` set.
df['clean_review'] = df['clean_review'].apply(lambda text: " ".join(word for word in text.split() if word not in stops))

In [19]:
from nltk.stem import PorterStemmer
st = PorterStemmer()


def _stem_words(text):
    """Return `text` with every whitespace-separated word replaced by its Porter stem."""
    return " ".join(map(st.stem, text.split()))


# Overwrites `clean_review` with its stemmed form.
df['clean_review'] = df['clean_review'].map(_stem_words)

In [20]:
from textblob import TextBlob

def sentiment(s):
    """Return the TextBlob polarity of the text `s` (the first field of its sentiment)."""
    blob = TextBlob(s)
    return blob.sentiment.polarity


# NOTE(review): polarity is computed on the stemmed `clean_review` text, and
# stems may not match the entries a sentiment lexicon expects — confirm intended.
df['sentiment_score'] = df['clean_review'].map(sentiment)

In [21]:
# Keep only the scoring columns, indexed by beer name. One row per review.
summary_columns = ['product_name', 'user_rating', 'similarity', 'sentiment_score']
df_copy = df.loc[:, summary_columns].set_index('product_name')

# Display the reviews ordered from most to least positive sentiment.
df_copy.sort_values(by='sentiment_score', ascending=False)

Unnamed: 0_level_0,user_rating,similarity,sentiment_score
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Haze,4.99,0.524277,1.000000
Fou' Foune,4.90,0.282876,1.000000
Vicinity,4.96,0.469715,1.000000
A Deal With The Devil,5.00,0.351575,1.000000
Green,4.60,0.430825,1.000000
...,...,...,...
V.S.O.J.,4.36,0.478472,-0.500000
Galaxy Dry Hopped Fort Point Pale Ale,4.75,0.424284,-0.650000
The Rusty Nail,4.50,0.548587,-0.714286
"Somewhere, Something Incredible Is Waiting To Be Known",4.52,0.421191,-0.800000


### Part E

In [22]:
# Average each score per beer across all of its reviews.
score_columns = ['user_rating', 'similarity', 'sentiment_score']
df_group = df_copy.groupby('product_name')[score_columns].agg('mean')

# Top three beers by mean sentiment. The ranking uses sentiment only, and
# similarity is shown alongside it.
df_group.sort_values(by='sentiment_score', ascending=False).head(3)

Unnamed: 0_level_0,user_rating,similarity,sentiment_score
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Great,4.5732,0.520491,0.297944
Zenne Y Frontera,4.764,0.510925,0.291894
Citraquench'l,4.5208,0.549336,0.285191


For a customer looking for a balanced, complex, and fruity beer, we would recommend Great, Zenne Y Frontera, or Citraquench'l. This recommendation was derived by finding the beers with the most positive reviews that were similar to the features the customer was looking for. Note that the ranking in the cell above sorts by mean sentiment score only; the mean similarity score is shown alongside but is not used to filter or rank the beers.

### Part F

In [27]:
# Mean user rating per beer, computed from the `final_df` snapshot.
mean_rating = final_df.groupby('product_name')['user_rating'].mean()
df_group_og = mean_rating.to_frame()

In [29]:
# Join the per-beer scores with the per-beer mean rating on the beer-name
# index. Both frames carry a `user_rating` column, so the merge suffixes them
# `_x` / `_y`; the right-hand copy is dropped.
merged = (
    df_group
    .merge(df_group_og, left_index=True, right_index=True)
    .drop('user_rating_y', axis=1)
)

In [30]:
# Rename by label instead of overwriting `merged.columns` positionally, so a
# change in column order cannot silently mislabel the data. After the merge,
# the rating column kept from `df_group` is named 'user_rating_x'. Renaming a
# label that is already gone is a no-op, so the cell is safe to re-run.
merged = merged.rename(columns={'user_rating_x': 'user_rating'})

In [31]:
# Top three beers by mean user rating alone.
merged.sort_values(by='user_rating', ascending=False).head(3)

Unnamed: 0_level_0,user_rating,similarity,sentiment_score
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Kentucky Brunch Brand Stout,4.812,0.491261,0.23545
Chemtrailmix,4.811176,0.509298,0.24888
Barrel-Aged Abraxas,4.7964,0.516758,0.190927


If we ignore the similarity and sentiment scores, we would recommend three completely different beers: Kentucky Brunch Brand Stout, Chemtrailmix, and Barrel-Aged Abraxas. Neither their similarity scores nor their sentiment scores are on par with those of the original three recommendations. In fact, the sentiment scores for these beers are noticeably lower, even though their user ratings are the three highest in the data set. The most important difference, though, is in the similarity scores. If a user specifies the attributes that matter most to them in a beer, the recommendation system should recommend beers that cater to those attributes. Thus, the three beers above would clearly not fit the bill for this user.