# Imports and Config

In [1]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from scipy import stats

In [2]:
# Install necessary nltk packages
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\edgar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\edgar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Path to beers csv file
data_path = "BeerAdvocate/beers.csv"

# Data Loading

In [4]:
beers = pd.read_csv(data_path)

In [5]:
# Keep only significant beers and columns (at least 10 reviews)
rated_beers = beers[beers["nbr_ratings"] >= 10][["beer_id", "beer_name", "nbr_ratings", "avg"]]

# Evaluating the Importance of Specific Keywords in Beer Naming

In [7]:
# Load stemmer to group similar words
stemmer = PorterStemmer()

In [8]:
# A set of stop words to avoid, use english as of now for simplicity
stop_words = set(stopwords.words("english"))

In [9]:
def convert_name_to_stemmed_keywords(name):
    # Lowercase and remove punctuation
    processed_name = name.lower()
    processed_name = processed_name.translate(str.maketrans('', '', string.punctuation))

    # Split by word and remove stop words
    tokens = word_tokenize(processed_name)
    tokens = [word for word in tokens if word not in stop_words]

    # Stem words to handle similar forms of words (tense, plural, ...)
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    # Return these stemmed words as the keywords of the beer name
    return stemmed_tokens

In [10]:
# Add keywords column with stemmed keywords for each beer name
rated_beers["keywords"] = rated_beers["beer_name"].apply(convert_name_to_stemmed_keywords)

In [11]:
# Find the number, average, standard deviation, minimum and maximum beer ratings for each keyword
exploded_keywords = rated_beers.explode("keywords")
keyword_ratings = exploded_keywords.groupby("keywords")["avg"].agg(["mean", "count", "std", "min", "max"]).reset_index()

In [12]:
# Keep only keywords which are common and significant (>= 100 beers with keyword)
significant_keywords = keyword_ratings[keyword_ratings["count"] >= 100]

In [23]:
# Display 10 most positively significant keywords
ten_best = significant_keywords.nlargest(10, "mean")
ten_best

Unnamed: 0,keywords,mean,count,std,min,max
24475,w,4.150929,140,0.368165,2.19,4.84
3188,blend,4.144384,146,0.334802,2.62,4.86
430,2016,4.108108,148,0.317328,2.99,4.87
15491,mosaic,4.094598,174,0.259879,2.83,4.78
9268,galaxi,4.094586,133,0.242529,3.35,4.56
2643,bean,4.085391,128,0.334089,3.05,4.77
3677,brandi,4.076615,130,0.384104,2.51,4.8
20863,simco,4.07322,118,0.258344,3.15,4.67
12400,juic,4.072736,106,0.303602,3.36,4.83
1183,age,4.056598,1746,0.322567,2.17,4.9


In [24]:
# Display 10 most negatively significant keywords
ten_worst = significant_keywords.nsmallest(10, "mean")
ten_worst

Unnamed: 0,keywords,mean,count,std,min,max
13725,light,2.964534,311,0.696015,1.5,4.52
18197,premium,3.105729,288,0.464278,1.84,4.16
17701,pilsen,3.236937,111,0.432113,2.0,4.04
8108,export,3.338774,106,0.545017,2.11,4.4
14376,malt,3.373365,104,0.675532,1.49,4.87
13302,lager,3.405871,1390,0.450432,1.58,4.35
1462,amber,3.465235,596,0.330099,1.38,4.36
20051,saranac,3.508545,110,0.259424,2.8,4.41
16773,origin,3.518962,183,0.464663,2.09,4.51
3235,blond,3.523381,698,0.357753,2.03,4.82


In [25]:
# T-test to confirm significant differences between most polarizing keywords
with_keyword_1 = rated_beers[rated_beers["keywords"].apply(lambda x: "w" in x)]["avg"]
with_keyword_2 = rated_beers[rated_beers["keywords"].apply(lambda x: "light" in x)]["avg"]

In [26]:
t_stat, p_val = stats.ttest_ind(with_keyword_1, with_keyword_2)
t_stat, p_val

(18.952795915475715, 2.965913416175236e-59)

In [27]:
# T-test to confirm less or no differences between similarly ranked keywords
with_keyword_1 = rated_beers[rated_beers["keywords"].apply(lambda x: "w" in x)]["avg"]
with_keyword_2 = rated_beers[rated_beers["keywords"].apply(lambda x: "blend" in x)]["avg"]

In [28]:
t_stat, p_val = stats.ttest_ind(with_keyword_1, with_keyword_2)
t_stat, p_val

(0.18638182347893084, 0.8522788779103688)

In [31]:
# T-test between beers with any of the top 10 rated kaywords vs top 10 worst rated keywords
with_keyword_1 = rated_beers[rated_beers["keywords"].apply(lambda x: any(keyword in x for keyword in list(ten_best["keywords"])))]["avg"]
with_keyword_2 = rated_beers[rated_beers["keywords"].apply(lambda x: any(keyword in x for keyword in list(ten_worst["keywords"])))]["avg"]

In [32]:
t_stat, p_val = stats.ttest_ind(with_keyword_1, with_keyword_2)
t_stat, p_val

(63.9879409398508, 0.0)