In [1]:
import re
import os.path
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
# import warnings
# warnings.filterwarnings("ignore")

# DATA Path for BeerAdvocate
DATA_FOLDER = 'Data/BeerAdvocate/'
BEER_BA_DATA = DATA_FOLDER+"beers.csv"
BREWERY_BA_DATA = DATA_FOLDER+"breweries.csv"
USERS_BA_DATA = DATA_FOLDER+"users.csv"
REVIEWS_BA_DATA = DATA_FOLDER+"reviews.txt.gz"
RATINGS_BA_DATA = DATA_FOLDER+"ratings.txt.gz"

COMPRESSION = 'gzip'

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')


<spacytextblob.spacytextblob.SpacyTextBlob at 0x18041e13ac0>

In [2]:
review_ba = pd.read_pickle('Data/BeerAdvocate/review_ba.pkl')

## Sentiment analysis

In the <strong>Mileston 2</strong>, we observed that some beers have a marked seasonality. Many reviews contain words such as "summer", "winter", "autumn" or "spring", and we would like to push our analysis further :
* Are those reviews more positive or negative towards the mentioned seasons
* Can we grasp which words are used to qualify the beers in question, e.g does the beer feel warm, sour or bitter


We are looking for words appearing rather frequently to describe a beer which is trendy during Christmas time for example. For that reason, we will conduct an <strong>aspect-based sentiment analysis</strong>.

In order to do that, we will be looking at adjectives and their relative nouns to emphasize the beer's characteristics. That corresponds to <strong>adjectival modifiers</strong> mainly.

In [3]:
def analyze_sentiments(text: str) -> tuple([bool,list[str],list[str]]):
    """Analyze the review to determine wether it is positive or negative

    Args:
        text (str): The text to be analyzed

    Returns:
        sentiment: A boolean; true if the sentiment is positive, false otherwise
    """
    return nlp(text)._.blob.polarity > 0


In [4]:
# a small function to choose wich token should be kept as an aspect, and relevant for the description
selected = lambda token : True if (token.dep_ == "amod" or token.dep_=="compound") and (token.pos_ == "ADJ" or token.pos_=="ADV") else False

def get_aspects(text):
    """From several sentences, detect key aspects and their ajective qualifying the object in question

    Args:
        text (str): the text to be analyzed

    Returns:
        list (set): A list containing pairs of (aspect, description)
    """
    aspects = []
    for sentence in text.split("."):
        for token in nlp(sentence):
            # if (token.dep_ == "amod" or token.dep_=="compound") and (token.pos_ == "ADJ" or token.pos_=="NOUN"):
            if selected(token):
                aspects.append({'aspect': token.head.text, 'description': token})    
    # return {item['aspect'] for item in aspects}, {item['description'] for item in aspects} 
    return aspects


For now let's just focus on the reviews containing "winter".

In [5]:
review_ba["winter"]=review_ba['text'].apply(lambda x : int(bool(len(re.findall('winter',x.strip().lower())))))
beer_winter_style=review_ba.loc[review_ba['winter'] == 1]
beer_winter_style.reset_index(drop=True,inplace=True)
beer_winter_style = beer_winter_style[:1000] # let's just check the 10000 first for now

Let's build a dataframe regrouping the results of the function above for all the "winter" beers. This dataframe should contain :
* a boolean indicating if the review shows appreciation or not towards the beer *is_positive* : 1 for True, 0 otherwise
* a list of aspects characterising the beer, or the feeling of the reviewer *aspects*

In [6]:
winter_reviews = pd.DataFrame()
winter_reviews[["beer_id","style"]] = beer_winter_style[["beer_id","style"]]
winter_reviews["is_positive"] = beer_winter_style.text.apply(lambda text: analyze_sentiments(text))
winter_reviews = winter_reviews[winter_reviews['is_positive']==True]
winter_reviews["aspects"] = beer_winter_style.text.apply(lambda text: get_aspects(text))
winter_reviews.reset_index(inplace=True)
winter_reviews.head()

Unnamed: 0,index,beer_id,style,is_positive,aspects
0,0,20842,English Bitter,True,"[{'aspect': 'pint', 'description': nonic}, {'a..."
1,1,20842,English Bitter,True,"[{'aspect': 'amber', 'description': Clear}, {'..."
2,2,79390,Winter Warmer,True,"[{'aspect': 'bottle', 'description': 500ml}, {..."
3,3,263227,Winter Warmer,True,"[{'aspect': 'town', 'description': small}, {'a..."
4,4,107316,Herbed / Spiced Beer,True,"[{'aspect': 'brass', 'description': clear}, {'..."


In [7]:
dicts = []

for i in range(len(winter_reviews)):
    new_dict = {item['aspect']:item['description'] for item in winter_reviews["aspects"][i]}
    dicts.append(pd.DataFrame.from_dict(new_dict,columns=["descriptions"],orient="index"))

In [8]:
aspects = pd.concat(dicts)
aspects["aspects"] = aspects.index
aspects["index_column"] = range(len(aspects))
aspects.index = aspects["index_column"]
aspects.head()

Unnamed: 0_level_0,descriptions,aspects,index_column
index_column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,nonic,pint,0
1,Irish,ingredients,1
2,nice,tan,2
3,nice,cap,3
4,similar,color,4


In [9]:
new_aspects = aspects.groupby('aspects')['descriptions'].apply(lambda x: ','.join(x.astype(str))).reset_index()
# filtering weird aspect caught
new_aspects["aspect_special"]=new_aspects['aspects'].apply(lambda x : True if re.search(r'[^a-zA-Z]+',x) else False)
new_aspects = new_aspects[(new_aspects['aspect_special']==False)]
# sorting the dataframe by the most described aspects 
new_aspects["descri_len"] = new_aspects["descriptions"].apply(lambda x: len(x.split(",")))
new_aspects.sort_values(by="descri_len",ascending=False,inplace=True)

new_aspects.reset_index(inplace=True)
new_aspects.head(15)

Unnamed: 0,index,aspects,descriptions,aspect_special,descri_len
0,938,head,"tan,creamy,dense,White,massive,white,white,dec...",False,620
1,330,beer,"boring,good,balanced,First,excellent,Brisish,n...",False,344
2,459,carbonation,"bright,moderate,moderate,existent,light,medium...",False,256
3,560,color,"similar,dark,true,black,oatmeal,brown,golden,c...",False,196
4,1045,lacing,"patchy,heavy,minimal,little,soapy,Decent,best,...",False,180
5,1107,malt,"grassy,Sweet,full,roasted,toasted,pale,creamy,...",False,166
6,372,body,"wet,coloured,full,creamy,dry,medium,dark,decen...",False,163
7,1110,malts,"dark,roasted,robust,dark,dark,sweet,Roasted,sw...",False,160
8,240,ale,"Irish,dark,English,Scottish,old,old,old,old,ol...",False,148
9,813,finish,"dry,dry,mild,mild,warm,wet,dry,crisp,boozy,alc...",False,137


Based on the dataframe above, the most talked about aspects are : the head (the foam above), the beer itself, the carbonation and the color/colour. For each of those characteristics, we can check the most used words to describe them, and some are standing out.

In [10]:
def top_characteristics(dataframe,nb_of_adjectives=3):
    """gives the top nb_of_characteristics for all aspects of the dataframe

    Args:
        dataframe (pd.DataFrame): the dataframe you want to extract the data from
        nb_of_adjectives (int, optional): the number of yielded adjectives. Defaults to 3.

    Returns:
        List: List with the top adjectives of each aspect
    """
    characteristics = []
    for i in range(len(dataframe)):
        MyList = [word.lower() for word in dataframe["descriptions"][i].split(',')] # splitting by words
        characteristics.append(sorted({i:MyList.count(i) for i in MyList}.items(),key=lambda x: x[1],reverse=True)[:nb_of_adjectives])
    return characteristics

def get_top_adjectives(dataframe,aspect,top=3):
    """Get the top adjectives of a certain aspect

    Args:
        dataframe (pd.DataFrame): the dataframe you want to extract the data from
        aspect (str): the aspect you want to consider
        top (int, optional): how much adjectives you want. Defaults to 3.

    Returns:
        List: the most used adjectives describing the aspect
    """
    try:
        target = dataframe[dataframe["aspects"]==aspect]
        adjectives = [word.lower() for word in target.descriptions.item().split(',')]
        sorted_top = sorted({i:adjectives.count(i) for i in adjectives}.items(),key=lambda x: x[1],reverse=True)[:top]
        return sorted_top
    except:
        raise ValueError("No such aspect identified in dataframe")
    

In [11]:
get_top_adjectives(new_aspects,"malt")

[('sweet', 40), ('roasted', 35), ('toasted', 12)]

Typically for winter beers, a moderate bitterness is appreciated. A creamy white head too, with a clear brown or dark color, while a sweet roasted malt should be liked too.