In [111]:
import re
import os.path
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from statsmodels.tsa.seasonal import seasonal_decompose

from tqdm import tqdm
tqdm.pandas()
# import warnings
# warnings.filterwarnings("ignore")

# DATA Path for BeerAdvocate
DATA_FOLDER = 'Data/BeerAdvocate/'
BEER_BA_DATA = DATA_FOLDER+"beers.csv"
BREWERY_BA_DATA = DATA_FOLDER+"breweries.csv"
USERS_BA_DATA = DATA_FOLDER+"users.csv"
REVIEWS_BA_DATA = DATA_FOLDER+"reviews.txt.gz"
RATINGS_BA_DATA = DATA_FOLDER+"ratings.txt.gz"

COMPRESSION = 'gzip'

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')


<spacytextblob.spacytextblob.SpacyTextBlob at 0x24190c6cc40>

In [2]:
review_ba = pd.read_pickle('Data/BeerAdvocate/review_ba.pkl')

## Selecting seasonal Christmas beers

The beers we are interested in are the beers having a seasonality (meaning that there are more popular at one precise point of the year throughout multiple years), and especially the popular ones during Christmas.

In [24]:
def season(data,j,id_name, name,nbr_rev):
    
    output = pd.DataFrame(np.nan, index=[j], columns=[name,'max_month','max_season','min_month','min_season','nbr_review'])
    output[name] = id_name
    output['nbr_review'] = nbr_rev
    
    if len(data)<24: #remove data with less than 2 years values
        return(output)
    
    else:
        #decompose seasonnality of reviews
        decompose_result_mult = seasonal_decompose(data, model="multiplicative", period=12)
        seasonal = decompose_result_mult.seasonal
        trend = decompose_result_mult.trend
        
        #remove beers that have less than 5 reviews over their most reviewed month of the year
        max_review_per_month_year = pd.DataFrame(trend).groupby(pd.Grouper(freq='1Y')).max().dropna()
        if sum(max_review_per_month_year['trend']<3)>0:
            return(output)

        
        # Maximum review period
        buffer_max = pd.DataFrame()
        largest_season = seasonal.nlargest(1, keep='all') #find max values
        buffer_max['date'] = largest_season.index  # find time 
        buffer_max['date'] = pd.to_datetime(buffer_max['date']) 
        output['max_month'] = buffer_max['date'].dt.month.median() #median of months
        output['max_season'] = seasonal.max()
            
        # Minimum review period
        buffer_min = pd.DataFrame()
        smallest_season = seasonal.nsmallest(1, keep='all') #find max values
        buffer_min['date'] = smallest_season.index  # find time
        buffer_min['date'] = pd.to_datetime(buffer_min['date']) 
        output['min_month'] = buffer_min['date'].dt.month.median() #median of months
        output['min_season'] = seasonal.min()
        
        return(output)

In [25]:
beer_styles = review_ba['style'].unique() # finds each beer style
review_ba['date'] = pd.to_datetime(review_ba['date'], unit='s')
style_season_ba = pd.DataFrame(columns = ['style','max_month','max_season','min_month','min_season','nbr_review'])
for j in range(len(beer_styles)):
    beer = pd.DataFrame()
    beer = review_ba[(review_ba['style'] == beer_styles[j])] # gets reviews of the given style
    beer.set_index('date', inplace=True)
    nbr_rev = len(beer)
    beer = beer.groupby(pd.Grouper(freq='M'))['text'].count().reset_index(name='Count')
    beer = beer.iloc[beer.loc[beer['Count'] >= 10].index[0]:]
    beer.rename(columns={'Count' :''}, inplace=True)
    beer.set_index('date', inplace=True)
    beer.replace(0,1,inplace=True)
    style_season_ba = pd.concat([style_season_ba, season(beer,j,beer_styles[j],'style',nbr_rev)], ignore_index=False)
style_season_ba = style_season_ba.dropna()

In [27]:
filtered_style_season_ba = style_season_ba[style_season_ba['max_season']>1.3] # safety threshold
filtered_style_season_ba = filtered_style_season_ba[filtered_style_season_ba['nbr_review']>500] # removing noise

In [42]:
december_ba = filtered_style_season_ba[filtered_style_season_ba['max_month'] == 12] # taking only the december-seasoned beers
december_ba = december_ba.reset_index(drop=True)
july_ba = filtered_style_season_ba[filtered_style_season_ba['max_month'] == 7] # july-seasoned beers for comparison
july_ba = july_ba.reset_index(drop=True)

In [43]:
seasonal_beer_ba = pd.DataFrame(columns = ['beer_name','max_month','max_season','min_month','min_season','nbr_review'])

for j in range(len(december_ba)):
    beer = review_ba[(review_ba['style'] == december_ba['style'][j])] # gets reviews of the given style
    names_beer_ba = beer['beer_name'].unique()
    
    for k in range(len(names_beer_ba)):
        beer_ind = beer[(beer['beer_name'] == names_beer_ba[k])]
        beer_id = int(beer_ind['beer_id'].unique()[0])
        nbr_rev = len(beer_ind)
        beer_ind.set_index('date', inplace=True)
        beer_ind = beer_ind.groupby(pd.Grouper(freq='M'))['text'].count().reset_index(name='Count')
        n = beer_ind.loc[beer_ind['Count'] >= 10]
        if n.empty:
            continue
        beer_ind = beer_ind.iloc[beer_ind.loc[beer_ind['Count'] >= 10].index[0]:]
        beer_ind.rename(columns={'Count' :''}, inplace=True)
        beer_ind.set_index('date', inplace=True)
        beer_ind.replace(0,0.01,inplace=True)
        seasonal_beer_ba = pd.concat([seasonal_beer_ba, season(beer_ind,beer_id,names_beer_ba[k],'beer_name',nbr_rev)], ignore_index=False)
        
seasonal_beer_ba = seasonal_beer_ba.dropna()
filtered_seasonal_beer_ba = seasonal_beer_ba[seasonal_beer_ba['max_season']>1.3]
filtered_seasonal_beer_ba = filtered_seasonal_beer_ba[filtered_seasonal_beer_ba['nbr_review']>500]
chritmas_beers = filtered_seasonal_beer_ba[filtered_seasonal_beer_ba['max_month'] == 12]
chritmas_beers.head()

Unnamed: 0,beer_name,max_month,max_season,min_month,min_season,nbr_review
14630,Christmas Ale,12.0,5.289947,9.0,0.101084,707
1881,Samuel Adams Old Fezziwig Ale,12.0,4.004037,8.0,0.089006,1405
7531,Tröegs The Mad Elf,12.0,3.301245,9.0,0.15003,1271
101,Samuel Adams Winter Lager,12.0,3.715739,9.0,0.038102,1985


In [75]:
seasonal_beer_ba = pd.DataFrame(columns = ['beer_name','max_month','max_season','min_month','min_season','nbr_review'])

for j in range(len(july_ba)):
    beer = review_ba[(review_ba['style'] == july_ba['style'][j])] # gets reviews of the given style
    names_beer_ba = beer['beer_name'].unique()
    
    for k in range(len(names_beer_ba)):
        beer_ind = beer[(beer['beer_name'] == names_beer_ba[k])]
        beer_id = int(beer_ind['beer_id'].unique()[0])
        nbr_rev = len(beer_ind)
        beer_ind.set_index('date', inplace=True)
        beer_ind = beer_ind.groupby(pd.Grouper(freq='M'))['text'].count().reset_index(name='Count')
        n = beer_ind.loc[beer_ind['Count'] >= 10]
        if n.empty:
            continue
        beer_ind = beer_ind.iloc[beer_ind.loc[beer_ind['Count'] >= 10].index[0]:]
        beer_ind.rename(columns={'Count' :''}, inplace=True)
        beer_ind.set_index('date', inplace=True)
        beer_ind.replace(0,0.01,inplace=True)
        seasonal_beer_ba = pd.concat([seasonal_beer_ba, season(beer_ind,beer_id,names_beer_ba[k],'beer_name',nbr_rev)], ignore_index=False)
        
seasonal_beer_ba = seasonal_beer_ba.dropna()
filtered_seasonal_beer_ba = seasonal_beer_ba[seasonal_beer_ba['max_season']>1.3]
filtered_seasonal_beer_ba = filtered_seasonal_beer_ba[filtered_seasonal_beer_ba['nbr_review']>500]
july_beers = filtered_seasonal_beer_ba[filtered_seasonal_beer_ba['max_month'] == 7]
july_beers.head()

Unnamed: 0,beer_name,max_month,max_season,min_month,min_season,nbr_review
113585,Hefeweizen,7.0,1.706209,11.0,0.569858,1885
48434,Sierra Nevada Kellerweis Hefeweizen,7.0,1.643195,11.0,0.606268,1376


In [46]:
christmas_reviews = review_ba.loc[review_ba["beer_name"].isin(chritmas_beers["beer_name"])]

In [79]:
july_reviews = review_ba.loc[review_ba["beer_name"].isin(july_beers["beer_name"])]

## Sentiment analysis

In the <strong>Mileston 2</strong>, we observed that some beers have a marked seasonality. Many reviews contain words such as "summer", "winter", "autumn" or "spring", and we would like to push our analysis further :
* Are those reviews more positive or negative towards the mentioned seasons
* Can we grasp which words are used to qualify the beers in question, e.g does the beer feel warm, sour or bitter


We are looking for words appearing rather frequently to describe a beer which is trendy during Christmas time for example. For that reason, we will conduct an <strong>aspect-based sentiment analysis</strong>.

In order to do that, we will be looking at adjectives and their relative nouns to emphasize the beer's characteristics. That corresponds to <strong>adjectival modifiers</strong> mainly.

In [3]:
def analyze_sentiments(text: str) -> tuple([bool,list[str],list[str]]):
    """Analyze the review to determine wether it is positive or negative

    Args:
        text (str): The text to be analyzed

    Returns:
        sentiment: A boolean; true if the sentiment is positive, false otherwise
    """
    return nlp(text)._.blob.polarity > 0


In [4]:
# a small function to choose wich token should be kept as an aspect, and relevant for the description
selected = lambda token : True if (token.dep_ == "amod" or token.dep_=="compound") and (token.pos_ == "ADJ" or token.pos_=="ADV") else False

def get_aspects(text):
    """From several sentences, detect key aspects and their ajective qualifying the object in question

    Args:
        text (str): the text to be analyzed

    Returns:
        list (set): A list containing pairs of (aspect, description)
    """
    aspects = []
    for sentence in text.split("."):
        for token in nlp(sentence):
            # if (token.dep_ == "amod" or token.dep_=="compound") and (token.pos_ == "ADJ" or token.pos_=="NOUN"):
            if selected(token):
                aspects.append({'aspect': token.head.text, 'description': token})    
    # return {item['aspect'] for item in aspects}, {item['description'] for item in aspects} 
    return aspects


Let's build a dataframe regrouping the reviews of interest. This dataframe should contain :
* a boolean indicating if the review shows appreciation or not towards the beer *is_positive* : 1 for True, 0 otherwise
* a list of aspects characterising the beer, or the feeling of the reviewer *aspects*

In [112]:
def build_aspects_dataset(reviews, sentiment=True):
    """Take a pd.DataFrame reviews, and return a dataframe containing the aspects of the reviews' text

    Args:
        reviews (pd.DataFrame): Several reviews; each review must contain an id, style and a text.
        sentiment (boolean): Wether the analyzed reviews should be positive or negative toward the reviewed beer.

    Returns:
        pd.DataFrame: A dataframe containing the aspects involved in the reviews, with the words best qualifying them. 
    """

    # building a dataframe with the is_positive and the aspects
    df = pd.DataFrame()
    df[["beer_id","style"]] = reviews[["beer_id","style"]]
    tqdm.pandas(desc="Analyzing the reviews' sentiment")
    df["is_positive"] = reviews.text.progress_apply(lambda text: analyze_sentiments(text))
    df = df[df['is_positive']==True]
    tqdm.pandas(desc="Getting the reviews' aspects")
    df["aspects"] = reviews.text.progress_apply(lambda text: get_aspects(text))
    df.reset_index(inplace=True)
    # formatting the aspects in a more convenient way
    dicts = []
    for i in range(len(df)):
        new_dict = {item['aspect']:item['description'] for item in df["aspects"][i]}
        dicts.append(pd.DataFrame.from_dict(new_dict,columns=["descriptions"],orient="index"))
    # building a new dataframe with only the aspects and the descriptions
    aspects = pd.concat(dicts)
    aspects["aspects"] = aspects.index
    aspects["index_column"] = range(len(aspects))
    aspects.index = aspects["index_column"]
    # regrouping the aspects dataframe by aspects and merging the words describing them
    new_aspects = aspects.groupby('aspects')['descriptions'].apply(lambda x: ','.join(x.astype(str))).reset_index()
    # filtering weird aspect caught to avoid numbers and weird characters
    new_aspects["aspect_special"]=new_aspects['aspects'].apply(lambda x : True if re.search(r'[^a-zA-Z]+',x) else False)
    new_aspects = new_aspects[(new_aspects['aspect_special']==False)]
    # sorting the dataframe by the most described aspects
    new_aspects["descri_len"] = new_aspects["descriptions"].apply(lambda x: len(x.split(",")))
    new_aspects.sort_values(by="descri_len",ascending=False,inplace=True)
    new_aspects.reset_index(inplace=True)
    return new_aspects


In [113]:
christmas_aspects = build_aspects_dataset(christmas_reviews)
july_aspects = build_aspects_dataset(july_reviews)

Analyzing the reviews' sentiment: 100%|██████████| 6043/6043 [02:44<00:00, 36.65it/s]
Getting the reviews' aspects: 100%|██████████| 6043/6043 [07:56<00:00, 12.68it/s]
Analyzing the reviews' sentiment: 100%|██████████| 3294/3294 [01:23<00:00, 39.53it/s]
Getting the reviews' aspects: 100%|██████████| 3294/3294 [04:05<00:00, 13.42it/s]


In [85]:
def top_characteristics(dataframe,nb_of_adjectives=3):
    """gives the top nb_of_characteristics for all aspects of the dataframe

    Args:
        dataframe (pd.DataFrame): the dataframe you want to extract the data from
        nb_of_adjectives (int, optional): the number of yielded adjectives. Defaults to 3.

    Returns:
        List: List with the top adjectives of each aspect
    """
    characteristics = []
    for i in range(len(dataframe)):
        MyList = [word.lower() for word in dataframe["descriptions"][i].split(',')] # splitting by words
        characteristics.append(sorted({i:MyList.count(i) for i in MyList}.items(),key=lambda x: x[1],reverse=True)[:nb_of_adjectives])
    return characteristics

def get_top_adjectives(dataframe,aspect,top=3):
    """Get the top adjectives of a certain aspect

    Args:
        dataframe (pd.DataFrame): the dataframe you want to extract the data from
        aspect (str): the aspect you want to consider
        top (int, optional): how much adjectives you want. Defaults to 3.

    Returns:
        List: the most used adjectives describing the aspect
    """
    try:
        target = dataframe[dataframe["aspects"]==aspect]
        adjectives = [word.lower() for word in target.descriptions.item().split(',')]
        sorted_top = sorted({i:adjectives.count(i) for i in adjectives}.items(),key=lambda x: x[1],reverse=True)[:top]
        return sorted_top
    except:
        raise ValueError("No such aspect identified in dataframe")
    