# Finding season-dependent keywords

In this notebook, we investigate the reviews based on the season in which they were written. We will find words that are more commonly used in the winter season (specifically January through March, inclusive) and those that are more commonly used during the summer (specifically July through September, inclusive).

In [20]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats
import seaborn as sns
import statsmodels.formula.api as smf
from nltk.tokenize import RegexpTokenizer

In [2]:
# Directory holding the BeerAdvocate files. File names are appended to it by
# plain string concatenation, so the trailing slash is required.
DATA_PATH = './datasets/BeerAdvocate/'

In [3]:
# load Beer Advocate data from pickles
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
reviews = pd.read_pickle(DATA_PATH + 'reviews.pkl')
reviews.head()  # preview the first rows

Unnamed: 0,Beer Name,Beer Id,Brewery Name,Brewery Id,Style,Abv,Date,Username,User Id,Appearance,Aroma,Palate,Taste,Overall,Rating,Text,Review
0,Régab,142544,Societe des Brasseries du Gabon (SOBRAGA),37262,Euro Pale Lager,4.5,2015-08-20 12:00:00,nmann08,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",
1,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2009-02-20 12:00:00,StJamesGate,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67,Pours pale copper with a thin head that quickl...,
2,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2006-03-13 12:00:00,mdagnew,mdagnew.19527,4.0,3.5,3.5,4.0,3.5,3.73,"500ml Bottle bought from The Vintage, Antrim.....",
3,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-12-01 12:00:00,helloloser12345,helloloser12345.10867,4.0,3.5,4.0,4.0,4.5,3.98,Serving: 500ml brown bottlePour: Good head wit...,
4,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-08-30 12:00:00,cypressbob,cypressbob.3708,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ...",


In [5]:
# Find the month (1-12) in which each review was written.
# The vectorized .dt accessor replaces a per-row Python lambda; to_datetime is
# a no-op when 'Date' is already datetime64 and converts it otherwise.
reviews['Month'] = pd.to_datetime(reviews['Date']).dt.month
# All beer styles present in the data. NOTE: the analysis cell further down
# replaces this with a hard-coded list of styles.
styles = reviews['Style'].unique()

In [6]:
# Compiled once instead of building a new tokenizer for every review.
# Matches the same runs of ASCII letters as RegexpTokenizer(r'[a-zA-Z]+').
_WORD_PATTERN = re.compile(r'[a-zA-Z]+')


def get_words(text):
    """Split a review into its lowercase alphabetic words.

    Parameters
    ----------
    text : str
        Raw review text. Anything that is not a string (e.g. the NaN that
        pandas uses for a missing review) is treated as containing no words.

    Returns
    -------
    list of str
        Every maximal run of ASCII letters in `text`, lowercased, in order of
        appearance. Digits, punctuation and non-ASCII letters act as
        separators.
    """
    if not isinstance(text, str):
        return []
    # Lowercase after matching so that non-ASCII characters keep acting as
    # separators exactly as before.
    return [word.lower() for word in _WORD_PATTERN.findall(text)]

In [16]:
def get_indicator(words,keyword):
    """Return 1 if `keyword` is contained in `words`, otherwise 0."""
    # The membership test yields a bool; int() maps True/False to 1/0.
    return int(keyword in words)

In [22]:
# Minimum relative frequency gap for a word to be reported as season-dependent
thresh = 0.1
# Only the n most frequent words of each season are screened
n = 100

styles = ['American IPA', 'American Double / Imperial IPA',
       'American Double / Imperial Stout', 'American Pale Ale (APA)',
       'Russian Imperial Stout', 'Saison / Farmhouse Ale', 'American Porter',
       'American Amber / Red Ale', 'American Wild Ale',
       'Fruit / Vegetable Beer', 'Belgian Strong Dark Ale',
       'American Strong Ale', 'Witbier', 'American Brown Ale', 'Tripel',
       'Belgian Strong Pale Ale', 'American Stout', 'American Barleywine',
       'Hefeweizen', 'American Adjunct Lager', 'American Pale Wheat Ale']


def _seasonal_keywords(season, counts, total, other_season, other_counts, other_total):
    """Print and return the words that are over-represented in `season`.

    Screens the `n` most frequent words of `counts` (a value_counts Series of
    `season`) and keeps a word when its share of all words in `season` exceeds
    its share in `other_season` by more than `thresh`, measured relative to
    the share in `season`.
    """
    found = []
    for word, count in counts.iloc[:n].items():
        freq = count/total
        # .get: a word that is frequent in one season may never occur in the
        # other one, in which case its frequency there is simply 0.
        other_freq = other_counts.get(word, 0)/other_total
        if (freq-other_freq)/freq > thresh:
            found.append(word)
            print("\t {} appears {:.5f}% in {} and {:.5f}% in {}".format(
                word, 100*freq, season, 100*other_freq, other_season))
    return found


for style in styles:

    in_style = reviews.Style == style
    winter_reviews = reviews[in_style & reviews.Month.between(1, 3)]
    summer_reviews = reviews[in_style & reviews.Month.between(7, 9)]

    # Tokenize every review once; the token lists feed both the word counts
    # and the per-review keyword indicators further down.
    winter_tokens = winter_reviews['Text'].apply(get_words)
    summer_tokens = summer_reviews['Text'].apply(get_words)

    winter_large = winter_tokens.explode().value_counts()
    summer_large = summer_tokens.explode().value_counts()
    # Totals are taken from the counts: explode() leaves a NaN row for every
    # review without words, which len() would wrongly count as a word.
    winter_total = winter_large.sum()
    summer_total = summer_large.sum()

    print('Style: {}'.format(style))
    if winter_total == 0 or summer_total == 0:
        # No words for one of the seasons: frequencies cannot be compared
        continue

    keywords = (
        _seasonal_keywords('winter', winter_large, winter_total,
                           'summer', summer_large, summer_total)
        + _seasonal_keywords('summer', summer_large, summer_total,
                             'winter', winter_large, winter_total)
    )

    if len(keywords)==0:
        continue

    # Both concatenations stack winter then summer, so the rows line up.
    all_rev = pd.concat([winter_reviews,summer_reviews])
    all_rev['Words'] = pd.concat([winter_tokens,summer_tokens])

    # Sets make the membership test per review and keyword constant-time
    word_sets = all_rev['Words'].apply(set)

    terms = []
    for keyword in keywords:
        # Prefix the column name: bare words such as "is", "in" or "not" are
        # Python keywords and cannot be used as terms of a patsy formula.
        column = 'kw_' + keyword
        all_rev[column] = word_sets.apply(
            lambda words, keyword=keyword: get_indicator(words,keyword))
        terms.append(column)

    # Regress the rating on one 0/1 indicator per season-dependent keyword
    formula = "Rating ~ " + " + ".join(terms)

    model = smf.ols(formula=formula, data=all_rev).fit()

    model_summary = model.summary()

    print(model_summary)
    
    

Style: American IPA
	 caramel appears 0.25976% in summer and 0.22846% in winter
                            OLS Regression Results                            
Dep. Variable:                 Rating   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     379.7
Date:                Fri, 09 Dec 2022   Prob (F-statistic):           1.98e-84
Time:                        02:20:36   Log-Likelihood:                -89483.
No. Observations:              124693   AIC:                         1.790e+05
Df Residuals:                  124691   BIC:                         1.790e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------

Style: American Porter
Style: American Amber / Red Ale
	 citrus appears 0.31312% in winter and 0.27332% in summer
	 pine appears 0.20927% in winter and 0.17194% in summer
	 clear appears 0.19349% in winter and 0.17297% in summer
	 ale appears 0.38552% in summer and 0.33886% in winter
	 dark appears 0.23992% in summer and 0.20750% in winter
	 brown appears 0.22544% in summer and 0.19381% in winter
                            OLS Regression Results                            
Dep. Variable:                 Rating   R-squared:                       0.053
Model:                            OLS   Adj. R-squared:                  0.053
Method:                 Least Squares   F-statistic:                     331.9
Date:                Fri, 09 Dec 2022   Prob (F-statistic):               0.00
Time:                        02:21:12   Log-Likelihood:                -27279.
No. Observations:               35314   AIC:                         5.457e+04
Df Residuals:                   35307   BIC:   

Style: American Brown Ale
	 roasted appears 0.33610% in summer and 0.29835% in winter
                            OLS Regression Results                            
Dep. Variable:                 Rating   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     84.53
Date:                Fri, 09 Dec 2022   Prob (F-statistic):           4.16e-20
Time:                        02:21:34   Log-Likelihood:                -14053.
No. Observations:               19854   AIC:                         2.811e+04
Df Residuals:                   19852   BIC:                         2.812e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------

Style: American Adjunct Lager
Style: American Pale Wheat Ale
	 be appears 0.32201% in winter and 0.27772% in summer
	 would appears 0.26454% in winter and 0.21460% in summer
	 fruit appears 0.18704% in winter and 0.15762% in summer
	 summer appears 0.32045% in summer and 0.15715% in winter
	 refreshing appears 0.23396% in summer and 0.18504% in winter
                            OLS Regression Results                            
Dep. Variable:                 Rating   R-squared:                       0.024
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                     80.50
Date:                Fri, 09 Dec 2022   Prob (F-statistic):           9.73e-84
Time:                        02:21:48   Log-Likelihood:                -14593.
No. Observations:               16201   AIC:                         2.920e+04
Df Residuals:                   16195   BIC:                         2.925e+04
Df Model:     