# Finding season-dependent keywords

In this notebook, we investigate the reviews based on the season in which they were written. We look for words that are used more often in winter (reviews written from January through March, inclusive) and words that are used more often in summer (reviews written from July through September, inclusive).

In [9]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats
import seaborn as sns
from nltk.tokenize import RegexpTokenizer

In [2]:
# Directory containing the BeerAdvocate dataset files, relative to the
# notebook's working directory. The trailing slash is required because file
# names are appended to this value by plain string concatenation.
DATA_PATH = './datasets/BeerAdvocate/'

In [3]:
# Read the pickled BeerAdvocate review table into a DataFrame and preview it.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
reviews_path = DATA_PATH + 'reviews.pkl'
reviews = pd.read_pickle(reviews_path)
reviews.head()

Unnamed: 0,Beer Name,Beer Id,Brewery Name,Brewery Id,Style,Abv,Date,Username,User Id,Appearance,Aroma,Palate,Taste,Overall,Rating,Text,Review
0,Régab,142544,Societe des Brasseries du Gabon (SOBRAGA),37262,Euro Pale Lager,4.5,2015-08-20 12:00:00,nmann08,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",
1,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2009-02-20 12:00:00,StJamesGate,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67,Pours pale copper with a thin head that quickl...,
2,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2006-03-13 12:00:00,mdagnew,mdagnew.19527,4.0,3.5,3.5,4.0,3.5,3.73,"500ml Bottle bought from The Vintage, Antrim.....",
3,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-12-01 12:00:00,helloloser12345,helloloser12345.10867,4.0,3.5,4.0,4.0,4.5,3.98,Serving: 500ml brown bottlePour: Good head wit...,
4,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-08-30 12:00:00,cypressbob,cypressbob.3708,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ...",


In [16]:
# Store the calendar month (1-12) in which each review was written in a new
# 'Month' column. The vectorised .dt accessor avoids calling a Python lambda
# once per row; pd.to_datetime leaves a datetime64 column unchanged and
# converts an object column of timestamps, so both cases are handled.
reviews['Month'] = pd.to_datetime(reviews['Date']).dt.month
reviews.head()

Unnamed: 0,Beer Name,Beer Id,Brewery Name,Brewery Id,Style,Abv,Date,Username,User Id,Appearance,Aroma,Palate,Taste,Overall,Rating,Text,Review,Month
0,Régab,142544,Societe des Brasseries du Gabon (SOBRAGA),37262,Euro Pale Lager,4.5,2015-08-20 12:00:00,nmann08,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",,8
1,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2009-02-20 12:00:00,StJamesGate,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67,Pours pale copper with a thin head that quickl...,,2
2,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2006-03-13 12:00:00,mdagnew,mdagnew.19527,4.0,3.5,3.5,4.0,3.5,3.73,"500ml Bottle bought from The Vintage, Antrim.....",,3
3,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-12-01 12:00:00,helloloser12345,helloloser12345.10867,4.0,3.5,4.0,4.0,4.5,3.98,Serving: 500ml brown bottlePour: Good head wit...,,12
4,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-08-30 12:00:00,cypressbob,cypressbob.3708,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ...",,8


In [41]:
# Compiled once at definition time instead of building a tokenizer per review.
_WORD_PATTERN = re.compile(r'[a-zA-Z]+')


def get_words(text):
    """Split a review into lowercase words.

    A word is a maximal run of ASCII letters; digits, punctuation, whitespace
    and non-ASCII characters act as separators and are discarded.

    Parameters
    ----------
    text : str
        The review text. Any non-string value (e.g. NaN or None for a missing
        review) is treated as an empty review.

    Returns
    -------
    list of str
        The words of ``text`` in order of appearance, lowercased. Empty when
        ``text`` contains no ASCII letters or is not a string.
    """
    # Missing reviews show up as NaN/None; return no words instead of raising.
    if not isinstance(text, str):
        return []
    return [word.lower() for word in _WORD_PATTERN.findall(text)]

In [47]:
# Tokenise every review written in January-March (months 1 to 3, inclusive)
# and flatten the per-review word lists into one Series with one word per row.
is_winter = reviews['Month'].between(1, 3)
winter_words = reviews.loc[is_winter, 'Text'].apply(get_words).explode()

In [49]:
# Tokenise every review written in July-September (months 7 to 9, inclusive)
# and flatten the per-review word lists into one Series with one word per row.
is_summer = reviews['Month'].between(7, 9)
summer_words = reviews.loc[is_summer, 'Text'].apply(get_words).explode()

In [69]:
# Number of most frequent words per season that will be compared.
n = 100
# Total number of words written in each season. explode() emits a NaN row for
# every review whose word list is empty, so count() (which skips NaN) is used
# instead of len() to avoid counting those placeholder rows as words.
winter_total = winter_words.count()
summer_total = summer_words.count()

In [79]:
# Occurrence counts of the n most frequent winter words, most frequent first.
winter_counts = winter_words.value_counts().head(n)

In [80]:
# Occurrence counts of the n most frequent summer words, most frequent first.
summer_counts = summer_words.value_counts().head(n)

In [81]:
# Larger winter lookup table: counts of the 10*n most frequent winter words.
winter_large = winter_words.value_counts().head(10 * n)

In [82]:
# Larger summer lookup table: counts of the 10*n most frequent summer words.
summer_large = summer_words.value_counts().head(10 * n)

In [83]:
# For each of the top winter words, compare its share of all winter words with
# its share of all summer words and report it when the two differ by more than
# 10% of the winter share.
for word, winter_count in winter_counts.items():
    winter_freq = winter_count / winter_total
    # summer_large is truncated, so a top winter word may be missing from it;
    # .get avoids a KeyError and treats such a word as having summer count 0.
    summer_freq = summer_large.get(word, 0) / summer_total
    if abs(summer_freq - winter_freq) / winter_freq > 0.1:
        print("{} appears {:.5f}% in winter and {:.5f}% in summer".format(word, 100*winter_freq, 100*summer_freq))

dark appears 0.00395% in winter and 0.00326% in summer
white appears 0.00277% in winter and 0.00310% in summer
chocolate appears 0.00273% in winter and 0.00204% in summer
brown appears 0.00269% in winter and 0.00234% in summer
alcohol appears 0.00227% in winter and 0.00199% in summer
coffee appears 0.00213% in winter and 0.00158% in summer
orange appears 0.00182% in winter and 0.00207% in summer


In [84]:
# For each of the top summer words, compare its share of all summer words with
# its share of all winter words and report it when the two differ by more than
# 10% of the summer share.
for word, summer_count in summer_counts.items():
    summer_freq = summer_count / summer_total
    # winter_large is truncated, so a top summer word may be missing from it;
    # .get avoids a KeyError and treats such a word as having winter count 0.
    winter_freq = winter_large.get(word, 0) / winter_total
    if abs(winter_freq - summer_freq) / summer_freq > 0.1:
        print("{} appears {:.5f}% in summer and {:.5f}% in winter".format(word, 100*summer_freq, 100*winter_freq))

dark appears 0.00326% in summer and 0.00395% in winter
white appears 0.00310% in summer and 0.00277% in winter
brown appears 0.00234% in summer and 0.00269% in winter
orange appears 0.00207% in summer and 0.00182% in winter
chocolate appears 0.00204% in summer and 0.00273% in winter
alcohol appears 0.00199% in summer and 0.00227% in winter
