In [42]:
import pandas as pd
import numpy as np
import gzip #to read gzip files
import matplotlib.pyplot as plt
import datetime
import pickle
from scipy.stats import linregress
import torch
import spacy
import collections
from collections import Counter, defaultdict
import  itertools
#set the seed
np.random.seed(4)

DATA_PATH = './data/'
DEFAULT_ENCODING = 'UTF8'
DEFAULT_COMPRESSION = 'gzip'

In [11]:

matched_beers = pd.read_csv(DATA_PATH+'matched_beer_data/beers.csv', header=1)
matched_beers.head(2)

Unnamed: 0,abv,avg,avg_computed,avg_matched_valid_ratings,ba_score,beer_id,beer_name,beer_wout_brewery_name,brewery_id,brewery_name,...,brewery_id.1,brewery_name.1,nbr_matched_valid_ratings.1,nbr_ratings.1,overall_score,style.1,style_score,zscore.1,diff,sim
0,4.8,3.45,3.439867,3.504068,80.0,19827,Legbiter,Legbiter,10093,Strangford Lough Brewing Company Ltd,...,4959,Strangford Lough,89,89,23.0,Golden Ale/Blond Ale,27.0,-0.698304,1.0,1.0
1,6.0,3.86,3.88875,3.768333,,20841,St. Patrick's Ale,Patricks Ale St,10093,Strangford Lough Brewing Company Ltd,...,4959,Strangford Lough,11,11,52.0,Irish Ale,79.0,0.005754,0.527141,1.0


In [12]:
matched_users = pd.read_csv(DATA_PATH+'matched_beer_data/users.csv', header=1)
matched_users['joined'] = pd.to_datetime(matched_users['joined'],unit = 's')
matched_users['joined.1'] = pd.to_datetime(matched_users['joined.1'],unit = 's')
matched_users.head(2)

Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1
0,2008-09-08 10:00:00,Germany,6,6,erzengel.248045,Erzengel,erzengel,2008-10-18 10:00:00,Germany,8781,83106,Erzengel,erzengel
1,2012-09-23 10:00:00,"United States, Virginia",1,1,gendv138.695700,GenDV138,gendv138,2013-11-29 11:00:00,"United States, Virginia",6240,290599,GenDV138,gendv138


Extracting matched data ratings

In [13]:
matched_ratings = pd.read_csv(DATA_PATH+'matched_beer_data/ratings.csv', encoding = "ISO-8859-1", header=1)
matched_ratings.head(2)

Unnamed: 0,abv,appearance,aroma,beer_id,beer_name,brewery_id,brewery_name,date,overall,palate,...,brewery_name.1,date.1,overall.1,palate.1,rating.1,style.1,taste.1,text.1,user_id.1,user_name.1
0,11.3,4.5,4.5,645,Trappistes Rochefort 10,207,Brasserie de Rochefort,1324810800,5.0,4.5,...,Brasserie Rochefort,1387710000,19.0,4.0,4.6,Abt/Quadrupel,9.0,a) Geruch malzig-schwer-sÃÂ¼ÃÂ. Riecht sc...,83106,Erzengel
1,5.0,,,28191,Myanmar Lager Beer,9369,Myanmar Brewery and Distillery,1322650800,,,...,Myanmar Brewery and Distillery,1322564400,6.0,2.0,1.7,Pale Lager,4.0,"Can. Weak and watery, not the best beer of the...",91324,visionthing


In [14]:
df_matched_BA = matched_ratings.iloc[:,:17]
df_matched_BA.set_axis(df_matched_BA.loc[0], axis='columns', inplace=False)
df_matched_BA = df_matched_BA.drop(0)
df_matched_BA.head(2)

  df_matched_BA.set_axis(df_matched_BA.loc[0], axis='columns', inplace=False)


Unnamed: 0,abv,appearance,aroma,beer_id,beer_name,brewery_id,brewery_name,date,overall,palate,rating,review,style,taste,text,user_id,user_name
1,5.0,,,28191,Myanmar Lager Beer,9369,Myanmar Brewery and Distillery,1322650800,,,3.0,True,American Adjunct Lager,,,visionthing.639993,visionthing
2,5.0,3.5,3.5,57911,Cantillon Tyrnilambic Baie DâArgousier Lambic,388,Brasserie Cantillon,1344074400,4.0,4.0,3.85,True,Lambic - Fruit,4.0,"Bottle @ One Pint Pub, Helsinki. 2006 vintage....",tiong.608427,tiong


In [15]:
df_matched_RB = matched_ratings.iloc[:,17:]
df_matched_RB.set_axis(df_matched_RB.loc[0], axis='columns', inplace=False)
df_matched_RB = df_matched_RB.drop(0)
df_matched_RB.head(2)

  df_matched_RB.set_axis(df_matched_RB.loc[0], axis='columns', inplace=False)


Unnamed: 0,abv.1,appearance.1,aroma.1,beer_id.1,beer_name.1,brewery_id.1,brewery_name.1,date.1,overall.1,palate.1,rating.1,style.1,taste.1,text.1,user_id.1,user_name.1
1,5.0,2.0,3.0,17109,Myanmar Lager Beer,2921,Myanmar Brewery and Distillery,1322564400,6.0,2.0,1.7,Pale Lager,4.0,"Can. Weak and watery, not the best beer of the...",91324,visionthing
2,5.0,4.0,8.0,35298,Cantillon Tyrnilambic Baie dÂArgousier,1069,Cantillon,1353582000,17.0,4.0,4.1,Lambic Style - Fruit,8.0,"Bottle @ One Pint Pub, Helsinki. Originally ra...",98624,tiong


### Trying out Spacy

In [27]:
nlp = spacy.load("en_core_web_sm") # calling the effiency module of spacy
doc = nlp("I love cakes.")
print([(w.text, w.pos_) for w in doc])
print(doc)

[('I', 'PRON'), ('love', 'VERB'), ('cakes', 'NOUN'), ('.', 'PUNCT')]
I love cakes.


In [25]:
df_matched_RB['text.1']

1        Can. Weak and watery, not the best beer of the...
2        Bottle @ One Pint Pub, Helsinki. Originally ra...
3        Draught @ÃÂ Pikkulintu, Helsinki, Finland. A ...
4        750ml bottleBottling date: 2011/02/17 - Pours ...
5        375ml bottle @ Pikkulintu, HelsinkiPours orang...
                               ...                        
21959    Valeir Divers 33cl bottle from www.belgianbeer...
21960    .................................................
21961    Biere, foin, pain grillÃÂ©, terreux, banane, ...
21962    This is the first blond ale from this brewery....
21963    In honor of Mes, IÃ¢ÂÂm reviewing your beer....
Name: text.1, Length: 21963, dtype: object

In [88]:
nlp = spacy.load('en_core_web_sm')

def def_value():
    return 0


def get_most_freq_adj(reviews):
    total_freq = defaultdict(def_value)
    
    for review in reviews:
        doc = nlp(review.lower())
        adjectives = [token.text for token in doc if (not token.is_stop and not token.is_punct and token.pos_ == 'ADJ')]
        adj_freq = dict(Counter(adjectives))
        for key, value in itertools.chain(adj_freq.items()):
            if key in total_freq:
                 total_freq[key] += value
            else:
                total_freq[key] = value
    return dict(total_freq)





total_adj = get_most_freq_adj(df_matched_RB['text.1'])

sorted_adj = dict(sorted(total_adj.items(), key=lambda item: item[1], reverse=True))

with open('/Users/loickreienbuhl/Documents/EPFL/MA1/ADA/P3/DataframeStorage/adjectives_freq.pickle', 'wb') as f:
    pickle.dump(sorted_adj, f)





In [89]:
with open('/Users/loickreienbuhl/Documents/EPFL/MA1/ADA/P3/DataframeStorage/adjectives_freq.pickle', 'rb') as f:
    adjective_freq = pickle.load(f)


In [90]:
adjective_freq

{'light': 11959,
 'good': 10414,
 'nice': 10411,
 'white': 8815,
 'dark': 8594,
 'sweet': 8528,
 'medium': 7868,
 'brown': 4691,
 'little': 4201,
 'bitter': 4122,
 'great': 4100,
 'dry': 4039,
 'smooth': 4032,
 'moderate': 3813,
 'golden': 3648,
 'hazy': 3513,
 'floral': 3330,
 'earthy': 3224,
 'clear': 3174,
 'big': 2951,
 'thin': 2936,
 'roasted': 2806,
 'small': 2665,
 'herbal': 2627,
 'mild': 2620,
 'creamy': 2390,
 'pale': 2377,
 'black': 2325,
 'decent': 2265,
 'slight': 2261,
 'sour': 2233,
 'spicy': 2082,
 'crisp': 2071,
 'yellow': 2018,
 'strong': 1966,
 'drinkable': 1856,
 'hoppy': 1836,
 'tart': 1759,
 'thick': 1744,
 'bready': 1729,
 'easy': 1688,
 'bodied': 1673,
 'deep': 1626,
 'cloudy': 1563,
 'balanced': 1520,
 'itã¢â\x80â\x99s': 1445,
 'tropical': 1415,
 'red': 1390,
 'enjoyable': 1321,
 'fresh': 1304,
 'clean': 1301,
 'pine': 1298,
 'rich': 1254,
 'high': 1246,
 'solid': 1244,
 'huge': 1207,
 'refreshing': 1203,
 'orange': 1194,
 'belgian': 1191,
 'low': 1190,
 'best'

In [85]:
taste = ['light', 'sweet', 'bitter', 'roasted', 'mild', 'pale', 'sour', 'spicy', 'hoppy', 'strong', 'tart', 'bready']
aroma = ['floral', 'herbal', 'tropical', 'earthy', 'toasted', 'roast', 'nutty']
palate = ['smooth', 'dry', 'thin', 'creamy', 'crisp', 'bodied', 'thick', 'rich', 'heavy', 'foamy']
appearance = ['dark', 'brown', 'golden', 'hazy', 'clear', 'black', 'yellow', 'cloudy', 'red' ]

In [87]:
test = 'I like good Good beers'
test.lower()

'i like good good beers'