In [377]:
import json
import os
import bokeh
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import ast
from collections import Counter 

## Get Data

In [324]:
interim_data_path = os.getcwd()
interim_data_path = os.path.dirname(interim_data_path)
interim_data_path = os.path.dirname(interim_data_path)
interim_data_path += '/data/interim/ratings.jl'
interim_data_path

'/Users/brianelinsky/Dropbox/ActiveProjects/coffee_analytics/data/interim/ratings.jl'

In [325]:
line_list = []
with open(interim_data_path) as f:
    for line in f:
        a_dict = json.loads(line)
        df = pd.DataFrame(a_dict, index=[0])
        line_list.append(df)

df = pd.concat(line_list)
df = df.reset_index(drop=True)
df

Unnamed: 0,roaster,bean,rating,roaster_location,coffee_origin,roast_level,agtron,estimated_price,review_date,aroma,acidity_structure,body,flavor,aftertaste,with_milk,blind_assessment,notes,bottom_line,who_should_drink_it
0,Dragonfly Coffee Roasters,Colombia Nariño,91,"Henderson, Nevada","Nariño Department, southern Colombia",Medium-Light,58/72,$12.50/12 ounces,April 2021,8,8,8\t\t\t\t\t\t,9,8,,"Crisply sweet, chocolaty. Baking chocolate, ra...",,"A friendly, accessible (and accessibly priced)...",
1,True Coffee Roasters,Belquis Ramirez Lemus Colombia Coffee for Peace,92,"Fitchburg, Wisconsin","Cauca Department, Colombia",Medium-Light,58/76,$14.20/12.64 ounces,March 2021,8,9,8\t\t\t\t\t\t,9,8,,"Balanced, bright. Cocoa nib, orange zest, almo...",,"A good-value, balanced Colombia cup, juicy and...",
2,True Coffee Roasters,Asprole Blend #1 Colombia Coffee for Peace,92,"Fitchburg, Wisconsin","Cauca Department, Colombia",Medium-Light,59/77,$14.20/12.64 ounces,March 2021,9,8,8\t\t\t\t\t\t,9,8,,"Crisply sweet, chocolaty. Baking chocolate, ap...",,"A friendly, accessible Colombia cup — briskly ...",
3,Kakalove Cafe,Costa Rica Aris Red Honey Lot 2002 Espresso,94,"Chia-Yi, Taiwan","Central Valley, Costa Rica",Medium-Light,52/70,NT $270/8 ounces,March 2021,9,,9\t\t\t\t\t\t,9,8,9,"Evaluated as espresso. Richly aromatic, deeply...",,A cleanly fruit-forward Costa Rica coffee roas...,
4,Interpretation Coffee,Ethiopia Gedeo Kochere Reko Natural,94,"Chia-Yi, Taiwan","Yirgacheffe growing region, southern Ethiopia",Light,64/84,$16.00/230 grams,March 2021,9,9,9\t\t\t\t\t\t,9,8,,"Richly sweet-tart, fruit-toned. Dried raspberr...",,A cleanly fruit-centered Ethiopia cup with int...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6045,Peet's Coffee & Tea,Major Dickason’s Blend,84,"Berkeley, California",,Dark,36/41,,February 1997,7,,7\t\t\t\t\t\t,7,,,"The body is full, the taste richly pungent. Th...","The body is full, the taste richly pungent as ...",,Peet's has always stood proudly at the darkest...
6046,Backroads Coffee and Tea,House Blend,84,"Hayward, Wisconsin",,Medium,52/61,,February 1997,7,,7\t\t\t\t\t\t,8,,,The acidity doesn't immediately reveal itself ...,"Understated complexity develops into a deep, m...",,Another all-around winner for drip brewing. Sh...
6047,Starbucks Coffee,House Blend,81,"Seattle, Washington",,Dark,37/41,,February 1997,7,,6\t\t\t\t\t\t,8,,,The rest of the taste profile plays peek-a-boo...,The rest of the taste profile plays peek-a-boo...,,Everyone who's gotten hooked on the carbon-ove...
6048,Seattle's Best Coffee,Seattle’s Best Blend,88,"Seattle, Washington","Indonesia, Central and South America",Medium-Dark,47/52,,February 1997,7,,8\t\t\t\t\t\t,8,,,"This coffee gets better as it goes, probably o...",Coffee with a heavy body that get better as it...,,Classicists who brew with a French press; peop...


# Clean and Process Data

In [326]:
df.dtypes

roaster                object
bean                   object
rating                 object
roaster_location       object
coffee_origin          object
roast_level            object
agtron                 object
estimated_price        object
review_date            object
aroma                  object
acidity_structure      object
body                   object
flavor                 object
aftertaste             object
with_milk              object
blind_assessment       object
notes                  object
bottom_line            object
who_should_drink_it    object
dtype: object

In [327]:
def insert_nans(df, field: str):
    """Insert NANs"""
    df[field].replace(to_replace=['NA', 'NR'], value=np.nan, inplace=True)
    
def convert_to_float(df, field: str):
    df[field] = df[field].astype('float64')
    
def remove_tabs(df, field: str):
    df[field].replace(to_replace=r'\t', value='', regex=True, inplace=True)

In [328]:
df['with_milk'].replace(to_replace=r'Flavor in milk: ', value='', regex=True, inplace=True)

for field in ['rating', 'aroma', 'acidity_structure', 'flavor', 'aftertaste', 'body', 'with_milk']:
    remove_tabs(df, field)
    insert_nans(df, field)
    convert_to_float(df, field)

In [331]:
# convert roast type to a categorical variable
insert_nans(df, 'roast_level')
df['roast_level'] = df['roast_level'].astype('category')

# Set order for categories
df['roast_level'] = df['roast_level'].cat.reorder_categories(['Light', 'Medium-Light', 'Medium', 'Medium-Dark', 'Dark', 'Very Dark'], ordered=True)

## Histograms

In [332]:
from bokeh.plotting import figure
from bokeh.io import show, output_notebook

In [333]:
# create dataframe for histogram
counts, buckets = np.histogram(df['rating'], bins=51, range=[50, 101])
ratings_hist_df = pd.DataFrame({'counts': counts, 'left': buckets[:-1], 'right': buckets[1:]})

# create blank plot
p = figure(plot_height=600, 
           plot_width=600, 
           title='Histogram of Ratings',
           x_axis_label='Rating', 
           y_axis_label='Number of Coffees')

# add data to plot
p.quad(bottom=0,
       top=ratings_hist_df['counts'],
       left=ratings_hist_df['left'],
       right=ratings_hist_df['right'],
       line_color='black')

# show the plot
show(p)

Overall ratings have a mode of 93.  The distribution exibits negative skew.

In [334]:
def create_0_to_10_hist(df, field: str, title: str):
    # create dataframe for histogram
    counts, buckets = np.histogram(df[field], bins=11, range=[0, 11])
    hist_df = pd.DataFrame({'counts': counts, 'left': buckets[:-1], 'right': buckets[1:]})

    # create blank plot
    p = figure(plot_height=600, 
               plot_width=600, 
               title=title,
               x_axis_label='Rating', 
               y_axis_label='Number of Coffees')

    # add data to plot
    p.quad(bottom=0,
           top=hist_df['counts'],
           left=hist_df['left'],
           right=hist_df['right'],
           line_color='black')

    # show the plot
    show(p)

In [335]:
create_0_to_10_hist(df, 'aroma', 'Histogram of Aroma')

In [336]:
create_0_to_10_hist(df, 'acidity_structure', 'Histogram of Acidity/Structure')

In [337]:
create_0_to_10_hist(df, 'body', 'Histogram of Body')

In [381]:
create_0_to_10_hist(df, 'flavor', 'Histogram of Flavor')

In [339]:
create_0_to_10_hist(df, 'aftertaste', 'Histogram of Aftertaste')

In [340]:
create_0_to_10_hist(df, 'with_milk', 'Histogram of With Milk')

Overall, most coffees are rated well.  Most scores have a high mode and negative skew.

## Look for Correlations

High correlations between:
* rating and everything
* aroma and everything except aftertaste and with_milk

With_Milk is the least correlated variable with everything else.

Body is also reasonably uncorrelated with a few variables.

In [341]:
df.corr()

Unnamed: 0,rating,aroma,acidity_structure,body,flavor,aftertaste,with_milk
rating,1.0,0.836389,0.840391,0.736726,0.909928,0.757848,0.718527
aroma,0.836389,1.0,0.61776,0.614621,0.769406,0.485051,0.477896
acidity_structure,0.840391,0.61776,1.0,0.423605,0.618982,0.611687,-0.112136
body,0.736726,0.614621,0.423605,1.0,0.670784,0.386878,0.451679
flavor,0.909928,0.769406,0.618982,0.670784,1.0,0.612305,0.523412
aftertaste,0.757848,0.485051,0.611687,0.386878,0.612305,1.0,0.282128
with_milk,0.718527,0.477896,-0.112136,0.451679,0.523412,0.282128,1.0


## How does roast level impact the overall rating?

Light roasts have the highest overall ratings, pretty much across the board.  Ratings tend to drop as the roast level gets darker.

In [345]:
df.groupby('roast_level').mean()

Unnamed: 0_level_0,rating,aroma,acidity_structure,body,flavor,aftertaste,with_milk
roast_level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Light,92.639604,8.720238,8.643357,8.396825,8.892644,8.171026,8.727273
Medium-Light,92.605859,8.721158,8.516571,8.409617,8.866429,8.132459,8.690608
Medium,90.987127,8.339213,8.203704,8.031886,8.467074,7.961453,8.515789
Medium-Dark,88.463479,7.883644,7.708333,7.533732,7.853856,7.664179,8.264151
Dark,86.0,7.552174,7.333333,7.247826,7.243478,7.361345,7.703125
Very Dark,85.253886,7.248705,7.0,7.222798,7.119171,6.955882,7.42


## Exploration of the text categories

In [360]:
sentence_len = [len(sentence.split()) for sentence in df['blind_assessment']]
max(sentence_len)

215

In [365]:
# create dataframe for histogram
counts, buckets = np.histogram(sentence_len, bins=111, range=[0, 221])
hist_df = pd.DataFrame({'counts': counts, 'left': buckets[:-1], 'right': buckets[1:]})

# create blank plot
p = figure(plot_height=600, 
           plot_width=600, 
           title='Histogram of Blind Assessment Sentence Lengths',
           x_axis_label='Rating', 
           y_axis_label='Count')

# add data to plot
p.quad(bottom=0,
       top=hist_df['counts'],
       left=hist_df['left'],
       right=hist_df['right'],
       line_color='black')

# show the plot
show(p)

In [369]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['blind_assessment'])
tensor = tokenizer.texts_to_sequences(df['blind_assessment']) 
tensor[0]

[77,
 6,
 106,
 182,
 9,
 176,
 24,
 79,
 189,
 3,
 10,
 1,
 5,
 6,
 14,
 35,
 11,
 115,
 13,
 135,
 12,
 44,
 61,
 8,
 78,
 27,
 15,
 7,
 9,
 14,
 176,
 1,
 24]

In [370]:
tokenizer_config = tokenizer.get_config()
tokenizer_config

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': None,
 'document_count': 6050,
 'word_counts': '{"crisply": 630, "sweet": 5560, "chocolaty": 451, "baking": 234, "chocolate": 5028, "raisin": 244, "cedar": 1538, "almond": 625, "lily": 221, "in": 10365, "aroma": 5010, "and": 13892, "cup": 5616, "toned": 3642, "structure": 1265, "with": 5006, "brisk": 415, "acidity": 3726, "velvety": 344, "mouthfeel": 4307, "gently": 985, "drying": 817, "finish": 5267, "consolidates": 629, "to": 1521, "notes": 2812, "of": 5422, "balanced": 1360, "bright": 839, "cocoa": 1438, "nib": 892, "orange": 952, "zest": 790, "brittle": 57, "plum": 268, "juicy": 798, "high": 323, "crisp": 1261, "quiet": 282, "that": 738, "leads": 166, "rounding": 125, "the": 10900, "long": 1530, "apricot": 574, "lemon": 832, "verbena": 131, "fresh": 836, "cut": 569, "fir": 468, "nougat": 83, "satiny": 590, "smooth": 877, "centers": 190, "around"

In [371]:
print("There are " + str(len(tokenizer_config['word_counts'])) + " unique words in the blind assessment dataset")

There are 86500 unique words in the blind assessment dataset


In [375]:
word_count_dict = ast.literal_eval(tokenizer_config['word_counts'])
type(word_count_dict)

dict

In [379]:
# Most Common English Words
Counter(word_count_dict).most_common(50)

[('and', 13892),
 ('the', 10900),
 ('in', 10365),
 ('a', 7096),
 ('cup', 5616),
 ('sweet', 5560),
 ('of', 5422),
 ('finish', 5267),
 ('chocolate', 5028),
 ('aroma', 5010),
 ('with', 5006),
 ('mouthfeel', 4307),
 ('acidity', 3726),
 ('toned', 3642),
 ('notes', 2812),
 ('fruit', 2619),
 ('but', 2337),
 ('rich', 2092),
 ('flowers', 2016),
 ('syrupy', 1761),
 ('dark', 1668),
 ('flavor', 1667),
 ('tart', 1629),
 ('cedar', 1538),
 ('long', 1530),
 ('as', 1523),
 ('to', 1521),
 ('cocoa', 1438),
 ('floral', 1422),
 ('like', 1403),
 ('sweetly', 1393),
 ('balanced', 1360),
 ('hint', 1352),
 ('richly', 1350),
 ('structure', 1265),
 ('crisp', 1261),
 ('milk', 1244),
 ('is', 1241),
 ('coffee', 1093),
 ('hints', 1049),
 ('lightly', 1032),
 ('by', 1016),
 ('into', 1010),
 ('gently', 985),
 ('deep', 984),
 ('complex', 982),
 ('pungent', 979),
 ('orange', 952),
 ('nut', 910),
 ('nib', 892)]

In [380]:
# Least common words
least_common = Counter(word_count_dict).most_common()[-50:]
least_common

[('visited', 1),
 ('recognize', 1),
 ('nostril', 1),
 ('amy', 1),
 ('bowser', 1),
 ('showcase', 1),
 ('violent', 1),
 ('markedly', 1),
 ('suggestive', 1),
 ('reassuringly', 1),
 ('model', 1),
 ('mandheling', 1),
 ('sagging', 1),
 ('boredom', 1),
 ('braun', 1),
 ('determined', 1),
 ('coding', 1),
 ('attic', 1),
 ('cellar', 1),
 ('confined', 1),
 ('pinched', 1),
 ('habanero', 1),
 ('flavorings', 1),
 ('gentleness', 1),
 ('polite', 1),
 ('permission', 1),
 ('getting', 1),
 ('upstaged', 1),
 ('dramatics', 1),
 ('zimbabwe', 1),
 ('disagreeably', 1),
 ('loath', 1),
 ('ignore', 1),
 ('deducts', 1),
 ('terrific', 1),
 ('bore', 1),
 ('boundaries', 1),
 ('darkness', 1),
 ('forgoing', 1),
 ('attentively', 1),
 ('energize', 1),
 ('pitched', 1),
 ('relegating', 1),
 ('passed', 1),
 ('spare', 1),
 ('opt', 1),
 ('chased', 1),
 ('whine', 1),
 ('constitutes', 1),
 ('increasing', 1)]