## Topic Explorations/Visualization/Recommendations

This notebook is intended for better understanding the topics found in the NMF modeling, their use in recommendations, and how groups compare with each other.

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import re
import requests
import pickle
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
with open('coffee_words.pickle','rb') as read_file:
    coffee = pickle.load(read_file)
with open('coffee_ratings.pickle','rb') as read_file:
    ratings = pickle.load(read_file)
with open('combined.pickle','rb') as read_file:
    combined = pickle.load(read_file)
with open('df.pickle','rb') as read_file:
    df = pickle.load(read_file)
with open('df_topic_breakdown.pickle','rb') as read_file:
    df_topic_breakdown = pickle.load(read_file)

In [6]:
ratings = ratings.reset_index().rename(columns={'index':'Roaster'})

In [22]:
df_topic_breakdown.group.value_counts()

7    1266
2     983
1     860
8     737
0     492
4     441
3     429
5     421
6     330
Name: group, dtype: int64

In [23]:
df_topic_breakdown['origin_general']=df_topic_breakdown.origin.str.split(', ').str[-1]
df_topic_breakdown.head(2)

Unnamed: 0,roaster,origin,roast_level,rating,length,word count,group,bright_floral_citrus,choc_woody_dark,tart_sweet_smooth,cacao_nut_clean,sweet_nut_pine,juicy_cacao_honey,red_berries,woody_nut_caramel,cherry_vinuous_choc,date,aroma,body,flavor,aftertaste,acidity,origin_general
0,Jackrabbit Java,Costa Rica,Medium-Light,93,257,24,4,0.0,0.045379,0.073064,0.0,0.104257,0.0,0.010795,0.0,0.0,February 2021,9,9,9,8,8,Costa Rica
1,Jackrabbit Java,"Nyamasheke District, Rwanda",Medium-Light,92,248,26,2,0.01818,0.0,0.073098,0.063934,0.0,0.009512,0.0,0.00635,0.012827,February 2021,9,8,9,8,8,Rwanda


In [24]:
topic_features = ['bright_floral_citrus', 'choc_woody_dark', 'tart_sweet_smooth','cacao_nut_clean', 'sweet_nut_pine', 'juicy_cacao_honey', 'red_berries','woody_nut_caramel', 'cherry_vinuous_choc']
df_topic_breakdown.groupby(by='origin_general')[topic_features].mean().sample(3)

Unnamed: 0_level_0,bright_floral_citrus,choc_woody_dark,tart_sweet_smooth,cacao_nut_clean,sweet_nut_pine,juicy_cacao_honey,red_berries,woody_nut_caramel,cherry_vinuous_choc
origin_general,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Central Yemen,0.004864,0.006302,0.037899,0.002672,0.001141,0.004741,0.024173,0.011738,0.060668
Africa; Asia Pacific,0.017403,0.061002,0.054237,0.000745,0.003081,0.00333,0.0,0.0,0.043586
Java,0.0,0.010941,0.0,0.004782,0.002674,0.0,0.0,0.03541,0.017612


In [25]:
df_topic_breakdown

Unnamed: 0,roaster,origin,roast_level,rating,length,word count,group,bright_floral_citrus,choc_woody_dark,tart_sweet_smooth,cacao_nut_clean,sweet_nut_pine,juicy_cacao_honey,red_berries,woody_nut_caramel,cherry_vinuous_choc,date,aroma,body,flavor,aftertaste,acidity,origin_general
0,Jackrabbit Java,Costa Rica,Medium-Light,93,257,24,4,0.000000,0.045379,0.073064,0.000000,0.104257,0.000000,0.010795,0.000000,0.000000,February 2021,9,9,9,8,8,Costa Rica
1,Jackrabbit Java,"Nyamasheke District, Rwanda",Medium-Light,92,248,26,2,0.018180,0.000000,0.073098,0.063934,0.000000,0.009512,0.000000,0.006350,0.012827,February 2021,9,8,9,8,8,Rwanda
2,Red Rooster Coffee Roaster,"Los Naranjos, La Argentina, Huila Department, ...",Light,96,365,32,2,0.017113,0.000000,0.077220,0.000000,0.027378,0.000000,0.007769,0.003145,0.022832,February 2021,9,9,10,9,9,Colombia
3,Paradise Roasters,"Huila, Colombia",Light,95,268,27,2,0.008159,0.000000,0.143630,0.006424,0.000000,0.000000,0.000000,0.000000,0.005296,February 2021,9,9,9,9,9,Colombia
4,Kakalove Cafe,"Antioquia Department, Colombia",Medium-Light,95,261,29,6,0.000000,0.037483,0.074243,0.009381,0.000000,0.000000,0.114548,0.000000,0.053789,February 2021,9,9,9,9,9,Colombia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5954,The Coffee Beanery,,Medium-Light,83,209,15,5,0.006830,0.000000,0.000000,0.000000,0.000000,0.057304,0.000553,0.045314,0.000000,February 1997,8,7,7,,7,
5955,Starbucks Coffee,,Dark,81,227,15,7,0.000000,0.013141,0.000000,0.000000,0.000000,0.025383,0.003274,0.042684,0.000000,February 1997,7,6,8,,5,
5956,Peerless Coffee,,Medium,75,266,18,7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002932,0.048215,0.000000,February 1997,6,6,5,,5,
5957,Gevalia,,Light,74,297,24,7,0.007255,0.000171,0.000000,0.000000,0.000000,0.000000,0.000000,0.065328,0.006085,February 1997,7,6,6,,8,


In [17]:
## Creating a more useful data set to export to Tableau for additional EDA and visualizations

tableau = df_topic_breakdown
tableau['date'] = ratings['Review Date']
tableau['aroma'] = ratings.Aroma
tableau['body'] = ratings.Body
tableau['flavor'] = ratings.Flavor
tableau['aftertaste'] = ratings.Aftertaste
tableau['acidity'] = ratings.Acidity
tableau = tableau.reset_index().rename(columns={'index':'coffee_id'})
tableau.to_csv(path_or_buf = r'C:\Users\ejfel\tableau.csv')
tableau.head()

Unnamed: 0,coffee_id,roaster,origin,roast_level,rating,length,word count,group,bright_floral_citrus,choc_woody_dark,tart_sweet_smooth,cacao_nut_clean,sweet_nut_pine,juicy_cacao_honey,red_berries,woody_nut_caramel,cherry_vinuous_choc,date,aroma,body,flavor,aftertaste,acidity
0,0,Jackrabbit Java,Costa Rica,Medium-Light,93,257,24,4,0.0,0.045379,0.073064,0.0,0.104257,0.0,0.010795,0.0,0.0,February 2021,9,9,9,8,8
1,1,Jackrabbit Java,"Nyamasheke District, Rwanda",Medium-Light,92,248,26,2,0.01818,0.0,0.073098,0.063934,0.0,0.009512,0.0,0.00635,0.012827,February 2021,9,8,9,8,8
2,2,Red Rooster Coffee Roaster,"Los Naranjos, La Argentina, Huila Department, ...",Light,96,365,32,2,0.017113,0.0,0.07722,0.0,0.027378,0.0,0.007769,0.003145,0.022832,February 2021,9,9,10,9,9
3,3,Paradise Roasters,"Huila, Colombia",Light,95,268,27,2,0.008159,0.0,0.14363,0.006424,0.0,0.0,0.0,0.0,0.005296,February 2021,9,9,9,9,9
4,4,Kakalove Cafe,"Antioquia Department, Colombia",Medium-Light,95,261,29,6,0.0,0.037483,0.074243,0.009381,0.0,0.0,0.114548,0.0,0.053789,February 2021,9,9,9,9,9


## Polar/Spider Graphs

Since each coffee is turned into a nine-dimensional flavor vector using NMF, I want to visually compare these score assignments across coffees.

In [19]:
# !pip install plotly==4.14.3
import plotly.express as px
import pandas as pd

Collecting plotly==4.14.3
  Downloading plotly-4.14.3-py2.py3-none-any.whl (13.2 MB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11435 sha256=a3732790d7192999e97797555a732243d52a25fe55a11da698b12ceab620ae12
  Stored in directory: c:\users\ejfel\appdata\local\pip\cache\wheels\c4\a7\48\0a434133f6d56e878ca511c0e6c38326907c0792f67b476e56
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.14.3 retrying-1.3.3


The work in the next cell finds the average NMF vector for a coffee that would be given each of the nine topic assignments, if majority asssignment was used. I ran this cell for each of the nine topics to generate nine visualizations to create a GIF of the different average vectors.

In [132]:
topic_features = ['bright_floral_citrus', 'choc_woody_dark', 'tart_sweet_smooth','cacao_nut_clean', 'sweet_nut_pine', 'juicy_cacao_honey', 'red_berries','woody_nut_caramel', 'cherry_vinuous_choc']
topics = ['Bright, Floral, Citrus', 'Chocolate, Dark, Woody', 'Tart, Sweet, Smooth','Cacao, Nutty, Clean', 'Sweet, Nut, Pine', 'Juicy, Honey, Cacao', 'Red Berries','Nutty, Caramel, Woody', 'Cherry, Vinuous, Chocolate']

list(tableau[tableau.group == 0][topic_features].mean())
spider = pd.DataFrame(dict(
    r=list(tableau[tableau.group == 0][topic_features].mean()),
    theta=topics))
fig = px.line_polar(spider, r='r', theta='theta', line_close=True)
fig.update_layout(
        title = {
            'text':'Mean NMF vector',
            'y':.95,
            'x':.5,
            'xanchor':'center',
            'yanchor':'top'},
        legend_title="NMF Topics",
  polar=dict(
    radialaxis=dict(
      visible=False,
      range=[0, .12]
    )),
  showlegend=False)
fig.show()

Same ideas as above, but seeing all nine vectors simultaneously on one plot. This was not as clear as the GIF made using the above images.

In [93]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go


categories = ['bright_floral_citrus', 'choc_woody_dark', 'tart_sweet_smooth','cacao_nut_clean', 'sweet_nut_pine', 'juicy_cacao_honey', 'red_berries','woody_nut_caramel', 'cherry_vinuous_choc']
topics = ['Bright, Floral, Citrus', 'Chocolate, Dark, Woody', 'Tart, Sweet, Smooth','Cacao, Nutty, Clean', 'Sweet, Nut, Pine', 'Juicy, Honey, Cacao', 'Red Berries','Nutty, Caramel, Woody', 'Cherry, Vinuous, Chocolate']


fig = go.Figure()

for i in range(0,9):
    fig.add_trace(go.Scatterpolar(
          r=list(tableau[tableau.group == i][categories].mean()),
          theta=topics,
          fill='toself',
          name=topics[i],
        opacity = .5,
    ))
# fig.add_trace(go.Scatterpolar(
#       r=list(tableau[tableau.group == 1][categories].mean()),
#       theta=categories,
#       fill='toself',
#       name=topics[1]
# ))

fig.update_layout(
        title = {
            'text':'Mean NMF vector by topic',
            'y':.9,
            'x':.5,
            'xanchor':'center',
            'yanchor':'top'},
        legend_title="NMF Topics",
  polar=dict(
    radialaxis=dict(
      visible=False,
      range=[0, .12]
    )),
  showlegend=False
)
# fig.write_image(r'C:\Users\ejfel\Documents\metis_repos\Coffee-Reviews-NLP\Visuals\spider.png')
fig.show()



## Analyzing a recommendation

I also wanted to show, visually, the similarity between a given coffee and it's recommendation, using my recommendation pipeline. The vectors below came from a coffee that was reviewed after I scraped the original corpus (Souvenier Coffee's Costa Rica Cloza Estate) and it most similar comparison in the review set (Small Eyes Cafe Santa Barbara, Honduras). 

In [148]:
example_comps = [[0.00387805, 0.01173347, 0.08920644, 0.03266696, 0.01315394, 0., 0.02038074, 0.02119388, 0.00695792],[0.00036605, 0., 0.09780715, 0.02406052, 0.00661351, 0. , 0.02649725, 0.03088748, 0.00711122]]
example_comps[0]
names = ['Santa Barbara, Honduras by Small Eyes Cafe','Costa Rica Cloza Estate by Souvenir Coffee']
categories = ['bright_floral_citrus', 'choc_woody_dark', 'tart_sweet_smooth','cacao_nut_clean', 'sweet_nut_pine', 'juicy_cacao_honey', 'red_berries','woody_nut_caramel', 'cherry_vinuous_choc']
topics = ['Bright, Floral, Citrus', 'Chocolate, Dark, Woody', 'Tart, Sweet, Smooth','Cacao, Nutty, Clean', 'Sweet, Nut, Pine', 'Juicy, Honey, Cacao', 'Red Berries','Nutty, Caramel, Woody', 'Cherry, Vinuous, Chocolate']


fig = go.Figure()

for i in range(0,2):
    fig.add_trace(go.Scatterpolar(
          r=example_comps[i],
          theta=topics,
          fill=None,
          name=names[i],
        opacity = .5,
    ))
# fig.add_trace(go.Scatterpolar(
#       r=list(tableau[tableau.group == 1][categories].mean()),
#       theta=categories,
#       fill='toself',
#       name=topics[1]
# ))

fig.update_layout(
        title = {
            'text':'Visualizing a comparison',
            'y':.9,
            'x':.5,
            'xanchor':'center',
            'yanchor':'top'},
        legend_title="Comparison Coffees",
  polar=dict(
    radialaxis=dict(
      visible=False,
      range=[0, .12]
    )),
  showlegend=True
)
# fig.write_image(r'C:\Users\ejfel\Documents\metis_repos\Coffee-Reviews-NLP\Visuals\spider.png')
fig.show()



In the above visual, I can see that not only do these two coffees appear to have a very similar majority assignment score (to Tart, Sweet, and Smooth), but their respective distributions across the other dimensions appear to be in similar amounts to the same components. 

## The below work is ongoing attempts for other visualizations

In [110]:
from plotly.subplots import make_subplots

fig = make_subplots(rows=2, cols=2, specs=[[{'type': 'polar'}]*2]*2)

fig.add_trace(go.Scatterpolar(
      name = "angular categories",
      r = [5, 4, 2, 4, 5],
      theta = ["a", "b", "c", "d", "a"],
    ), 1, 1)
fig.add_trace(go.Scatterpolar(
      name = "radial categories",
      r = ["a", "b", "c", "d", "b", "f", "a"],
      theta = [1, 4, 2, 1.5, 1.5, 6, 5],
      thetaunit = "radians",
    ), 1, 2)
fig.add_trace(go.Scatterpolar(
      name = "angular categories (w/ categoryarray)",
      r = [5, 4, 2, 4, 5],
      theta = ["a", "b", "c", "d", "a"],
    ), 2, 1)
fig.add_trace(go.Scatterpolar(
      name = "radial categories (w/ category descending)",
      r = ["a", "b", "c", "d", "b", "f", "a", "a"],
      theta = [45, 90, 180, 200, 300, 15, 20, 45],
    ), 2, 2)

fig.update_traces(fill='toself')
fig.update_layout(
    polar = dict(
      radialaxis_angle = -45,
      angularaxis = dict(
        direction = "clockwise",
        period = 6)
    ),
    polar2 = dict(
      radialaxis = dict(
        angle = 180,
        tickangle = -180 # so that tick labels are not upside down
      )
    ),
    polar3 = dict(
      sector = [80, 400],
      radialaxis_angle = -45,
      angularaxis_categoryarray = ["d", "a", "c", "b"]
    ),
    polar4 = dict(
      radialaxis_categoryorder = "category descending",
      angularaxis = dict(
        thetaunit = "radians",
        dtick = 0.3141592653589793
      ))
)

fig.show()

ValueError: arrays must all be same length