In [3]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import re
import requests
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics import pairwise_distances

In [71]:
with open('coffee_words.pickle','rb') as read_file:
    coffee = pickle.load(read_file)
with open('coffee_ratings.pickle','rb') as read_file:
    ratings = pickle.load(read_file)
with open('combined.pickle','rb') as read_file:
    combined = pickle.load(read_file)
with open('df.pickle','rb') as read_file:
    df = pickle.load(read_file)
with open('df_topic_breakdown.pickle','rb') as read_file:
    df_topic_breakdown = pickle.load(read_file)
with open('sentiment.pickle','rb') as read_file:
    sentiment = pickle.load(read_file)
ratings = ratings.reset_index().rename(columns={'index':'Roaster'})

In [7]:
from nltk.corpus import stopwords
sw = stopwords.words("english")
sw = sw + ['coffee','coffees','cup','john', 'diruocco','jen','apodaca','ken','kevin','keurig','espresso','serve','capsule','device','serving','flavor','notes','mouthfeel','aroma','finish','brewed','brewing','parts','one','two','three','evaluate','evaluated','hint']

In [8]:
blindtfidf = TfidfVectorizer(min_df=10, stop_words = sw)
doc_word = blindtfidf.fit_transform(coffee.Review.str.replace(r'\d+','',regex=True))

nmf_model = NMF(9)
doc_topic = nmf_model.fit_transform(doc_word)

topic_word = nmf_model.components_

words = blindtfidf.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['flowers', 'honey', 'silky', 'acidity', 'like', 'bright'],
 ['chocolate', 'dark', 'cedar', 'milk', 'small', 'chocolaty'],
 ['structure', 'tart', 'sweet', 'zest', 'richly', 'savory'],
 ['cocoa', 'toned', 'powder', 'nib', 'cedar', 'structure'],
 ['fresh', 'cut', 'fir', 'lightly', 'syrupy', 'drying'],
 ['cacao', 'nib', 'roasted', 'drying', 'lively', 'juicy'],
 ['black', 'currant', 'cherry', 'savory', 'red', 'pungent'],
 ['wood', 'body', 'aromatic', 'nut', 'sweetness', 'rather'],
 ['fruit', 'toned', 'cherry', 'chocolate', 'sweet', 'rich']]

In [53]:
coffee.iloc[10].Review

'Deeply pungent, sweetly savory. Dark chocolate, narcissus, black cherry, cardamom, cashew in aroma and cup. Sweet-savory structure with roundly tart acidity; full, creamy mouthfeel. The floral-toned finish leads with notes of narcissus, balanced by dark chocolate and cashew underneath. '

In [55]:
indices = pairwise_distances(doc_topic[10].reshape(1,-1),doc_topic,metric='cosine').argsort()
recs = list(indices[0][-3:])
df_topic_breakdown.iloc[recs]

coffee.iloc[recs[0]].Review

'Soft, round aroma: muted notes of lemon and flowers. In the cup balanced, gently tart acidity, lightish body, silky mouthfeel, continuing lemon and flowers. Some flavor carries into a clean, simple finish.'

In [90]:
t = ['Delicate, lyrically sweet, gently tart. Tea rose, pink grapefruit zest, cocoa nib, fresh-cut oak, wild honey in aroma and cup. Sweet structure with gently bright acidity; plush, satiny mouthfeel. The finish consolidates to richly sweet notes of tea rose and honey with cocoa nib undertones.']
vt = blindtfidf.transform(t).todense()
tt1 = nmf_model.transform(vt)
tt1

array([[0.03285689, 0.        , 0.0692112 , 0.08157737, 0.0558586 ,
        0.03514688, 0.        , 0.        , 0.        ]])

In [91]:
indices = pairwise_distances(tt1.reshape(1,-1),doc_topic,metric='cosine').argsort()
recs = list(indices[0][0:4])
df_topic_breakdown.iloc[recs]
print('The coffee you liked was described as:',t[0])
print('\n')
print('Based on your input coffee, I recommend you try the',ratings.iloc[recs[0]]['Roast Level'],'roasted',ratings.iloc[recs[0]]['Coffee Origin'],'by',ratings.iloc[recs[0]]['Roaster'],'.','\n','It could be desribed as:',coffee.iloc[recs[0]].Review)

The coffee you liked was described as: Delicate, lyrically sweet, gently tart. Tea rose, pink grapefruit zest, cocoa nib, fresh-cut oak, wild honey in aroma and cup. Sweet structure with gently bright acidity; plush, satiny mouthfeel. The finish consolidates to richly sweet notes of tea rose and honey with cocoa nib undertones.


Based on your input coffee, I recommend you try the Medium-Light roasted Huila, Colombia by Badbeard’s Microroastery . 
 It could be desribed as: Sweetly tart, floral-driven. Honeysuckle, vanilla bean, cocoa nib, peach, fresh-cut oak in aroma and cup. Crisply sweet structure with gently bright, ripe acidity; very full, viscous mouthfeel. The finish consolidates to notes of peach, vanilla bean and oak.


In [88]:
t = [coffee.iloc[recs[0]].Review]
vt = blindtfidf.transform(t).todense()
tt2 = nmf_model.transform(vt)
tt2

array([[0.02186967, 0.00078154, 0.04314666, 0.01241035, 0.00014772,
        0.00245299, 0.        , 0.        , 0.03365916]])

In [89]:
pairwise_distances(tt1.reshape(1,-1),tt2.reshape(1,-1),metric='cosine')

array([[0.00301434]])