# Setup and imports

In [1]:
import pandas as pd
import numpy as np
import random
from collections import Counter
from __future__ import print_function
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from numpy import linalg as LA
import json
%matplotlib inline
import matplotlib.pyplot as plt


import nltk
import re
from nltk import pos_tag, word_tokenize

# Fetch the NLTK resources this notebook relies on (presumably the tagger model
# behind pos_tag and the tokenizer model behind word_tokenize — confirm against
# the installed NLTK version). The call is a no-op when they are already present.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/donovan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/donovan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text content."""

    def __init__(self):
        # Delegate to the base initialiser so every piece of parser state it
        # sets up (it calls reset() itself) exists, instead of re-creating a
        # subset of it by hand.
        super().__init__(convert_charrefs=True)
        # Legacy attribute from the old `strict` parser mode; kept only so any
        # code that reads it keeps working.
        self.strict = False
        # Text fragments collected by handle_data, in document order.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return `html` with all markup removed and character references decoded.

    Params: {html: String}
    Returns: String
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing text (e.g. after an unterminated '&') in
    # case more input follows; close() forces that buffered text through
    # handle_data so it is not silently lost.
    s.close()
    return s.get_data()

# Tea reviews

In [3]:
# Tea metadata table; the set of its ids is used later to filter reviews.
tea_df = pd.read_csv("clean_data_no_reviews.csv")
tea_ids = set(tea_df['id'].tolist())

In [4]:
def get_flavors(tId):
    """Return the 'flavors' value of the first tea_df row whose id equals tId.

    Raises IndexError when no row matches.
    """
    matching_flavors = tea_df.loc[tea_df['id'] == tId, 'flavors']
    return matching_flavors.tolist()[0]

In [5]:
# Load every review and strip HTML markup from the review text.
unfiltered_tea_revs = pd.read_csv("reviews.csv")
unfiltered_tea_revs['description'] = unfiltered_tea_revs['description'].map(strip_tags)
# Keep only the reviews that belong to a tea present in tea_df.
tea_revs = unfiltered_tea_revs.loc[unfiltered_tea_revs['id'].isin(tea_ids)]

In [6]:
# Group reviews by tea id (ascending) and, within each tea, put the most-liked first.
sorted_revs = tea_revs.sort_values(by=["id", "likes"], ascending=[True, False])

In [7]:
# Keep at most the first three rows per tea id; because sorted_revs is ordered
# by likes descending within each id, these are the three most-liked reviews.
top_sorted_revs = sorted_revs.groupby('id').head(3)

In [8]:
# Persist the top reviews (without the DataFrame index) as UTF-8 CSV.
top_sorted_revs.to_csv("clean_reviews.csv", encoding='utf-8', index=False)

In [9]:
def review_lookup(tea_id):
    """Return the rows of tea_revs for `tea_id`, most-liked review first."""
    reviews_for_tea = tea_revs.loc[tea_revs['id'] == tea_id]
    return reviews_for_tea.sort_values(by='likes', ascending=False)

In [10]:
def reviews_by_id(tea_id):
    """Return the review texts for `tea_id` as a list, most-liked first."""
    ranked_reviews = review_lookup(tea_id)
    review_texts = ranked_reviews['description']
    return review_texts.tolist()

In [11]:
def top_reviews_by_id(tea_id):
    """Return up to the three most-liked review texts for `tea_id`."""
    all_reviews = reviews_by_id(tea_id)
    return all_reviews[:3]

In [12]:
def build_top_reviews():
    """Map every id in tea_ids to the list returned by top_reviews_by_id."""
    return {tea_id: top_reviews_by_id(tea_id) for tea_id in tea_ids}

In [13]:
# tea id -> list of up to three top review texts; used by the cells below.
reviews_dict = build_top_reviews()

# Tokenization and descriptors

In [14]:
# Review texts for the teas we keep, materialised as a plain list so they can
# be handed to the vectorizer. (Replaces a manual append loop and drops an
# unused counter variable.)
tea_descriptions = tea_revs['description']
descriptions = list(tea_descriptions)

In [15]:
def get_adj(tea_id):
    """Return the distinct tokens tagged "JJ" in the top reviews of `tea_id`.

    The reviews stored in reviews_dict are joined with spaces, tokenized and
    POS-tagged as one text; the result is a list with no guaranteed order.
    """
    combined_text = " ".join(reviews_dict[tea_id])
    tagged_tokens = pos_tag(word_tokenize(combined_text))
    unique_adjectives = {token for token, tag in tagged_tokens if tag == "JJ"}
    return list(unique_adjectives)

In [16]:
def get_adjectives(words):
    """Return the set of tokens in `words` that pos_tag labels "JJ"."""
    return {token for token, tag in pos_tag(words) if tag == "JJ"}

In [17]:
def build_vectorizer(max_features, stop_words, max_df=0.4, min_df=10, norm='l2'):
    """Create a TfidfVectorizer with IDF weighting and IDF smoothing enabled.

    Params: {max_features: Integer,
             stop_words: String,
             max_df: Float or Integer,
             min_df: Float or Integer,
             norm: String}
    Returns: TfidfVectorizer
    """
    options = dict(
        stop_words=stop_words,
        max_df=max_df,
        min_df=min_df,
        max_features=max_features,
        norm=norm,
        use_idf=True,
        smooth_idf=True,
    )
    return TfidfVectorizer(**options)

In [18]:
n_feats = 20000
tfidf_vec = build_vectorizer(n_feats, "english", min_df=70)
# Dense document-by-term TF-IDF matrix (one row per review).
doc_by_vocab = tfidf_vec.fit_transform(descriptions).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older versions.
_get_feature_names = (getattr(tfidf_vec, "get_feature_names_out", None)
                      or tfidf_vec.get_feature_names)
# Fetch the vocabulary once and normalise the entries to plain str.
feature_names = [str(name) for name in _get_feature_names()]
index_to_vocab = dict(enumerate(feature_names))
vocab = set(feature_names)

In [19]:
# `vocab` is a set, so its iteration order can change from one kernel to the
# next, and NLTK's perceptron tagger uses neighbouring tokens as features.
# Tagging the words in sorted order makes the resulting adjective set
# reproducible across Restart & Run All.
adjectives = get_adjectives(sorted(vocab))
# Words tagged as adjectives that are not useful as tea descriptors.
remove = ["hot", "cold", "good", "bad", "don", 
          "nice", "little", "steep",  "sweet", 
          "leaf", "second", "shop", "smelled",
          "sugar",  "loose", "new", "sure", 
          "huge", "able", "great", "wasn", 
          "finish", "gift",
          "equivalent", "flavour",
          "imo",  "previous", "small", "old", "doesn",
          "interested", "isn", "red", "long", "attribute"]
# Drop every listed word that is present; absent ones are ignored.
adjectives.difference_update(remove)

In [20]:
len(adjectives)

1187

In [21]:
# Discard one- and two-character tokens. Note: this rebinds `adjectives` from
# the set returned by get_adjectives to a list.
adjectives = [adj for adj in adjectives if len(adj) > 2]

In [22]:
len(adjectives)

1160

In [23]:
def get_top_adj(id):
    """Return the five most frequent adjectives in the reviews of tea `id`.

    Equivalent to get_top_adj_n(id, 5).
    """
    return get_top_adj_n(id, 5)


def get_top_adj_n(id, n):
    """Return the `n` most frequent adjectives in the reviews of tea `id`.

    Every review returned by reviews_by_id(id) is tokenized; tokens that appear
    in the global `adjectives` collection are counted and the `n` most common
    ones are returned capitalized, most frequent first. Fewer than `n` items
    are returned when the reviews contain fewer distinct adjectives.

    NOTE(review): matching is case-sensitive, so a capitalized token such as
    "Sweet" only counts if that exact form is in `adjectives` — confirm this
    is intended.
    """
    # Build a set once per call so each membership test is O(1) even when the
    # global `adjectives` is a list.
    adjective_lookup = set(adjectives)
    counts = Counter(
        token
        for review in reviews_by_id(id)
        for token in word_tokenize(review)
        if token in adjective_lookup
    )
    return [word.capitalize() for word, _count in counts.most_common(n)]

In [24]:
def format_string(l):
    """Join the strings in `l` with ", " (no trailing separator).

    Returns an empty string for an empty input.
    """
    return ", ".join(l)

# Formatting Data

In [25]:
# Fixed ordering of tea ids shared by all the per-tea columns built below.
ids = list(reviews_dict)

In [26]:
# Most-liked review per tea, or "" when the tea has no reviews.
review1 = [revs[0] if len(revs) >= 1 else ""
           for revs in (reviews_dict[tid] for tid in ids)]

In [27]:
# Second most-liked review per tea, or "" when the tea has fewer than two.
review2 = [revs[1] if len(revs) >= 2 else ""
           for revs in (reviews_dict[tid] for tid in ids)]

In [28]:
# Third most-liked review per tea, or "" when the tea has fewer than three.
review3 = [revs[2] if len(revs) >= 3 else ""
           for revs in (reviews_dict[tid] for tid in ids)]

In [29]:
# Per tea: its listed flavors followed by its five most frequent review adjectives.
features_flavors = []
for tid in ids:
    flavor_text = get_flavors(tid)
    adjective_text = format_string(get_top_adj_n(tid, 5))
    features_flavors.append(flavor_text + ", " + adjective_text)

In [30]:
# Per tea: its top review adjectives as one comma-separated string.
features = []
for tid in ids:
    features.append(format_string(get_top_adj(tid)))

In [31]:
features[:10]

['Sour, Bag, Strong, Herbal, Acid',
 'Black, Bag, Grey, Strong, Favorite',
 'Green, Bag, Smooth, Strong, Bags',
 'Strong, Sour, Bag, Herbal, Favorite',
 'Black, Strong, Sweetness, Floral, Delicious',
 'Floral, Butter, Oolong, Cucumber, Light',
 'Floral, Oolong, Creamy, Green, Aroma',
 'Floral, Oolong, Soft, Green, Light',
 'Floral, Green, Oolong, Scent, Deep',
 'Black, Specific, Pleasant, Overall, Brew']

In [32]:
# Assemble the per-tea columns (all aligned with `ids`) into one table.
review_data = dict(
    id=ids,
    review1=review1,
    review2=review2,
    review3=review3,
    features=features,
    features_flavors=features_flavors,
)
review_data_df = pd.DataFrame(review_data)

In [33]:
# Persist the per-tea review features (without the DataFrame index) as UTF-8 CSV.
review_data_df.to_csv("features_data.csv", encoding='utf-8', index=False)

In [34]:
# Attach the review-derived columns to the tea table, aligning both on 'id'
# (DataFrame.join keeps every row of the left frame by default).
joined = tea_df.set_index('id').join(review_data_df.set_index('id'))

In [35]:
# Tea categories that add_tea_types looks for among the title-cased words of a
# tea's 'teaType' value.
tea_types = ["Black", "Chai", "Flowering", "Food", "Fruit", "Green", "Guayusa", 
             "Herbal", "Honeybush", "Matcha", "Oolong", "Pu-Erh", "Rooibos", "White", "Yellow", "Yerba Maté"]

In [36]:
def add_tea_types(tea, known_types=None):
    """Return a ", <Type> Tea" suffix for the first known tea type found in `tea`.

    Params: {tea: String (a non-string value such as NaN yields ""),
             known_types: collection of title-cased type names; defaults to
                          the global `tea_types`}
    Returns: String — ", <Type> Tea" on a match, otherwise "" so the result
             can always be concatenated onto another string.
    """
    if known_types is None:
        known_types = tea_types
    # Missing values (NaN/None) have no words to inspect.
    if not isinstance(tea, str):
        return ""
    words = [word.title() for word in tea.split(" ")]
    # First pass: the first word of `tea` that is itself a known type wins.
    for word in words:
        if word in known_types:
            return ", " + word + " Tea"
    # Second pass: multi-word types (e.g. "Yerba Maté") can never equal a
    # single word, so look for them as whole-word phrases in the full text.
    padded_text = " " + " ".join(words) + " "
    for type_name in known_types:
        if " " in type_name and (" " + type_name + " ") in padded_text:
            return ", " + type_name + " Tea"
    # No match: return an empty suffix rather than None.
    return ""

In [37]:
# Build the tea-type suffix separately and replace missing values with "":
# a missing teaType, or any None result from add_tea_types, would otherwise
# blank out (or break) the concatenation for that row.
# NOTE: this cell is not idempotent — re-running it appends the suffix again.
tea_type_suffix = joined['teaType'].fillna("").apply(add_tea_types).fillna("")
joined['features_flavors'] = joined['features_flavors'] + tea_type_suffix

In [38]:
joined['features_flavors']

id
65303    Floral, Rainforest, Honeysuckle, Orchids, Pepp...
43095    Dark Chocolate, Chocolate, Dates, Malt, Orchid...
78801    Honey, Lemon, Nuts, Flowers, Lemongrass, Stunn...
48058    Cocoa, Dark Chocolate, Malt, Vanilla, Apple, A...
32444    Sweet, Mineral, Nutty, Roasted, Salty, Butter,...
22442    Apricot, Creamy, Mango, Milk, Smooth, Stonefru...
78800    Broccoli, Cut grass, Nutty, Olives, Freshly Cu...
20060    Citrus Zest, Cream, Toasted Rice, Tobacco, Sug...
28779    Tea, Chocolate, Cream, Cocoa, Fruity, Malt, Br...
41785    Chocolate, Cocoa, Malt, Melon, Plums, Sweet Po...
21629    Apricot, Dust, Flowers, Honey, Peach, Chocolat...
40986    Orange, Chocolate, Orange Zest, Creamy, Real, ...
35881    Chocolate, Cocoa, Nutty, Pastries, Roasted, To...
8188     Honey Dew, Green Melons, Honeydew, Melon, Roas...
48327    Cinnamon, Nutmeg, Spices, Stonefruits, Candy, ...
38774    Brown Sugar, Cinnamon, Molasses, Mushrooms, Va...
19426    Flowers, Grapes, Vegetal, White, Creamy, Lig

In [39]:
# Persist the merged table as UTF-8 CSV; the 'id' index is written as the first column.
joined.to_csv("complete.csv", encoding='utf-8')