# Setup and imports

In [101]:
import pandas as pd
import numpy as np
import random
from collections import Counter
from __future__ import print_function
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from numpy import linalg as LA
import json
%matplotlib inline
import matplotlib.pyplot as plt


import nltk
import re
from nltk import pos_tag, word_tokenize

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/benstevens/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/benstevens/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [75]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

# Tea reviews

In [76]:
tea_df = pd.read_csv("scraper/clean_data.csv")
tea_ids = set(tea_df['id'])

In [77]:
def get_flavors(tId):
    return list(tea_df[tea_df['id'] == tId]['flavors'])[0]

In [78]:
unfiltered_tea_revs = pd.read_csv("reviews.csv")
unfiltered_tea_revs['description'] = unfiltered_tea_revs['description'].apply(strip_tags)
tea_revs = unfiltered_tea_revs[unfiltered_tea_revs.id.isin(tea_ids)]

In [79]:
sorted_revs = tea_revs.sort_values(by=["id", "likes"], ascending=[True, False])

In [80]:
top_sorted_revs = sorted_revs.groupby('id').head(3)

In [81]:
top_sorted_revs.to_csv("clean_reviews.csv", encoding='utf-8', index=False)

In [82]:
def review_lookup(tea_id):
    return tea_revs[tea_revs['id'] == tea_id].sort_values(by='likes',ascending=False)

In [83]:
def reviews_by_id(tea_id):
    #return list(zip(review_lookup(tea_id)['description'].tolist(), review_lookup(tea_id)['ratingValue'].tolist()))
    return review_lookup(tea_id)['description'].tolist()


In [84]:
def top_reviews_by_id(tea_id):
    return reviews_by_id(tea_id)[:3]

In [85]:
def build_top_reviews():
    review_dict = {}
    for tea_id in tea_ids:
        review_dict[tea_id] = top_reviews_by_id(tea_id)
    return review_dict

In [86]:
reviews_dict = build_top_reviews()

# Tokenization and descriptors

In [87]:
tea_descriptions = tea_revs['description']
descriptions = []
i = 0
for desc in tea_descriptions:
    descriptions.append(desc)

In [88]:
def get_adj(tea_id):
    return list(set([t[0] for t in pos_tag(word_tokenize(" ".join(reviews_dict[tea_id]))) if t[1] == "JJ"]))

In [89]:
def get_adjectives(words):
    return set([t[0] for t in pos_tag(words) if t[1] == "JJ"])

In [107]:
def build_vectorizer(max_features, stop_words, max_df=0.4, min_df=10, norm='l2'):
    """Returns a TfidfVectorizer object
    
    Params: {max_features: Integer,
             max_df: Float,
             min_df: Float,
             norm: String,
             stop_words: String}
    Returns: TfidfVectorizer
    """
    # YOUR CODE HERE
    v = TfidfVectorizer(stop_words=stop_words, max_df=max_df, min_df=min_df, max_features=max_features, norm=norm, use_idf=True, smooth_idf=True)
    return v
    
    
    raise NotImplementedError()

In [108]:
n_feats = 20000
tfidf_vec = build_vectorizer(n_feats, "english", min_df=70)
doc_by_vocab = tfidf_vec.fit_transform(d for d in descriptions).toarray()
index_to_vocab = {i:v for i, v in enumerate(tfidf_vec.get_feature_names())}
vocab = {v for i, v in enumerate(tfidf_vec.get_feature_names())}

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [113]:
len(adjectives)

1180

In [165]:
adjectives = get_adjectives(vocab)
remove = ["hot", "cold", "good", "bad", "don", 
          "nice", "little", "steep",  "sweet", 
          "leaf", "second", "shop", "smelled",
          "sugar",  "loose", "new", "sure", 
         "huge", "able", "great", "wasn", 
          "finish", "gift",
           "equivalent", "flavour",
         "imo",  "previous", "small", "old", "doesn",
           "interested", "isn", "red", "long", "attribute"]
for r in remove:
    adjectives.remove(r)

In [166]:
def get_top_adj(id):
    toks = []
    for rev in reviews_by_id(id):
        toks.extend(word_tokenize(rev))
    return [t.capitalize() for (t,n) in Counter([w for w in toks if w in adjectives]).most_common(5)]
def get_top_adj_n(id, n):
    toks = []
    for rev in reviews_by_id(id):
        toks.extend(word_tokenize(rev))
    return [t.capitalize() for (t,n) in Counter([w for w in toks if w in adjectives]).most_common(n)]

In [167]:
def format_string(l):
    s = ""
    for w in l:
        s += w + ", "
    return s[:-2]

# Formatting Data

In [146]:
ids = list(reviews_dict.keys())

In [147]:
review1 = [("" if len(reviews_dict[tid]) < 1 else reviews_dict[tid][0]) for tid in ids]

In [148]:
review2 = [("" if len(reviews_dict[tid]) < 2 else reviews_dict[tid][1]) for tid in ids]

In [149]:
review3 = [("" if len(reviews_dict[tid]) < 3 else reviews_dict[tid][2]) for tid in ids]

In [150]:
features_flavors = [get_flavors(tid) + ", " + format_string(get_top_adj_n(tid, 10)) for tid in ids]

In [168]:
features = [format_string(get_top_adj(tid)) for tid in ids]

In [169]:
features

['Sour, Tart, Strong, Herbal, Right',
 'Black, Bitter, Grey, Standard, Strong',
 'Green, Bitter, Strong, Light, Floral',
 'Tart, Strong, Sour, Herbal, Delicious',
 'Floral, Middle, Subtle, Apparent, Cream',
 'Black, Strong, Bold, Depth, Delicious',
 'Floral, Green, Deep, Unique, Dry',
 'Floral, Creamy, Green, Cream, Frosting',
 'Floral, Soft, Green, Cream, Light',
 'Floral, Green, Strong, Creamy, Grassy',
 'Green, Light, Dry, Fresh, Gongfu',
 'Green, Floral, Deep, Delicious, Fresh',
 'Strong, Herbal, Caffeine, Light, Mint',
 'Black, Floral, Light, Dry, Brown',
 'Black, List, Half, Light, Soft',
 'Surprised, Flush, Mild, Local, Cafe',
 'Black, Strong, Decaf, Grey, Sachet',
 'Creamer, Odd, Appropriate, Gentle, Funny',
 'Mint, Herbal, Tropical, Medicinal, Surprising',
 'Tangy, Tart, Big, Light, Bright',
 'Black, Strong, Bitter, Assam, Perfect',
 'Bitter, Different, Bonus, Excessive, Infuser',
 'Mango, Delicious, Black, Bitter, Free',
 'White, Light, Needle, Subtle, Floral',
 'Black, Bitte

In [170]:
review_data = {'id': ids, 'review1': review1, 'review2': review2, 'review3': review3, 'features': features, 'features_flavors': features_flavors}
review_data_df = pd.DataFrame(data=review_data)

In [495]:
joined = tea_df.set_index('id').join(review_data_df.set_index('id'))


In [171]:
review_data_df.to_csv("features_data.csv", encoding='utf-8', index=False)

In [102]:
features

NameError: name 'features' is not defined