# Setup and imports

In [1]:
import pandas as pd
import numpy as np
import random
from __future__ import print_function
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import linalg as LA
import json
%matplotlib inline
import matplotlib.pyplot as plt


import nltk
import re
from nltk import pos_tag, word_tokenize
# Fetch the NLTK resources this notebook relies on: the perceptron tagger
# model used by pos_tag and the "punkt" tokenizer data used by word_tokenize.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/benstevens/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/benstevens/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text content.

    Usage: ``feed()`` the document, ``close()`` the parser so that any text it
    is still buffering gets delivered, then read the result with ``get_data()``.
    """

    def __init__(self):
        # Let HTMLParser initialise all of its own state (it calls reset()
        # itself) instead of hand-setting a subset of its attributes.
        # convert_charrefs=True makes the parser decode character references
        # such as "&amp;" before passing text to handle_data().
        super().__init__(convert_charrefs=True)
        self.strict = False  # kept only because the original exposed it
        self.fed = []        # text fragments in document order

    def handle_data(self, d):
        """Record one run of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, concatenated into one string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return ``html`` with markup removed and character references decoded.

    Params: {html: String}
    Returns: String containing only the text content of ``html``
    """
    s = MLStripper()
    s.feed(html)
    # feed() alone may hold back trailing text that ends in a bare "&"
    # (e.g. "R&D") because it could be the start of a character reference
    # continued in a later chunk. close() tells the parser the input is
    # complete so that text is flushed instead of being silently dropped.
    s.close()
    return s.get_data()

# Tea reviews

In [3]:
# Load the cleaned tea catalogue and collect its ids into a set so that
# later cells can filter reviews by membership.
tea_df = pd.read_csv("scraper/clean_data.csv")
tea_ids = set(tea_df['id'])

In [4]:
# Read the raw reviews, replace each description with its tag-free text,
# then keep only the reviews whose id belongs to a tea in the catalogue.
unfiltered_tea_revs = pd.read_csv("reviews.csv")
unfiltered_tea_revs['description'] = (
    unfiltered_tea_revs['description'].apply(strip_tags)
)
tea_revs = unfiltered_tea_revs.loc[unfiltered_tea_revs['id'].isin(tea_ids)]

In [5]:
# Order reviews by tea id (ascending) and, within each tea, by likes
# (descending) so the most-liked reviews of every tea come first.
sorted_revs = tea_revs.sort_values(by=["id", "likes"], ascending=[True, False])

In [6]:
# Keep the first three rows of each tea's group; sorted_revs is ordered by
# likes descending within an id, so these are the three most-liked reviews.
top_sorted_revs = sorted_revs.groupby('id').head(3)

In [7]:
# NOTE: DataFrame.size is the total number of cells (rows x columns), not the
# number of reviews; len(top_sorted_revs) would give the row count.
top_sorted_revs.size

237006

In [8]:
# Persist the top reviews as UTF-8 CSV, without writing the DataFrame index
# as an extra column.
top_sorted_revs.to_csv("clean_reviews.csv", encoding='utf-8', index=False)

In [48]:
def review_lookup(tea_id):
    """Return every review row for ``tea_id``, most-liked first.

    Reads the notebook-level ``tea_revs`` frame. The result is a new
    DataFrame, which is empty when no review carries that id.
    """
    matching_rows = tea_revs.loc[tea_revs['id'] == tea_id]
    return matching_rows.sort_values(by='likes', ascending=False)

In [224]:
def reviews_by_id(tea_id):
    """Return the review texts for ``tea_id`` as a list, most-liked first."""
    ranked_rows = review_lookup(tea_id)
    review_texts = ranked_rows['description']
    return review_texts.tolist()

In [225]:
def top_reviews_by_id(tea_id):
    """Return at most three review texts for ``tea_id``, most-liked first."""
    all_texts = reviews_by_id(tea_id)
    return all_texts[0:3]

In [226]:
def build_top_reviews():
    """Map every id in ``tea_ids`` to its three most-liked review texts.

    Reads the notebook-level ``tea_revs`` and ``tea_ids``. Teas without any
    review map to an empty list, and teas with fewer than three reviews map
    to a shorter list.

    Returns: dict of {tea id: list of up to 3 description strings}
    """
    # Group once instead of re-filtering the whole frame for every tea id.
    # groupby keeps the original row order inside each group, so sorting a
    # group by likes gives the same ranking as filtering by id and sorting.
    top_by_id = {}
    for tea_id, group in tea_revs.groupby('id'):
        ranked = group.sort_values(by='likes', ascending=False)
        top_by_id[tea_id] = ranked['description'].tolist()[:3]
    # Iterate tea_ids so every catalogue id gets a key, reviewed or not.
    return {tea_id: top_by_id.get(tea_id, []) for tea_id in tea_ids}

In [228]:
# Lookup table from tea id to its top review texts; get_adj reads this dict.
reviews_dict = build_top_reviews()

# Tokenization and descriptors

In [255]:
# Collect every review text, in row order, as the corpus for the vectorizer.
tea_descriptions = tea_revs['description']
descriptions = list(tea_descriptions)

In [246]:
def get_adj(tea_id):
    """Return the distinct words tagged "JJ" in the top reviews of ``tea_id``.

    The reviews stored in ``reviews_dict[tea_id]`` are joined with spaces,
    tokenized and POS-tagged as one text. The result is a list with no
    duplicates and no guaranteed order.
    """
    combined_text = " ".join(reviews_dict[tea_id])
    tagged_tokens = pos_tag(word_tokenize(combined_text))
    unique_adjectives = {word for word, tag in tagged_tokens if tag == "JJ"}
    return list(unique_adjectives)

In [61]:
def build_vectorizer(max_features, stop_words, max_df=0.8, min_df=10, norm='l2'):
    """Returns an unfitted TfidfVectorizer configured with the given limits.

    Params: {max_features: Integer, maximum vocabulary size,
             stop_words: String or list, stop-word setting passed through,
             max_df: Float (proportion of documents) or Integer (count),
             min_df: Float (proportion of documents) or Integer (count),
             norm: String, row normalisation passed through}
    Returns: TfidfVectorizer
    """
    # use_idf / smooth_idf are spelled out so the weighting scheme is explicit.
    return TfidfVectorizer(
        stop_words=stop_words,
        max_df=max_df,
        min_df=min_df,
        max_features=max_features,
        norm=norm,
        use_idf=True,
        smooth_idf=True,
    )

In [261]:
# Fit a TF-IDF model on all review texts and derive the vocabulary lookups.
n_feats = 5000
tfidf_vec = build_vectorizer(n_feats, "english")
doc_by_vocab = tfidf_vec.fit_transform(descriptions).toarray()

# Fetch the feature names once. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(), so use whichever
# the installed version provides.
feature_names = list(
    tfidf_vec.get_feature_names_out()
    if hasattr(tfidf_vec, "get_feature_names_out")
    else tfidf_vec.get_feature_names()
)
index_to_vocab = dict(enumerate(feature_names))
vocab = set(feature_names)

# NOTE: get_adjectives is defined in a later cell of this notebook; on a
# fresh kernel that cell has to be run before this line.
adjectives = get_adjectives(vocab)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [258]:
def get_adjectives(words):
    """Return the set of ``words`` that the NLTK tagger labels "JJ".

    The tagger uses neighbouring tokens as context, so token order affects
    the tags. A set of strings iterates in an order that changes between
    interpreter runs (hash randomisation), which made the result differ from
    one kernel restart to the next. Sets are therefore sorted before
    tagging; any other iterable is tagged in the order given.

    Params: {words: iterable of String}
    Returns: set of String
    """
    if isinstance(words, (set, frozenset)):
        tokens = sorted(words)
    else:
        tokens = list(words)
    return {word for word, tag in pos_tag(tokens) if tag == "JJ"}

In [260]:
# Echo the vocabulary words that were tagged as adjectives (the whole set).
adjectives

{'consume',
 'mrs',
 'distracted',
 'bunny',
 'ride',
 'gratitude',
 'sencha',
 'clumps',
 'asian',
 'burnt',
 'eta',
 'david',
 'influenced',
 'alishan',
 'felt',
 'sororitea',
 'busy',
 'bamboo',
 'scent',
 'ebay',
 'moist',
 'cap',
 'necessary',
 'poor',
 'cool',
 'lime',
 'musty',
 'february',
 'update',
 'ridiculous',
 'rare',
 'accessible',
 'curious',
 'distinctive',
 'brendan',
 'impossible',
 'ive',
 'sap',
 'steap',
 'asleep',
 'noticeable',
 'underneath',
 'gorgeous',
 'yabao',
 'decent',
 'mon',
 'fei',
 'human',
 'fifth',
 'expectation',
 'toasty',
 'fennel',
 'ttc',
 'drinkable',
 'useful',
 'indulgent',
 'honeybush',
 'unpleasant',
 'tahitian',
 'regularly',
 'bergamot',
 'delivered',
 'moonlight',
 'fantastic',
 'splendid',
 'premium',
 'cakey',
 'tricky',
 'alcoholic',
 'disappointing',
 'herbaceous',
 'mirk',
 'bronze',
 'english',
 'goodness',
 'potential',
 'cinnamon',
 'idk',
 'bizarre',
 'wednesday',
 'narrow',
 'generous',
 'bright',
 'extraordinary',
 'gal',
 'h