In [1]:
#Importing the packages needed for this analysis
import pandas as pd
import numpy as np
import math
from scipy import stats
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from IPython.core.interactiveshell import InteractiveShell

%matplotlib inline
%config InlineBackend.figure_format = 'png'
InteractiveShell.ast_node_interactivity = "all"
plt.rcParams['figure.dpi']= 400

In [2]:
import json
from pymongo import MongoClient

client = MongoClient()
"""db = client.legislation
tweets = db.news"""

'db = client.legislation\ntweets = db.news'

In [None]:
tastingflavors = {'Spicy':['cocoa','clove', 'vanilla','pepper', 'saffron','nutmeg','licorice','menthol','cinnamon']\
'Char':['ash','tar','toast','wood smoke','tobacco','fireplace','burnt food','grilled food']\
'Sweet':['malt','brown sugar','candy','honey','caramel','molasses','burnt sugar','']}

In [3]:
#reading a pickle file reading to pick up where i left off in case something fails or i start over
with open("tea_data.pkl", 'rb') as picklefile: 
    teareview_dict = pickle.load(picklefile)

In [4]:
reviewcount = []
count =0
for i in teareview_dict:
    count=0
    for j in teareview_dict[i]: 
        count+=len(j['Tea Reviews'])
    reviewcount.append(count)


In [5]:
sum(reviewcount)


6931

## More Data Wrangling

Now that I have the dataframe imported, I am going to use NLP to work with the different reviews on certain teas to create more insights and set it up for unsupervised learning.

### Tasting Considerations
* Aroma: The odor of the tea liquor, also called the nose or fragrance. A complex aroma is often described as a bouquet. 
* Astringency: A lively and mouth-drying effect on the tongue. Not bitter, but a clean and refreshing quality. The sensation of astringency is caused by a reaction between polyphenols (tannins) and the protein in saliva. 
* Body: The tactile aspect of tea’s weight and substance in the mouth, variously subcategorized as light, medium, or full; also known as fullness. 
* Bright: A lively, clean style that refreshes the palate. 
* Character: A tea’s signature attributes depending upon origin, whether of its country, region or type. 
* Clean: Indicates purity of flavor and an absence of any off-tastes. 
* Finish: The lasting taste on your tongue after swallowing the tea. 
* Flowery: A floral nose or flavor associated with high grade teas. 
* Full: References a positive sensation of body and good heft; indicates a well-made tea, possessing color, strength, substance and roundness. 
* Malty: A sweet malt flavor that is characteristic of Assam black teas. 
* Muscatel: A flavor reminiscent of grapes, most often used to describe an exceptional characteristic found in the liquors of the finest Darjeelings. 
* Smooth: Round-bodied, fine-drinking teas. 
* Soft: Smooth, lush, and subsequently often (but not necessarily) timid in flavor; not a negative term. 
* Thick: Describes liquor having substance, but not necessarily strength. 
* Vegetal: A characteristic of green teas that might include grassy, herby or marine flavors.

In [6]:
import nltk
import re

### Polarity Score
Using TextBlob, I will be creating a polarity score for each review.  this will 

In [8]:
polarityscore = []
for i in teareview_dict:
    for j in teareview_dict[i]: 
        for review in j['Tea Reviews']:
            q = TextBlob(review)
            polarityscore.append(q.sentiment.polarity)
        j['Polarity']=polarityscore
        polarityscore=[]

In [9]:
teareview_dict['Black Tea'][0]['Polarity']

[0.24780701754385964,
 0.215,
 0.22731829573934836,
 0.12239389776889774,
 0.3666666666666667,
 0.32083333333333336,
 0.012499999999999999,
 0.14273268398268396,
 0.13819444444444445,
 0.2731600935828877]

In [11]:
playlist = teareview_dict['Black Tea'][0]['Tea Reviews']

In [17]:
supertext=""
for i in playlist:
    supertext += (' '+ i)
supertext = re.sub("[’,;:–…]","", supertext).replace("(", '').replace(".", ' ').replace("!", ' ').replace(")", '')

In [18]:
text = TextBlob(supertext)

In [52]:
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [20]:
sorted(text.word_counts.items(), key = lambda tup:tup[1], reverse = True)

[('the', 63),
 ('a', 59),
 ('i', 52),
 ('of', 40),
 ('to', 39),
 ('and', 38),
 ('is', 36),
 ('this', 36),
 ('it', 28),
 ('for', 25),
 ('but', 24),
 ('tea', 21),
 ('that', 19),
 ('in', 18),
 ('my', 16),
 ('be', 14),
 ('its', 14),
 ('not', 14),
 ('we', 12),
 ('just', 11),
 ('are', 11),
 ('im', 10),
 ('as', 10),
 ('one', 9),
 ('very', 9),
 ('there', 9),
 ('all', 8),
 ('have', 8),
 ('chocolate', 8),
 ('with', 8),
 ('me', 8),
 ('on', 8),
 ('notes', 8),
 ('know', 7),
 ('few', 7),
 ('cup', 7),
 ('will', 7),
 ('at', 6),
 ('pie', 6),
 ('from', 6),
 ('you', 6),
 ('had', 6),
 ('love', 6),
 ('now', 6),
 ('black', 6),
 ('so', 6),
 ('really', 6),
 ('teas', 6),
 ('has', 5),
 ('was', 5),
 ('many', 5),
 ('good', 5),
 ('last', 5),
 ('need', 5),
 ('more', 5),
 ('also', 5),
 ('can', 5),
 ('sweet', 5),
 ('too', 5),
 ('like', 5),
 ('day', 4),
 ('an', 4),
 ('am', 4),
 ('if', 4),
 ('when', 4),
 ('out', 4),
 ('our', 4),
 ('much', 4),
 ('note', 4),
 ('would', 4),
 ('vacation', 4),
 ('which', 4),
 ('work', 4),
 

In [57]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
stop_words = stopwords.words('english')
stop_words = stop_words + ['the','i','I','a','of',')','\'', 'to', 'it','and','is','this','for', 'but', 'that', 'in', 'my', 'not','husband',\
            'be', 'we', 'are', 'm', 'as', 'just', 'there', 'you','all','with','me', 'few', 'will', 'on','has', 'was','many','last'\
              '''()''', "'",'!','.','It',',', '-',':','Thanksgiving','Im','youll','Ive','Its','Also','A','As','This','cant','anybody',\
               'go','one','everybody','dont', 'We', 'us', 'got', 'And']

word_tokens = word_tokenize(supertext)
sentence = ''
filtered_sentence += [w for w in word_tokens if not w in stop_words]
for i in filtered_sentence:
    sentence += (' '+i)
 

print(filtered_sentence)




In [40]:
text1 = TextBlob(sentence)

In [45]:
adjlist = []
for i in text1.tags:
    if i[1]=='JJ' or i[1]=='JJR'or i[1]=='JJS':
        adjlist.append(i[0])

In [46]:
adjlist

['Happy',
 'American',
 'everywhere',
 'grateful',
 'everyday',
 'important',
 'least',
 'grateful',
 'wish',
 'wonderful',
 'pie',
 'free',
 'decorative',
 'vegan',
 'pie',
 'silken',
 'real',
 'tofu',
 'care',
 'good',
 'several',
 'crust',
 'pie',
 'last',
 'several',
 'bold',
 'tasty',
 'foresee',
 'old',
 'favourite',
 'tea',
 'mere',
 'shadow',
 'true',
 'much',
 'free',
 'bit',
 'fruitiness',
 'detailed',
 'crazy',
 'huge',
 'unbroken',
 'Fabulous',
 'second',
 'astringent',
 'spite',
 'good',
 're-steeped',
 'proper',
 'busy',
 'real',
 'ton',
 'new',
 'peek',
 'similar',
 'wrong',
 'everyday',
 'tea',
 'sweet',
 'astringent',
 'current',
 'everyday',
 'high',
 'first',
 'fantastic',
 'little',
 'low',
 'able',
 'scarce',
 'close',
 'finishing',
 'teas',
 'arent',
 'rest',
 'hear',
 'deeper',
 'much',
 'harvest',
 'last',
 'stronger',
 'stronger',
 'less',
 'fantastic',
 'upgrade',
 'detectable',
 'previous',
 'amazing',
 'last',
 'fill',
 'top',
 'light',
 'best',
 'extract',


In [42]:
text1.noun_phrases

WordList(['gratitude', 'american holiday hope', 'everywhere things', 'wonderful things', 'day making', 'pie crusts gluten', 'chocolate midnight originally chocolate almond midnight', 'praline almond', 'millenium cookbook', 'decadent chocolate cheesecake', 'silken tofu', 'silken tofu ‘', 'vegan years years', 'baby', 'tofu truth', 'didnt care good', 'part family feast year', 'gluten crust', 'crust making pie filling making', 'hearty breakfast', 'stacys taiwanese assam', 'times bold tasty', 'butiki', 'old favourite morning', 'wonder tea', 'mere shadow', 'true deliciousness', 'wowwwww', 'definitely', 'free ounce', 'strawberry oolong', 'laoshan black', 'malty chocolatey theres', 'timolino', 'crazy huge thin wiry unbroken', 'fabulous', 'astringent spite 3-minutes infusion', 'rating', 'proper evaluation week', 'vacation mid week', 'work vacation', 'office closure', 'weeks time', 'real vacation', 'ton tea', 'new teas', 'sneak peek', 'hattialli', 'paw', 'hattialli', 'lionchanging', 'cream chees