In [None]:
#Importing the packages needed for this analysis
import pandas as pd
import numpy as np
import math
from scipy import stats
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer


%matplotlib inline
%config InlineBackend.figure_format = 'png'
plt.rcParams['figure.dpi']= 400

In [None]:
# Hard-coded vocabulary for the hybrid recommender: each flavor category maps to
# the lowercase words / phrases that count as evidence for that category.
# Fixed two misspelled entries ('metalllic', 'seawood') that could never match
# correctly spelled review text.
tastingflavors = {
    'Spicy': ['cocoa','clove', 'vanilla','pepper', 'saffron','nutmeg','licorice','menthol','cinnamon'],
    'Char': ['ash','tar','toast','wood smoke','tobacco','fireplace','burnt food','grilled food'],
    'Sweet': ['malt','brown sugar','candy','honey','caramel','molasses','burnt sugar','maple syrup','sweet'],
    'Nutty': ['almond', 'peanut','walnut','chestnut','hazelnut', 'roasted nuts'],
    'Floral': ['rose','hops','orchid','violet','jasmine','perfume','geranium','dandelion','honeysuckle','lily of the valley','orange blossom'],
    'Herbs': ['thyme','parsley','cardamom','eucalyptus','fennel seed','coriander'],
    'Vegetables': ['spinach','broccoli','zucchini','asparagus','garden peas','green pepper','squash blossom'],
    'Grass': ['stems','straw','barnyard','grapeseed','fresh cut grass','grass'],
    'Wood': ['evergreen','bark','cedar','resin','freshly cut wood','sawdust','wet wood','driftwood','green wood','cherry wood'],
    'Earth': ['peat','moss','musty','leather','compost','wet earth','forest floor','decaying wood'],
    'Mineral': ['salt', 'metallic', 'wet rocks'],
    'Marine': ['seaweed','ocean air'],
    'Berry': ['raspberry','strawberry','blackberry', 'black currant'],
    'Citrus': ['lemon','orange','grapefruit','citrus zest'],
    'Tree Fruit': ['peach','pear','apricot','apple','cooked fruit','dried fruit'],
    'Tropical': ['mango','melon','lychee', 'banana','pineapple'],
    'Malolactic': ['butter'],
}

# Coarser grouping of some of the categories above into flavor-wheel families.
wheel = {
    'Earthy': ['Wood','Earth','Mineral','Marine'],
    'Vegetal': ['Grass','Vegetables','Herbs'],
    'Fruity': ['Berry','Citrus','Tree Fruit','Tropical'],
}


In [None]:
# Reload the pickled object into teareview_dict so the notebook can pick up where it
# left off if something fails or the kernel is restarted.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you created yourself.
with open("tea_data.pkl", 'rb') as picklefile: 
    teareview_dict = pickle.load(picklefile)

In [None]:
# Load the pickled list of tea dictionaries (one dictionary per tea) into tea_list.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('items_data.pkl', 'rb') as picklefile:
    tea_list = pickle.load(picklefile)

In [None]:
# Load the pickled user data. Despite the name, user_list is described as a dict:
# user name as key, with 3 lists as the value.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('user_data.pkl', 'rb') as picklefile:

    user_list = pickle.load(picklefile)

In [None]:
# Remove repeated tea records, keeping the first occurrence of each in its
# original order. Records are compared by equality, hence the membership scan.
new_list = []
for tea_record in tea_list:
    if tea_record in new_list:
        continue
    new_list.append(tea_record)

In [None]:
#initializing Mongo Client
import json
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.tea_database


In [None]:
# Store the user data in the `users` collection.
# NOTE(review): the original remark here says the document is too large to insert all
# at once. insert_one() sends the whole user_list object as ONE document, so this
# presumably fails on the full data set -- confirm, and consider one document per user.
users = db.users
post_id = users.insert_one(user_list).inserted_id

In [None]:
teas = db.teas

In [None]:
# Show the collections that exist in the database. The original line referenced
# the method without calling it (so only the bound-method repr was displayed);
# list_collection_names() is the supported replacement for collection_names().
db.list_collection_names()

In [None]:
# Insert every tea dictionary into the `teas` collection in one bulk call.
# NOTE(review): this inserts tea_list, not the de-duplicated new_list -- confirm which
# one is intended.
tearesults = teas.insert_many(tea_list)

In [None]:
# Count the reviews per tea type: for every key of teareview_dict, add up the
# number of 'Tea Reviews' entries over all teas stored under that key.
reviewcount = []
count = 0
for tea_type in teareview_dict:
    count = sum(len(tea['Tea Reviews']) for tea in teareview_dict[tea_type])
    reviewcount.append(count)


In [None]:
teareview_dict['Black Tea'][0]

In [None]:
# Build the item frame and strip the characters ! @ # $ ' " , from every tea
# name so the names are easier to pass around in the Flask app.
import re

itemdf = pd.DataFrame(tea_list)
newname = [re.sub('[!@#$\'\",]', '', raw_name) for raw_name in itemdf['Tea Name']]
itemdf['Tea Name'] = newname

In [None]:
itemdf.head()

In [None]:
# Build the user frame: one row per key of user_list (the key becomes the index),
# then name the three resulting columns.
userdf = pd.DataFrame.from_dict(user_list, orient='index')
userdf.columns = ['tea links','Tea Names', 'Score']

In [None]:
userdf.head()

In [None]:
#left in case I decided to take a different decomposition approach
"""for user,i in zip(userdf['Tea Names'], userdf.index):
    for ind,j in enumerate(user):
        teascore= userdf.loc[i,'Score'][ind]
        if teascore=='/span':
            teascore=0
        teascore_list.append(teascore)
userdf = userdf.fillna(0)"""

In [None]:
userdf.head()

## More Data Wrangling

Now that I have the dataframe imported, I am going to use NLP to work with the different reviews on certain teas to create more insights and set it up for unsupervised learning.

### Tasting Considerations
* Aroma: The odor of the tea liquor, also called the nose or fragrance. A complex aroma is often described as a bouquet. 
* Astringency: A lively and mouth-drying effect on the tongue. Not bitter, but a clean and refreshing quality. The sensation of astringency is caused by a reaction between polyphenols (tannins) and the protein in saliva. 
* Body: The tactile aspect of tea’s weight and substance in the mouth, variously subcategorized as light, medium, or full; also known as fullness. 
* Bright: A lively, clean style that refreshes the palate. 
* Character: A tea’s signature attributes depending upon origin, whether of its country, region or type. 
* Clean: Indicates purity of flavor and an absence of any off-tastes. 
* Finish: The lasting taste on your tongue after swallowing the tea. 
* Flowery: A floral nose or flavor associated with high grade teas. 
* Full: References a positive sensation of body and good heft; indicates a well-made tea, possessing color, strength, substance and roundness. 
* Malty: A sweet malt flavor that is characteristic of Assam black teas. 
* Muscatel: A flavor reminiscent of grapes, most often used to describe an exceptional characteristic found in the liquors of the finest Darjeelings. 
* Smooth: Round-bodied, fine-drinking teas. 
* Soft: Smooth, lush, and subsequently often (but not necessarily) timid in flavor; not a negative term. 
* Thick: Describes liquor having substance, but not necessarily strength. 
* Vegetal: A characteristic of green teas that might include grassy, herby or marine flavors.

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
teareview_dict['Green Tea'][0].keys()

### Polarity Score
Using TextBlob, I will be creating a polarity score for each review.  This is to help weight the reviews if needed. 

In [None]:
# Attach a 'Polarity' list to every tea: one TextBlob sentiment polarity value
# per review, in the same order as the tea's 'Tea Reviews'.
for tea_type in teareview_dict:
    for tea in teareview_dict[tea_type]:
        tea['Polarity'] = [TextBlob(review).sentiment.polarity
                           for review in tea['Tea Reviews']]
polarityscore = []

In [None]:
teareview_dict['Black Tea'][0]['Polarity']

### Proportion Flavor Wheel and Mouthfeel

Using the flavor wheel found online, I will create a flavor profile for each tea. To do this I will match both single words (unigrams) and two-word phrases (bigrams) from the reviews against the `tastingflavors` dictionary.

I also want to get any 'mouthfeel' data to see what added bonus it could give to the tea. 

In [None]:
#creating flavor profile based on the tastingflavors dict, stored as proportions
def getProp(text1):
    """Build a flavor profile for a piece of text.

    Adjectives and nouns in the text (POS tags JJ/JJR/JJS/NN/NNP/NNS) are
    matched against the single-word entries of ``tastingflavors``; multi-word
    entries are matched against word n-grams of the text.

    Changes from the previous version:
      * The "strip the y" step operated on the first character of the category
        key, so it never fired. It now derives a stem from each tagged word
        ("malty" -> "malt", "peppery" -> "pepper") and matches that as well.
      * Entries longer than two words (e.g. "fresh cut grass") could never
        match because only bigrams were generated; n-grams up to the longest
        entry are now checked.
      * Tagging / n-gram extraction is done once instead of once per category.

    Args:
        text1 (str): Review text or customer-supplied flavor text.

    Returns:
        tuple: ``(proportion_dict, adjlist)``. ``proportion_dict`` maps every
        category to its share of all matches (raw zero counts when nothing
        matched). ``adjlist`` is the lowercased list of tagged adjectives and
        nouns, unchanged from the previous behaviour (no stems added).
    """
    blob = TextBlob(text1)
    wanted_tags = {'JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS'}
    adjlist = [word.lower() for word, tag in blob.tags if tag in wanted_tags]

    # Terms used for single-word matching: the words themselves plus, for words
    # ending in "y", the stem without that trailing "y".
    candidates = list(adjlist)
    candidates.extend(word[:-1] for word in adjlist
                      if len(word) > 1 and word.endswith('y'))

    # Multi-word phrases: every n-gram from 2 words up to the longest entry.
    longest = max((len(flavor.split())
                   for flavors in tastingflavors.values()
                   for flavor in flavors), default=1)
    phrases = [' '.join(gram.lower())
               for size in range(2, longest + 1)
               for gram in blob.ngrams(n=size)]

    proportion_dict = {}
    tot_count = 0
    for category, flavors in tastingflavors.items():
        flavor_set = set(flavors)
        count = sum(1 for term in candidates if term in flavor_set)
        count += sum(1 for phrase in phrases if phrase in flavor_set)
        proportion_dict[category] = count
        tot_count += count

    # Normalise to proportions; leave the zero counts alone when nothing matched.
    if tot_count != 0:
        for category in proportion_dict:
            proportion_dict[category] = proportion_dict[category] / tot_count

    return proportion_dict, adjlist

In [None]:
#function pulling out mouthfeel data and filtering out unnecessary data, input is the revlist
def getReviews(revlist):
    """Merge a tea's reviews into one cleaned string and extract mouthfeel adjectives.

    Changes from the previous version:
      * One regex is used to find "mouthfeel" clauses. The old gate pattern
        contained a malformed character class and disagreed with the pattern
        that did the extraction.
      * Mouthfeel clauses are collected from every review instead of keeping
        only the last review that mentioned mouthfeel.
      * The stop-word filtering was removed: its result was never returned,
        and it relied on ``stop_words`` being defined much later in the
        notebook, which breaks a top-to-bottom run.

    Args:
        revlist (iterable of str): The reviews of one tea.

    Returns:
        tuple: ``(supertext, mouthadj)`` -- the concatenated, cleaned review
        text and the list of adjectives (JJ/JJR/JJS) found in clauses that
        mention "mouthfeel".
    """
    clause_pattern = r"([^.!,]*?mouthfeel[^.!,]*\.)"

    mouthfeel_clauses = []
    for review in revlist:
        mouthfeel_clauses.extend(re.findall(clause_pattern, review))

    # Same result as repeated `supertext += ' ' + review`, without quadratic copying.
    supertext = ''.join(' ' + review for review in revlist)
    supertext = re.sub("[’,;:–…]","", supertext).replace("(", '').replace(".", ' ').replace("!", ' ').replace(")", '')
    supertext = re.sub("(-)"," ", supertext)
    # Reviews say "chocolate"; the flavor vocabulary lists it as "cocoa".
    supertext = re.sub("(chocolate)","cocoa", supertext)

    mouthblob = TextBlob(' '.join(mouthfeel_clauses))
    mouthadj = [word for word, tag in mouthblob.tags if tag in ('JJ', 'JJR', 'JJS')]

    return supertext, mouthadj

In [None]:
# Create a flavor profile for every tea from both its reviews and the flavors filled
# in by customers, plus a mouthfeel profile when one is found.
# getProp returns (profile, word_list); only the profile dict is stored here so that
# the 'Flavor Profile ...' entries can later be expanded into one column per category.
newdictlist = []
for tea_type in teareview_dict:
    newdictlist = []
    for tea in teareview_dict[tea_type]:
        reviewtext, mouthfeel = getReviews(tea['Tea Reviews'])
        proportions, _ = getProp(reviewtext)
        # Empty / missing customer flavors yield an all-zero profile.
        flavtext = tea['Tea Flavors'] if tea['Tea Flavors'] else ''
        custproportions, _ = getProp(flavtext)
        dict2 = tea.copy()
        dict2['Mouthfeel'] = mouthfeel
        dict2['Flavor Profile Cust'] = custproportions
        dict2['Flavor Profile Reviews'] = proportions
        dict2['Reviews Supertext'] = reviewtext
        newdictlist.append(dict2)
    teareview_dict[tea_type] = newdictlist

In [None]:
"""with open('totstea_data.pkl', 'wb') as picklefile:
    pickle.dump(teareview_dict, picklefile)"""

In [None]:
from pprint import pprint
for i in teareview_dict['Green Tea']:
    pprint(i)

## KMeans

Using Kmeans to cluster my data to create recommendations based on an input vector.

In [None]:
#Importing important tools for clustering with scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score

from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Bag-of-words vectorizer: drops terms that occur in more than 95% of the documents
# or in fewer than 2 documents.
vect = CountVectorizer(max_df=.95, min_df=2)
# NOTE(review): SVD here is the class imported from `surprise` in this cell, not a
# scikit-learn decomposition; tsvd does not appear to be used in the visible notebook -- confirm.
tsvd = SVD()

In [None]:
# Reload the enriched review data saved earlier as totstea_data.pkl, replacing the
# in-memory teareview_dict, so the notebook can resume from this point.
# NOTE(review): absolute, user-specific path -- this cell only works on the author's machine.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("/Users/deven/Documents/pickleddata/projectfletcher/totstea_data.pkl", 'rb') as picklefile: 
    teareview_dict = pickle.load(picklefile)

In [None]:
# Stack the per-tea-type record lists into a single dataframe.
# The frames are built first and concatenated once; growing a frame with concat
# inside a loop copies all previous rows on every iteration.
type_frames = [pd.DataFrame.from_dict(teareview_dict[tea_type]) for tea_type in teareview_dict]
# pd.concat raises on an empty list, so fall back to an empty frame as before.
teadf = pd.concat(type_frames, ignore_index=True) if type_frames else pd.DataFrame()

In [None]:
# Flavor-profile frame: expand the 'Flavor Profile Cust' entries into their own dataframe.
teaflavdf = pd.DataFrame(teadf['Flavor Profile Cust'].tolist())


In [None]:
#combining dataframes
teadf.reset_index(drop=True,inplace=True)
teadf = pd.concat([teadf,teaflavdf], axis=1)

In [None]:
teadf.drop('Flavor Profile Reviews', axis=1, inplace=True)


In [None]:
#copying df to experiment with
playset = teaflavdf.copy()
#teaflavdf=pd.concat([teaflavdf,teaflavdf2], axis=1)

In [None]:
# Fit KMeans with 14 clusters on the flavor profiles.
# random_state is fixed (same seed as the silhouette sweep further down) so the
# cluster assignments -- and the recommendations built on them -- are reproducible.
km = KMeans(n_clusters=14, random_state=1)
km.fit(playset)

In [None]:
# Keep handles on the fitted model's outputs and define a sample customer vector.
mu_digits = km.cluster_centers_  # cluster centroids of the fitted model
kmlabels = km.labels_  # cluster label of each row km was fitted on
# Hand-entered example preference vector with 17 values; presumably one weight per
# flavor column of playset, in column order -- TODO confirm.
custpref = [ 0,  1.17647059e-02,  0, 0,  0,  0, -4.33680869e-19,  6.93889390e-18,  1.35525272e-20,\
         4.33680869e-19,  5.98930481e-02,  3.46944695e-18, 7.76470588e-02,  6.93889390e-18,  0, 8.50695187e-01,  8.67361738e-19]

In [None]:
#defining a function to find the closest teas to a specific flavor profile based on euclidean distance, returns (index, dist) pair
def Rec(labels, clstr,cust):
    """Distance from a customer profile to every tea in one cluster.

    Reads the module-level ``playset`` frame, whose rows are assumed to be in
    the same order as ``labels``.

    Changes from the previous version: distances are computed in one
    vectorised call (previously two calls per row), the unused sorted copy is
    gone, and each distance is a plain float instead of a one-element array.

    Args:
        labels: Cluster label for each row of ``playset``.
        clstr: The cluster to search.
        cust: Customer flavor vector with one value per ``playset`` column.

    Returns:
        list of (index, distance) tuples, one per tea in the cluster, in
        ``playset`` row order (not sorted by distance).
    """
    member_positions = [pos for pos, label in enumerate(labels) if label == clstr]
    if not member_positions:
        # Nothing in this cluster; euclidean_distances rejects empty input.
        return []
    members = playset.iloc[member_positions, :]
    distances = euclidean_distances(members, [cust]).ravel()
    return [(idx, float(dist)) for idx, dist in zip(members.index, distances)]

In [None]:
tearecs = Rec(kmlabels,km.predict([custpref])[0],custpref)

In [None]:
#Defining a function that takes tea rec (index,dist), then pulls the tea names based on smallest dist values
def getTeaNames(tearec, n=3):
    """Return the names of the teas with the smallest distances.

    Args:
        tearec: Iterable of (index, distance) pairs as produced by ``Rec``.
            The index is used as a row position into the module-level ``teadf``.
        n (int): How many teas to return. Defaults to 3, the previously
            hard-coded value.

    Returns:
        The 'Tea Name' values of the ``n`` closest teas, nearest first.
    """
    closest = sorted(tearec, key=lambda pair: pair[1])[:n]
    positions = [pair[0] for pair in closest]
    return teadf.iloc[positions, :]['Tea Name']


In [None]:
teanames = getTeaNames(tearecs)
teanames

### Determining amount of Clusters

Using the silhouette score to find the optimal number of clusters. I am also testing which clustering method works best for my dataset.

In [None]:
#importing functions
from sklearn.cluster import SpectralClustering, AgglomerativeClustering

In [None]:
#initializing functions
sc = SpectralClustering()
ac = AgglomerativeClustering()

In [None]:
# NOTE(review): at this point `db` is the MongoDB database handle (client.tea_database),
# so this does not show cluster labels; it looks like a leftover from a clustering
# experiment -- confirm and remove or replace.
db.labels_

In [None]:
# Spectral clustering: silhouette score for k = 2..19 clusters.
# The earlier version also plotted `sc.inertia_`, but SpectralClustering has no
# inertia_ attribute, so the cell raised before the labels and ticks were set.
k_clusters = range(2,20)
Sil_coefs = []
for k in k_clusters:
    sc = SpectralClustering(n_clusters = k)
    sc.fit(teaflavdf)
    labels = sc.labels_
    Sil_coefs.append(metrics.silhouette_score(teaflavdf, labels, metric='euclidean'))
fig, ax1 = plt.subplots(1,1, figsize=(15,5), sharex=True)
ax1.plot(k_clusters, Sil_coefs)
ax1.set_title('Spectral Cluster')
ax1.set_xlabel('number of clusters')
ax1.set_ylabel('silhouette coefficient')
plt.xticks(np.arange(2, 20, step=2))

In [None]:
# Agglomerative clustering: silhouette score for k = 2..19 clusters, then plot
# the score against k.
Sil_coefs = []
for k in range(2,20):
    ac = AgglomerativeClustering(n_clusters = k)
    labels = ac.fit(teaflavdf).labels_
    score = metrics.silhouette_score(teaflavdf, labels, metric='euclidean')
    Sil_coefs.append(score)

k_clusters = range(2,20)
fig, ax1 = plt.subplots(1,1, figsize=(15,5), sharex=True)
ax1.plot(k_clusters, Sil_coefs)
ax1.set(xlabel='number of clusters',
        ylabel='silhouette coefficient',
        title='Agg Cluster')
plt.xticks(np.arange(2, 20, step=2))

In [None]:
# KMeans clustering: silhouette score for k = 2..19 clusters.
# The sweep uses its own variable (km_k) so it no longer overwrites `km`, the
# 14-cluster model whose labels and centroids are used elsewhere in the notebook.
Sil_coefs = []
for k in range(2,20):
    km_k = KMeans(n_clusters=k, random_state=1)
    km_k.fit(teaflavdf)
    labels = km_k.labels_
    Sil_coefs.append(metrics.silhouette_score(teaflavdf, labels, metric='euclidean'))

In [None]:
# Plot the silhouette coefficients collected in Sil_coefs against k = 2..19.
k_clusters = range(2,20)
fig, ax1 = plt.subplots(1,1, figsize=(15,5), sharex=True)
ax1.plot(k_clusters, Sil_coefs)
ax1.set(xlabel='number of clusters',
        title='KMeans Cluster',
        ylabel='silhouette coefficient')
plt.xticks(np.arange(2, 20, step=2))


### Flavor Profile PCA

Looking at the distribution of flavor profile data if fitted to 2 features

In [None]:
teadf = teadf.set_index('Tea Name')

In [None]:
# CountVectorizer expects one string per document. Iterating a DataFrame yields
# its column labels, so passing the two-column frame vectorised only the two
# column names. Combine both text columns into one document per tea instead.
review_docs = (teadf['Reviews Supertext'].fillna('').astype(str)
               + ' '
               + teadf['Tea Flavors'].fillna('').astype(str))
reviewvect = vect.fit_transform(review_docs)
#flavvect = vect.fit_transform(teadf['Tea Flavors'])

In [None]:
# The earlier version referenced undefined names (example, vectorizer, dtm).
# Use the objects that exist here: reviewvect, its vectorizer `vect`, and teadf's index.
dtm = reviewvect.asfptype()
# Preview the first 10 documents; only those rows are converted to a dense array.
pd.DataFrame(reviewvect[:10].toarray(), index=teadf.index[:10], columns=vect.get_feature_names())

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)

In [None]:
principalComponents = pca.fit_transform(teaflavdf)
X = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2'])

In [None]:
# Scatter the two principal components, coloured by the saved KMeans labels (kmlabels).
plt.scatter(X['principal component 1'], X['principal component 2'], c=kmlabels, s=50, cmap='viridis')

# NOTE(review): `centers` is not used below, and `km` may have been refitted by the
# silhouette sweep since kmlabels was captured -- confirm before plotting centroids.
centers = km.cluster_centers_
plt.xlabel('PC1')
plt.ylabel('PC2')

In [None]:
# The PCA result frame is named X in this notebook; `principalDf` is never defined.
X.columns

## SVD with Surprise

In [None]:
import pandas as pd

from surprise import NormalPredictor
from surprise import SVDpp,SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
userdf.head()

In [None]:
# Flatten userdf into long format for surprise: one row per (user, tea, score).
# A score scraped as the literal '/span' is stored as 0, and tea names get the
# same punctuation stripping that is applied to the item frame.
teascore_list = []
teauser_list = []
teaname_list = []

for user_name, tea_names in zip(userdf.index, userdf['Tea Names']):
    for pos, tea_name in enumerate(tea_names):
        raw_score = userdf.loc[user_name, 'Score'][pos]
        teascore_list.append(0 if raw_score == '/span' else raw_score)
        teauser_list.append(user_name)
        teaname_list.append(re.sub('[!@#$\'\",]', '', tea_name))

long_columns = {
    'Tea Name': teaname_list,
    'Score': teascore_list,
    'User Name': teauser_list,
}
newdf = pd.DataFrame(long_columns)

In [None]:
"""with open('surprise_data.pkl', 'wb') as picklefile:
    pickle.dump(newdf, picklefile)"""

In [None]:
# Hard-coded survey participants (13 names) and the teas they were asked about (12 teas).
# NOTE(review): `teas` was previously bound to the MongoDB collection db.teas; this
# rebinds the name to a plain list -- confirm nothing later needs the collection.
names = ['maya','THE Jonathan', 'Kelly', 'Amy', 'Sakura', 'Dan','Anonymous','Travis', 'Chad', 'the_og_jonathan','Vicky', 'Cyrus', 'Deven']
teas = ['Irish Breakfast','Earl Grey', 'Pre Rain Organic Dragon Well Supreme (Long Jing)', 'supreme pu-erh', 'Loose leaf white teas', 'Gyokuro', 'Chai',\
        'Peppermint Tea', 'chamomile','rishi tropical hibiscus', 'organic english breakfast','jasmine dragon pearls']

In [None]:
teas

In [None]:
# Hard-coded survey scores: one inner list per entry of `names`, one value per entry
# of `teas` (that is how the following cell indexes it).
# NOTE(review): 0 presumably means "not rated" -- confirm, because these zeros are
# later added to the ratings data as real scores.
classrate=[]
classrate=[[55,95,25,45,0,25,85,90,0,0,0,0],[5,75,95,20,80,25,85,25,25,0,80,0], [75,95,85,85,55,85,65,0,95,0,0,0], \
[95,35,65,0,0,35,5,0,0,85,0,0],[95,75,65,55,45,55,15,0,0,0,0,0],[45,55,55,15,15,55,65,75,0,0,0,0],\
[95,95,0,15,0,0,95,0,0,0,0,0], [95,95,25,35,0,25,75,0,0,0,95,0], [55,65,85,0,95,75,45,0,0,0,0,0],\
[55,55,75,45,75,75,95,95,0,0,0,0],[35,35,95,95,45,95,95,0,0,0,0,90],[65,65,55,0,15,85,95,0,0,0,0,0],\
[35,75,86,55,70,85,75,85,85,65,40,90]]

In [None]:
# Append the survey scores to the ratings frame.
# All rows are built first and concatenated once; the earlier version called
# pd.concat for every single score, re-copying the whole frame each time.
# NOTE: re-running this cell appends the survey rows again.
assert len(names) == len(classrate), "one score list is required per survey name"
class_rows = [
    {'Score': score, 'Tea Name': teas[tea_pos], 'User Name': person}
    for person, person_scores in zip(names, classrate)
    for tea_pos, score in enumerate(person_scores)
]
class_df = pd.DataFrame(class_rows, columns=['Score', 'Tea Name', 'User Name'])
newdf = pd.concat([newdf, class_df], ignore_index=True)
        
        

In [None]:
# A Reader is still needed, but only the rating_scale parameter is required.
reader = Reader(rating_scale=(0, 100))
algo=SVD()
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(newdf[['User Name', 'Tea Name', 'Score']], reader)

# Baseline: 4-fold cross-validation of NormalPredictor on the ratings data.
cross_validate(NormalPredictor(), data, cv=4)

In [None]:
#SVD is a better predictor, albeit still a bit off
cross_validate(algo, data, cv=4)

In [None]:
from collections import defaultdict
def get_top_n(predictions, n=3):
    '''Collect the n highest-estimated items for every user.

    Args:
        predictions: Iterable of 5-tuples (user id, item id, true rating,
            estimated rating, details), e.g. the Prediction objects returned
            by an algorithm's test method.
        n(int): How many recommendations to keep per user. Default is 3.

    Returns:
        A defaultdict(list) mapping each user (raw) id to a list of at most n
        (raw item id, estimated rating) tuples, highest estimate first.
    '''
    grouped = defaultdict(list)

    # Bucket every (item, estimate) pair under its user.
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        grouped[uid].append((iid, est))

    # Rank each user's bucket by estimate, best first, and keep the top n.
    for uid in grouped:
        ranked = sorted(grouped[uid], key=lambda pair: pair[1], reverse=True)
        grouped[uid] = ranked[:n]

    return grouped

In [None]:
trainset = data.build_full_trainset()
algo.fit(trainset)

In [None]:
#generating predictions for unrated teas based on what users have rated
testset = trainset.build_anti_testset()
predictions = algo.test(testset)



In [None]:
# Keep only the predictions whose user id (first field) is one of the survey names.
want = [prediction for prediction in predictions if prediction[0] in names]

In [None]:
top_n = get_top_n(want, n=3)


In [None]:
recsdf = pd.DataFrame(top_n)

In [None]:
recsdf

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NLTK's English stop words plus corpus-specific filler words and punctuation tokens.
# A comma was missing after 'last', so Python silently fused it with the next
# literal into the single entry 'last()': neither 'last' nor '()' was filtered.
stop_words = stopwords.words('english')
stop_words = stop_words + ['the','i','I','a','of',')','\'', 'to', 'it','and','is','this','for', 'but', 'that', 'in', 'my', 'not','husband',
            'be', 'we', 'are', 'm', 'as', 'just', 'there', 'you','all','with','me', 'few', 'will', 'on','has', 'was','many','last',
              '''()''', "'",'!','.','It',',', '-',':','Thanksgiving','tea','Im','youll','Ive','Its','Also','A','As','This','cant','anybody',
               'go','one','everybody','dont', 'We', 'us', 'got', 'And']

In [None]:
# Add flavor profiles to every tea to allow for a hybrid approach.
# When the customer flavors are not available, the profile is now the all-zero
# profile getProp returns for empty text. The earlier version stored the scalar 0,
# which cannot be expanded into per-category columns alongside dict profiles.
newdictlist = []
totsteareviews = []
for j in tea_list:
    reviewtext, mouthfeel = getReviews(j['Tea Reviews'])
    proportions, adjlist = getProp(reviewtext)

    # The scraper stored this HTML fragment when a tea had no flavor list.
    if j['Tea Flavors'] != '<dd class="empty">Not available':
        flavtext = j['Tea Flavors']
    else:
        flavtext = ''
    custproportions, flavs = getProp(flavtext)

    dict2 = {
        'Flavor Profile Cust': custproportions,
        'Review Adj': adjlist + flavs,
        'Tea Name': j['Tea Name'],
        'Mouthfeel': mouthfeel,
        'Flavor Profile Reviews': proportions,
        'Reviews Supertext': reviewtext,
    }
    totsteareviews.append(reviewtext)
    newdictlist.append(dict2)

In [None]:
"""#saving list as it takes forever for it to run
with open("newdatalist.pkl", 'wb') as picklefile: 
    pickle.dump(newdictlist,picklefile)"""

In [None]:
# Reload the saved per-tea profile list (newdatalist.pkl) into newdictlist instead of
# recomputing it.
# NOTE(review): absolute, user-specific path -- this cell only works on the author's machine.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('/Users/deven/Documents/pickleddata/projectfletcher/newdatalist.pkl', 'rb') as picklefile:
    newdictlist = pickle.load(picklefile)

In [None]:
newteaprofiledf = pd.DataFrame(newdictlist)

In [None]:
#flavor profile Cust is the most accurate
newteaprofiledf.head()

## Hybrid Model

Creating a linear regression model to predict the 'actual' rating of teas, to counter the 'cold start' problem in collaborative recommendation systems.

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was removed from scikit-learn; the same helpers live
# in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

In [None]:
#initializing the get functions to find nearest points
def Rec(labels, clstr,cust):
    """Distance from a customer profile to every tea in one cluster.

    Reads the module-level ``playset`` frame, whose rows are assumed to be in
    the same order as ``labels``. Distances are computed in one vectorised
    call and returned as plain floats.

    Args:
        labels: Cluster label for each row of ``playset``.
        clstr: The cluster to search.
        cust: Customer flavor vector with one value per ``playset`` column.

    Returns:
        list of (index, distance) tuples, one per tea in the cluster, in
        ``playset`` row order (not sorted by distance).
    """
    member_positions = [pos for pos, label in enumerate(labels) if label == clstr]
    if not member_positions:
        # Nothing in this cluster; euclidean_distances rejects empty input.
        return []
    members = playset.iloc[member_positions, :]
    distances = euclidean_distances(members, [cust]).ravel()
    return [(idx, float(dist)) for idx, dist in zip(members.index, distances)]
def getTeaNames(tearec, n=3):
    """Return the names of the teas with the smallest distances.

    Args:
        tearec: Iterable of (index, distance) pairs as produced by ``Rec``.
            The index is used as a row position into the module-level ``teadf``.
        n (int): How many teas to return. Defaults to 3, the previously
            hard-coded value.

    Returns:
        The 'Tea Name' values of the ``n`` closest teas, nearest first.
    """
    closest = sorted(tearec, key=lambda pair: pair[1])[:n]
    positions = [pair[0] for pair in closest]
    return teadf.iloc[positions, :]['Tea Name']

In [None]:
# Display the per-user recommendations (`top_n[]` with empty brackets is a syntax error).
top_n

In [None]:
# Experimental: for each recommended tea, try to compute its average flavor-profile
# distance to the teas the user has already rated. Marked by the author as needing a rethink.
# NOTE(review): issues visible in this cell, to be confirmed before relying on it:
#   * userrecs is already the 'Tea Name' column, yet it is indexed with ['Tea Name'] again below;
#   * the innermost loop reuses `i`, overwriting the user id of the outer loop;
#   * avetearate is reset on every pass of the `k` loop, so results do not accumulate;
#   * flag is set to 1 after the first pass and never reset, so later passes break
#     out as soon as the length check is true;
#   * euclidean_distances receives selections of the 'Flavor Profile Reviews' column
#     rather than numeric 2-D arrays.
avetearate = []
tearate = 0
count=0
flag=0
for i in top_n:
    for k in top_n[i]:
        userrecs = newdf[newdf['User Name']==i]['Tea Name']
        avetearate=[]
        if len(userrecs) <5:
            if flag==1:
                break
            count=0
            for i in userrecs['Tea Name']:
                eudis=(euclidean_distances(newteaprofiledf[newteaprofiledf['Tea Name']==i]['Flavor Profile Reviews'], newteaprofiledf[newteaprofiledf['Tea Name']==k[0]]['Flavor Profile Reviews']))
                tearate +=eudis
                count+=1
            avetearate.append((tearate/count))
            tearate=0
        flag=1

In [None]:
df1 = newteaprofiledf[['Tea Name', 'Flavor Profile Cust']]

In [None]:
hybrid1 = pd.merge(newdf,df1,how='inner')

In [None]:
# Pull the 'Flavor Profile Cust' entries out of hybrid1 into a plain list.
newcols = [profile for profile in hybrid1['Flavor Profile Cust']]

In [None]:
inter = pd.DataFrame(newcols)
print(len(inter))
print(len(hybrid1))

In [None]:
hybrid1 = pd.concat([hybrid1, inter], axis=1)

In [None]:
hybrid1.drop('Flavor Profile Cust', inplace=True, axis=1)

In [None]:
hybrid1.head()

In [None]:
# Rounded collaborative-filtering estimate for every (user, tea) row of hybrid1.
algopredicts = [
    round(algo.predict(user_name, tea_name).est)
    for tea_name, user_name in zip(hybrid1['Tea Name'], hybrid1['User Name'])
]

In [None]:
hybrid1['Algo']=algopredicts

In [None]:
hybrid1.sample(10)

In [None]:
hybrid1['Algo'] = preprocessing.scale(hybrid1['Algo'])

In [None]:
hybrid1['Score'] = preprocessing.scale(hybrid1['Score'])

In [None]:
y = hybrid1['Score']
X = hybrid1.drop(['Tea Name','User Name','Score'], axis=1)

In [None]:
lr = LinearRegression()
lg = LogisticRegression()

In [None]:
#hybriddf = pd.DataFeame(predictions)

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(X,y, test_size=.3, random_state=8)

In [None]:
lr.fit(xtrain,ytrain)

In [None]:
#lg.fit(xtrain,ytrain)

In [None]:
lr.score(xtest, ytest)

In [None]:
est =  ElasticNetCV(l1_ratio = .15, cv=20, n_alphas= 200)
est.fit(xtrain,ytrain)
est.score(xtest,ytest)

In [None]:
# Root-mean-squared error of the elastic-net predictions on the test split, then exp() of it.
# NOTE(review): exp(rmse) reads as a multiplicative error only when the target is on a
# log scale; here the score was standardised with preprocessing.scale -- confirm this is meaningful.
rms = math.sqrt(mean_squared_error(ytest, est.predict(xtest)))
math.exp(rms)

In [None]:
def diagnostic_plot(x, y, model=None):
    """Plot regression diagnostics: residuals vs. predictions and a normal Q-Q plot.

    Changes from the previous version: the figure has two panels instead of a
    1x3 grid whose first panel was left empty, and the model can be passed in.

    Args:
        x: Feature matrix to predict on.
        y: True target values aligned with ``x``.
        model: Fitted estimator with a ``predict`` method. Defaults to the
            module-level ``lr``, which was the only model used previously.
    """
    fitted = lr if model is None else model
    fig, (ax_res, ax_qq) = plt.subplots(1, 2, figsize=(14, 5))

    pred = fitted.predict(x)
    res = y - pred
    ax_res.scatter(pred, res)
    ax_res.set_title("Residual plot")
    ax_res.set_xlabel("prediction")
    ax_res.set_ylabel("residuals")

    # Probability plot of the residuals against the quantiles of a normal distribution.
    stats.probplot(res, dist="norm", plot=ax_qq)
    ax_qq.set_title("Normal Q-Q plot")
In [None]:
#Checking residuals and quantile plots
#diagnostic_plot(xtest, ytest)

## Doc2Vec

Experimenting with Doc2vec to see if there is any relation between tea reviews based on tea type.

In [None]:
import gensim
import os
import collections
import smart_open
import random


In [None]:
itemdf.head()

In [None]:
train=[]
test=[]
words=''
def read_corpus(fname, tokens_only=False, max_tokens=50):
    """Yield tokenised documents from a collection of review lists.

    Changes from the previous version: each training document is tagged with
    its position in the generated corpus. Tags used to restart at 0 for every
    tea, so they collided, and they did not match the list index that is later
    used to look a document up by its tag. Every review is also tokenised
    once instead of twice.

    Args:
        fname: Iterable whose items are iterables of review strings.
        tokens_only (bool): If True, yield plain token lists for every review
            (for inference). Otherwise yield TaggedDocument objects.
        max_tokens (int): In training mode, only reviews with fewer tokens
            than this are kept. Defaults to 50, the previously hard-coded value.

    Yields:
        list of str, or gensim.models.doc2vec.TaggedDocument.
    """
    next_tag = 0
    for reviews in fname:
        for line in reviews:
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            elif len(tokens) < max_tokens:
                # Tag == index of this document in list(read_corpus(...)).
                yield gensim.models.doc2vec.TaggedDocument(tokens, [next_tag])
                next_tag += 1

In [None]:
len(gensim.utils.simple_preprocess(itemdf['Tea Reviews'][0][0]))

In [None]:
# 80/20 split of the review lists at a single cut point.
# Slicing the tail with a negative rounded size returned the WHOLE series when
# that size rounded to 0, and the two independently rounded sizes could
# disagree; one cut point guarantees disjoint, complete pieces.
split_at = round(len(itemdf)*.8)
train = itemdf['Tea Reviews'][:split_at]
test = itemdf['Tea Reviews'][split_at:]

In [None]:
train[0]

In [None]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=55)

In [None]:
# Materialise the corpora, build the vocabulary and train the model.
# The earlier version stopped after build_vocab, so the vectors queried in the
# next cell came from an untrained model.
train_corpus = list(read_corpus(train))
test_corpus = list(read_corpus(test, tokens_only=True))
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Pick a random document from the test corpus and infer a vector from the model
# (no random seed is set, so a different document is chosen on every run).
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
# NOTE(review): train_corpus[...] is indexed with the document tag returned in sims, so
# this presumes each tag equals the document's position in train_corpus -- confirm.
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

In [None]:
# Saving is disabled; instead the previously saved model is loaded from disk, replacing
# the `model` object built in the cells above.
#model.save('teadocmodel.bin')
model = gensim.models.doc2vec.Doc2Vec.load('teadocmodel.bin')

## Doc2Vec with Books

Using doc to vec to see if I can recommend books based on tea flavor profiles. 

In [None]:
books = nltk.corpus.gutenberg.fileids()

In [None]:
# Display titles for the NLTK Gutenberg sample, positionally aligned with
# nltk.corpus.gutenberg.fileids().
# Fixes: two missing commas had fused neighbouring titles into single strings;
# 'The Ball and the Cross' was listed twice where the third Chesterton file is
# chesterton-thursday.txt; and the bible-kjv.txt entry (4th file id) was missing,
# which shifted every later title by one.
# NOTE(review): verify the order against nltk.corpus.gutenberg.fileids() on your install.
bookt = [
    'Emma by Jane Austen',
    'Persuassion by Jane Austen',
    'Sense and Sensibility by Jane Austen',
    'The King James Bible',
    'Poems by William Blake',
    'The Little People of the Snow by William Bryant',
    'The Adventures of Buster Bear by Thornton Burgress',
    'Alice in Wonderland by Lewis Carroll',
    'The Ball and the Cross by G.K. Chesterton',
    'The Wisdom of Father Brown by G.K. Chesterton',
    'The Man Who Was Thursday by G.K. Chesterton',
    'The Parents Assistant by Maria Edgeworth',
    'Moby Dick by Herman Melville',
    'Paradise Lost by John Milton',
    'Shakespeares Works',
    'Shakespeares Works',
    'Shakespeares Works',
    'Leaves of Grass by Walt Whitman',
]

In [None]:
# Map each Gutenberg file id to its display title (pairs are matched by position).
beat = dict(zip(books, bookt))

In [None]:
doclen = []
train = []
def read_corpus1(fname, tokens_only=False):
    """Tokenise a collection of books into Doc2Vec documents.

    Changes from the previous version:
      * A book given as one string was iterated character by character, so
        every "document" was a single character. Strings are now split into
        sentences first (nltk.sent_tokenize, which needs the NLTK 'punkt'
        data); an iterable of strings is used as given.
      * Tags are running indices over the whole corpus instead of restarting
        at 0 for each book, so a tag identifies one document and can be
        mapped back to its book through the cumulative per-book counts.
      * The per-book entry is the number of documents kept for that book
        (previously the last enumerate index).
      * Results are collected in local lists, so repeated calls no longer
        append to the module-level ``train`` / ``doclen``.

    Args:
        fname: Iterable of books; each book is a string or an iterable of strings.
        tokens_only (bool): If True, collect plain token lists; otherwise
            collect TaggedDocument objects for documents under 50 tokens.

    Returns:
        tuple: ``(documents, counts)`` -- the collected documents and the
        number of documents contributed by each book, in input order.
    """
    documents = []
    counts = []
    for book in fname:
        lines = nltk.sent_tokenize(book) if isinstance(book, str) else book
        kept = 0
        for line in lines:
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                documents.append(tokens)
                kept += 1
            elif len(tokens) < 50:
                # For training data, add tags: tag == position in `documents`.
                documents.append(gensim.models.doc2vec.TaggedDocument(tokens, [len(documents)]))
                kept += 1
        counts.append(kept)
    return documents, counts

In [None]:
bookrec = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=55)

In [None]:
# One string per Gutenberg file: the file's words joined with single spaces.
bookwords = [' '.join(nltk.corpus.gutenberg.words(fileid)) for fileid in books]

In [None]:
btrain,doclen = list(read_corpus1(bookwords))
test_corpus = newteaprofiledf['Review Adj'].values

In [None]:
'''with open("doclen.pkl", 'wb') as picklefile: 
    pickle.dump(doclen,picklefile)'''

In [None]:
with open('/Users/deven/Documents/pickleddata/projectfletcher/btrain.pkl', 'rb') as picklefile:
    btrain = pickle.load(picklefile)

In [None]:
bookrec.build_vocab(btrain)

In [None]:
bookrec = gensim.models.doc2vec.Doc2Vec.load('/Users/deven/Documents/pickleddata/projectfletcher/bookrec.bin')

In [None]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = bookrec.infer_vector(test_corpus[doc_id])
sims = bookrec.docvecs.most_similar([inferred_vector])

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % bookrec)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2)]:
    print(u'%s %s: \n' % (label, sims[index]))

In [None]:
# Map the best-matching document back to its Gutenberg file.
# The earlier version compared the integer tag with a TaggedDocument's tag list
# (never equal), leaving `rec` undefined at the print. It also indexed the file
# ids with ind-1, which selects the previous book (and the last book for ind == 0).
# This mirrors getBookrec: walk the cumulative document counts until the tag falls
# inside a book's range.
# NOTE(review): assumes tags are running indices over the corpus and doclen holds
# per-book document counts -- confirm for the pickled btrain/doclen.
rec = None
tot = 0
best_tag = sims[0][0]
for ind, n_docs in enumerate(doclen):
    tot += n_docs
    if best_tag < tot:
        rec = nltk.corpus.gutenberg.fileids()[ind]
        break
print(rec)

In [None]:
def getBookrec(iid):
    """Recommend a book title for a tea, based on the tea's review words.

    The tea's 'Review Adj' word list is embedded with the ``bookrec`` Doc2Vec
    model; the most similar training document's tag is mapped to a book by
    walking the cumulative per-book document counts in ``doclen``.

    Changes from the previous version: the title is taken from ``bookt[ind]``
    (``ind-1`` selected the previous book, and the last book for ``ind == 0``),
    and a tea missing from ``newteaprofiledf`` yields an empty recommendation
    instead of raising IndexError.

    Args:
        iid: Tea name to look up in ``newteaprofiledf['Tea Name']``.

    Returns:
        tuple: A one-element tuple ``(title,)``; the title is '' when no book
        could be determined. The tuple shape is kept for existing callers.
    """
    matches = newteaprofiledf[newteaprofiledf['Tea Name']==iid]['Review Adj'].values
    if len(matches) == 0:
        return ('',)
    inferred_vector = bookrec.infer_vector(matches[0])
    sims = bookrec.docvecs.most_similar([inferred_vector])
    rec = ''
    tot = 0
    for ind, n_docs in enumerate(doclen):
        tot += n_docs
        if sims[0][0] < tot:
            # Guard against a title list shorter than the list of books.
            if ind < len(bookt):
                rec = bookt[ind]
            break
    return (rec,)

In [None]:
# For every survey participant, recommend a book from their top-ranked tea.
bookreclist = [getBookrec(top_n[person][0][0]) for person in names]

In [None]:
print(bookreclist)
print(names)

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go

In [None]:
# Collect the first, second and third recommended tea id for each name.
# Iterate over `names` rather than the dict's own key order so these lists line
# up row-for-row with `names` and `bookreclist`, which are displayed side by
# side in the results table.
frec=[]
srec=[]
trec=[]
for name in names:
    ranked = top_n[name]
    frec.append(ranked[0][0])
    srec.append(ranked[1][0])
    trec.append(ranked[2][0])

In [None]:
sims

In [None]:
#bookrec.save('bookrec.bin')


In [None]:
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
# Plotly credentials are read from the environment so the API key never lives
# in the notebook. Set PLOTLY_API_KEY (and optionally PLOTLY_USERNAME) before
# starting the kernel; a missing key raises KeyError here rather than failing
# later during upload.
# NOTE(review): any key that was ever committed in this notebook should be
# treated as leaked and revoked.
import os
plotly.tools.set_credentials_file(
    username=os.environ.get('PLOTLY_USERNAME', 'djmorcode'),
    api_key=os.environ['PLOTLY_API_KEY'],
)

In [None]:
# Five-column table: name, three tea recommendations, and the book recommendation
trace = go.Table(
    header={
        'values': ['Name', 'Tea Rec 1', 'Tea Rec 2', 'Tea Rec 3', 'Book Recommendation'],
        'line': {'color': '#7D7F80'},
        'fill': {'color': '#a1c3d1'},
        'align': ['left', 'left', 'left', 'left', 'left'],
    },
    cells={
        'values': [names, frec, srec, trec, bookreclist],
        'line': {'color': '#7D7F80'},
        'fill': {'color': '#EDFAFF'},
        'align': ['left', 'left', 'left', 'left', 'left'],
    },
)

layout = {'width': 1000, 'height': 800}
data = [trace]
fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='styled_table')

In [None]:
top_n

In [None]:
print(bookreclist)
print(names)

## LDA Books

Trying the recommendation system with LDA for recommending books. **Work in progress.**

In [None]:
from gensim import corpora, models, similarities, matutils

In [None]:
# Unigram + bigram counts; the vocabulary is learned from the book texts only
cv = CountVectorizer(stop_words=stop_words, ngram_range=(1, 2)).fit(bookwords)
# Express the tea reviews in that same book-derived vocabulary
teawords = cv.transform(newteaprofiledf['Review Adj'])

In [None]:
# Vectorise the book texts with the fitted vectorizer, then transpose the
# result so that terms are the rows and documents are the columns
counts = cv.transform(bookwords).transpose()

In [None]:
# Convert sparse matrices of counts to gensim corpora.
# Sparse2Corpus reads each COLUMN as one document by default. `counts` was
# already transposed to terms x documents; `teawords` comes straight from
# cv.transform (documents x terms), so it is transposed here to match —
# otherwise every vocabulary term would be treated as a document.
corpus = matutils.Sparse2Corpus(counts)
compareset = matutils.Sparse2Corpus(teawords.transpose())

In [None]:
# Map column index -> term for later use by the LDA model.
# The mapping must come from `cv`, the vectorizer that produced `counts` in
# this section; a vocabulary from any other vectorizer would label the topic
# terms with the wrong words.
id2word = {index: term for term, index in cv.vocabulary_.items()}

In [None]:
# Fit a 5-topic LDA model on the book corpus with 10 passes over the data;
# id2word supplies the term for each feature id, and minimum_probability=0.03
# is passed as the model's probability cut-off.
lda = models.LdaModel(corpus=corpus, num_topics=5, minimum_probability=0.03, id2word=id2word, passes=10)

In [None]:
# NOTE(review): this statement is identical to the one in the preceding cell,
# so running both fits the model twice and keeps only this second fit.
lda = models.LdaModel(corpus=corpus, num_topics=5, minimum_probability=0.03, id2word=id2word, passes=10)

In [None]:
lda.print_topics()

In [None]:
# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus = lda[corpus]
lda_corpus

In [None]:
# Materialise the documents' topic vectors into a list so we can take a peek
lda_docs = list(lda_corpus)

In [None]:
# Check out the document vectors in the topic space for the first 5 documents
# (slice to 5 so the cell does not dump every document)
lda_docs[:5]

## Word2Vec

Trying my hand at Word2Vec to see if it will work for an NLP analysis of reviews.

In [None]:
allwords = nltk.corpus.gutenberg.words()

In [None]:
len(allwords)

In [None]:
def read_corpus(fname):
    """Yield filtered token lists for the short lines of every document.

    Args:
        fname: Iterable of documents, each itself an iterable of text lines.

    Yields:
        For every line whose ``gensim.utils.simple_preprocess`` output has
        fewer than 50 tokens: a list of those tokens that are longer than two
        characters (possibly empty). Lines with 50 or more tokens are skipped.
    """
    for document in fname:
        for line in document:
            # Tokenise once and reuse the result for both the length check
            # and the short-token filter.
            tokens = gensim.utils.simple_preprocess(line)
            if len(tokens) < 50:
                yield [token for token in tokens if len(token) > 2]

In [None]:
# Tokenise the train and test document collections into lists of token lists.
# NOTE(review): `test_corpus` is also assigned in the book-recommendation
# section; this cell rebinds it to a different kind of value.
train_corpus = list(read_corpus(train))
test_corpus = list(read_corpus(test))

In [None]:
len(train_corpus)

In [None]:
# Train Word2Vec on the tokenised training lines: 100-dimensional vectors,
# context window of 5, every word kept (min_count=1), sg=1, 4 worker threads.
# NOTE(review): the following cell rebinds model1 to a model loaded from disk.
model1 = gensim.models.Word2Vec(train_corpus, size=100, window=5, min_count=1, workers=4,sg=1)

In [None]:
# Load the saved word model from the working directory, rebinding model1;
# the save call is kept commented out for reference.
#model1.save('teawordmodel.bin')
model1 = gensim.models.word2vec.Word2Vec.load('teawordmodel.bin')

In [None]:
# Peek at the first few vocabulary entries of the word model from this
# section (`model1`; no `model` is defined here)
list(model1.wv.vocab.items())[:7]


In [None]:
# Vector for a single word, read from this section's model via its keyed vectors
print(model1.wv['bright'])

In [None]:
# Similarity: the 8 words closest to 'mouthfeel' in this section's model
model1.wv.most_similar('mouthfeel', topn=8)

In [None]:
# Similarity between two individual words in this section's model
model1.wv.similarity('green','tea')

In [None]:
# Similarity between two sets of words in this section's model
model1.wv.n_similarity(['bread', 'dog'], ['cat', 'dog'])

In [None]:
# Odd-one-out query against this section's model
model1.wv.doesnt_match("rabbit cow raven turtle".split())

## Creating wrappers

Creating a list of wrappers to copy into a JS file for flask app.

In [None]:
#Giving wrappers a try
import functools


def p_decorate(func):
    """Decorate a one-argument, string-returning function.

    The returned wrapper calls ``func(name)`` and encloses the result as
    ``"{" + result + "},"``. ``functools.wraps`` keeps the wrapped function's
    name and docstring on the wrapper.
    """
    @functools.wraps(func)
    def func_wrapper(name):
        return "{"+func(name)+"},"
    return func_wrapper


@p_decorate
def getval(string):
    """Return ``string`` formatted as a ``value``/``text`` entry, enclosed by ``p_decorate``."""
    return 'value: +{0}+,'.format(string)+'\n'+' text: +{0}+'.format(string)


# getval is already decorated; applying p_decorate a second time would nest
# the enclosure ("{{...},},"), so convert_text is simply the decorated function.
convert_text = getval

In [None]:
def loopit(list1):
    """Return a new list with ``getval`` applied to each element of ``list1``, in order."""
    return [getval(entry) for entry in list1]

In [None]:
uniteas = loopit(itemdf['Tea Name'].unique())

In [None]:
from pprint import pprint
pprint(uniteas)