# Import Packages 


In [1]:
import numpy as np # linear algebra
import spacy
nlp = spacy.load('en_core_web_sm')
import pandas as pd 
import matplotlib.pyplot as plt
from IPython.display import display
import base64
import string
import re
from collections import Counter
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

stopwords = stopwords.words('english')

# Import Dataset 


In [2]:
reviews = pd.read_csv("./wine_reviews.csv", usecols =['points', 'title', 'description', 'variety', 'price'], encoding='latin1')
reviews = reviews.dropna()
reviews = reviews.drop_duplicates()
reviews = reviews.reset_index(drop=True)
reviews['summary'] = ''
reviews.head(15)

Unnamed: 0,description,points,price,title,variety,summary
0,"This is ripe and fruity, a wine that is smooth...",87,15.0,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,
1,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,
2,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,
3,"Much like the regular bottling from 2012, this...",87,65.0,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,
4,Blackberry and raspberry aromas show a typical...,87,15.0,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,
5,"Here's a bright, informal red that opens with ...",87,16.0,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,
6,This dry and restrained wine offers spice in p...,87,24.0,Trimbach 2012 Gewurztraminer (Alsace),GewÃ¼rztraminer,
7,Savory dried thyme notes accent sunnier flavor...,87,12.0,Heinz Eifel 2013 Shine GewÃ¼rztraminer (Rheinh...,GewÃ¼rztraminer,
8,This has great depth of flavor with its fresh ...,87,27.0,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,
9,"Soft, supple plum envelopes an oaky structure ...",87,19.0,Kirkland Signature 2011 Mountain CuvÃ©e Cabern...,Cabernet Sauvignon,


# Text preprocessing

In [3]:
#!python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')
def normalize_text(text):
    tm1 = re.sub('<pre>.*?</pre>', '', text, flags=re.DOTALL)
    tm2 = re.sub('<code>.*?</code>', '', tm1, flags=re.DOTALL)
    tm3 = re.sub('<[^>]+>©', '', tm1, flags=re.DOTALL)
    return tm3.replace("\n", "")

In [7]:
reviews['description_Cleaned_1'] = reviews['description'].apply(normalize_text)

In [8]:
punctuations = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~©'
# Define function to cleanup text by removing personal pronouns, stopwords, and puncuation
def cleanup_text(docs, logging=False):
    texts = []
    doc = nlp(docs, disable=['parser', 'ner'])
    tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
    tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
    tokens = ' '.join(tokens)
    texts.append(tokens)
    return pd.Series(texts)

reviews['Description_Cleaned'] = reviews['description_Cleaned_1'].apply(lambda x: cleanup_text(x, False))

In [9]:
print('Before normalizing text-----\n')
print(reviews['description'][2])
print('\nAfter normalizing text-----\n')
print(reviews['description_Cleaned_1'][2])

Before normalizing text-----

Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.

After normalizing text-----

Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.


In [11]:
def textrank_summarize(text, num_sentences):
    sentences = sent_tokenize(text)

    sentence_matrix = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        sentence_matrix.append(' '.join(words))
    
    vectorizer = CountVectorizer().fit_transform(sentence_matrix)
    sentence_bag_of_words = vectorizer.toarray()
    
    similarity_matrix = cosine_similarity(sentence_bag_of_words)
    
    sentence_graph = nx.from_numpy_array(similarity_matrix)
    
    sentence_ranks = nx.pagerank(sentence_graph)
  
    ranked_sentences = sorted(((sentence_ranks[i], sentence) for i, sentence in enumerate(sentences)), reverse=True)
    summary_sentences = [sentence for rank, sentence in ranked_sentences[:num_sentences]]
    
    summary = ' '.join(summary_sentences)
    return summary



In [12]:
reviews['summary'] = reviews['Description_Cleaned'].apply(lambda x: textrank_summarize(x, 2))


In [13]:
reviews.description.iloc[100]

'Lots of spearmint, coyote mint, hot licorice, ginger snaps and Dr Pepper spice up the strawberry fruit of this wine that provides a very herbal take on the grape. Oregano, marjoram, thyme and dill all make a showing on the sip, against a backbone of sweet cherry and blackberry fruit, finishing on cedar.'

In [14]:
reviews.summary.iloc[100]

'oregano marjoram thyme dill make showing sip backbone sweet cherry blackberry fruit finish cedar . lot spearmint coyote mint hot licorice ginger snap dr pepper spice strawberry fruit wine provide herbal take grape .'

In [15]:
reviews.to_csv('dataset_with_summary.csv')

In [135]:
data = reviews['summary'].astype(str).tolist()

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [136]:
max_epochs = 10
vec_size = 100
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9


In [137]:
test_data = word_tokenize("wine stainless steel ferment . tart snappy flavor lime flesh rind dominate .")

In [159]:
inferred_vector = model.infer_vector(test_data)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

print('Test Document : «{}»\n'.format(' '.join(test_data)))
for label, index in [('TOP-1', 0), ('TOP-2', 1), ('TOP-3', 2), ('TOP-4', 3), ('TOP-5', 4)]:
    print(u'%s\nTitle: %s\nPoint: %s\nVariety: %s\nPrice: %s \n' % (label, reviews.title.iloc[int(sims[index][0])], reviews.points.iloc[int(sims[index][0])], reviews.variety.iloc[int(sims[index][0])], reviews.price.iloc[int(sims[index][0])],))

Test Document : «wine stainless steel ferment . tart snappy flavor lime flesh rind dominate .»

TOP-1
Title: Rainstorm 2013 Pinot Gris (Willamette Valley)
Point: 87
Variety: Pinot Gris
Price: 14.0 

TOP-2
Title: Lucie 2015 Dutton Ranch Widdoes Vineyard Pinot Noir (Russian River Valley)
Point: 92
Variety: Pinot Noir
Price: 60.0 

TOP-3
Title: Federico Paternina 2007 Banda Azul Crianza Red  (Rioja)
Point: 81
Variety: Tempranillo Blend
Price: 10.0 

TOP-4
Title: Coopers Creek 2013 Pinot Noir (Hawke's Bay)
Point: 85
Variety: Pinot Noir
Price: 16.0 

TOP-5
Title: Caves Velhas 2011 Magna Carta Reserva Red (Alentejano)
Point: 87
Variety: Portuguese Red
Price: 17.0 



In [139]:
model.save("d2v2.model")
