In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression

%matplotlib inline

# The User Functions

The following functions will be used to customize the parse, pos, stop, stem process necessary for text analysis. These are done using the NLTK package, customized to remove certain words and symbols, and handle synonyms.

In [2]:
# my_analyzer replaces both the preprocessor and tokenizer
# it also replaces stop word removal and ngram constructions

def my_analyzer(s):
    # Synonym List
    syns = {'veh': 'vehicle', 'car': 'vehicle', 'chev':'cheverolet', \
              'chevy':'cheverolet', 'air bag': 'airbag', \
              'seat belt':'seatbelt', "n't":'not', 'to30':'to 30', \
              'wont':'would not', 'cant':'can not', 'cannot':'can not', \
              'couldnt':'could not', 'shouldnt':'should not', \
              'wouldnt':'would not'}
    
    # Preprocessing String s
    s = s.lower()
    s = s.replace(',', '. ')
    # Tokenize
    tokens = word_tokenize(s)
    tokens = [word.replace(',', '') for word in tokens]
    tokens = [word for word in tokens if ('*' not in word) and \
              ("''" != word) and ("``" != word) and \
              (word!='description') and (word !='dtype') \
              and (word != 'object') and (word!="'s")]
    
    #Map Synonyms
    for i in range(len(tokens)):
        if tokens[i] in syns:
            tokens[i] = syns[tokens[i]]
            
    #Remove Stop Words
    punctuation = list(string.punctuation) + ['..', '...']
    pronouns = ['i', 'he', 'she', 'it', 'him', 'they', 'we', 'us', 'them']
    stop = stopwords.words("english") + punctuation + pronouns
    filtered_terms = [word for word in tokens if (word not in stop) and \
                     (len(word) > 1) and (not word.replace('.', '', 1).isnumeric()) \
                     and (not word.replace("'", '', 2).isnumeric())]
    
    #Lemmatization & Stemming - Stemming with WordNet POS
    # Since lemmatization requires POS need to set POS
    tagged_words = pos_tag(filtered_terms, lang='eng')
    # Stemming with for terms without WordNet POS
    stemmer = SnowballStemmer("english")
    wn_tags = {'N':wn.NOUN, 'J':wn.ADJ, 'V':wn.VERB, 'R':wn.ADV}
    wnl = WordNetLemmatizer()
    stemmed_tokens = []
    for tagged_word in tagged_words:
        term = tagged_word[0]
        pos = tagged_word[1]
        pos = pos[0]
        try:
            pos = wn_tags[pos]
            stemmed_tokens.append(wnl.lemmatize(term, pos=pos))
        except:
            stemmed_tokens.append(stemmer.stem(term))
    
    return stemmed_tokens

# Further Customization of Stopping and Stemming using NLTK
def my_preprocessor(s):
    s.lower()
    s = s.replace(',', '. ')
    print('preprocessor')
    return s

def my_tokenizer(s):
    # Tokenize
    print('Tokentizer')
    tokens = word_tokenize(s)
    tokens = [word.replace(',','') for word in tokens ]
    tokens = [word for word in tokens if word.find('*')!=True and \
              word != "''" and word !="``" and word!='description' \
              and word !='dtype']
    return tokens

# Read Document
The following code reads the document and places its contents into a string california_chardonnay.

In [3]:
# Increase Pandas column width to let pandas read large text columns
pd.set_option('max_colwidth', 32000)
# Import data
df = pd.read_excel("CaliforniaCabernet.xlsx")

df.head()

In [None]:
# Setup simple constants
n_docs     = len(df)
n_samples  = n_docs
m_features = 100
s_words    = 'english'
ngram = (1,2)

# Setup reviews in list 'discussions'
discussions = df['description'].tolist()

# Latent Dirichlet Analysis

In [4]:
%%time
# LDA For Term Frequency x Doc Matrix
n_topics        = 9
max_iter        =  5
learning_offset = 20.
learning_method = 'online'

# LDA for TF-IDF x Doc Matrix
# First Create Term-Frequency/Inverse Doc Frequency by Review Matrix

tfidf_vect = TfidfVectorizer(max_df=0.95, min_df=2, max_features=m_features,\
                             analyzer=my_analyzer, ngram_range=ngram)
tf_idf = tfidf_vect.fit_transform(discussions)
print("\nTF_IDF Vectorizer Parameters\n", tfidf_vect, "\n")

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,\
                                learning_method=learning_method, \
                                learning_offset=learning_offset, \
                                random_state=12345)

lda.fit(tf_idf)
print('{:.<22s}{:>6d}'.format("Number of Reviews", tf_idf.shape[0]))
print('{:.<22s}{:>6d}'.format("Number of Terms",     tf_idf.shape[1]))
print("\nTopics Identified using LDA with TF_IDF")
tf_features = tfidf_vect.get_feature_names()
max_words = 15
for topic_idx, topic in enumerate(lda.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([tf_features[i]
                             for i in topic.argsort()[:-max_words - 1:-1]])
        print(message)
        print()


TF_IDF Vectorizer Parameters
 TfidfVectorizer(analyzer=<function my_analyzer at 0x000001B816A6BAE8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=0.95,
        max_features=100, min_df=2, ngram_range=(1, 2), norm='l2',
        preprocessor=None, smooth_idf=True, stop_words=None,
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None) 

Number of Reviews..... 13135
Number of Terms.......   100

Topics Identified using LDA with TF_IDF
Topic #0: licorice mountain hard style elegant classic tannin wine could currant blackberry flavor year black make

Topic #1: one vintage best age new oak wine fruit tannin flavor rich fine blackberry cabernet year

Topic #2: palate nose pepper black aroma plum wine full-bodied dark fruit finish tobacco blueberry cherry flavor

Topic #3: sweet like flavor blackberry taste ch

In [5]:
prob = lda.transform(tf_idf)

df['class'] = np.argmax(prob, axis=1)

Unnamed: 0,Review,description,year,points,price,winery,Region,class
0,1,"This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.",,96,235.0,Heitz,Napa,8
1,17,"This blockbuster, powerhouse of a wine suggests blueberry pie and chocolate as it opens in the glass. On the palate, it's smooth and seductively silky, offering complex cedar, peppercorn and peppery oak seasonings amidst its dense richness. It finishes with finesse and spice.",,95,325.0,Hall,Napa,2
2,48,"Blended with 9% Malbec, 9% Cabernet Franc and 5% Petit Verdot, this is a perennial classic for the winery, the sister brand of Cuvaison. Juicy in cherry and cassis, it sustains big, pillowy tannins and tar, suggesting more time for the fruit to match up with the structure. Drink through 2020.",,90,60.0,Brandlin,Napa,6
3,68,"From the producer's monumental Atlas Peak vineyard, this is a tightly wound, solidly constructed mountain Cab, blended with a handful of Petit Verdot. Tobacco, black tea and a sliver of coconut intermingle around a medium-bodied whole that will benefit from cellaring, through 2021.",,91,85.0,Michael Mondavi Family Estate,Napa,7
4,70,"A juiciness of cherry and vanilla spark the opening of this wine, a celebration of the vintage, appellation and in this case, fruit-forwardness of the variety. With a backbone of oak and cedar, it has smooth tannins and medium weight, finishing in mocha chocolate. Drink now through 2022.",,91,60.0,Provenance Vineyards,Napa,4
5,71,"Sweetened tannins highlight a depth of chocolate and blackberry in this 100% varietal wine made in small amounts. Soft and juicy, it crafts a balance of richness and density around elegant spicy clove and cinnamon.",,91,85.0,Raymond,Napa,0
6,79,"This wine over-delivers in quality for the price. It's made from mostly Cabernet Sauvignon, with handfuls of Merlot, Malbec, Petit Verdot and Cabernet Franc blended in to good effect. The whole combines into layers of chocolate and graham cracker, dappled in vanilla and jam. Thick tannins suggest a swirl or two of the glass. The finish is all tobacco.",,91,28.0,B Side,Napa,7
7,88,"From a multiplicity of vineyards, this wine works and is a tremendous value for the price. It mixes hot mulled-cider spices, cinnamon, clove and pine forest, conveying surprising depth. Plum, dark cherry and more clove lead to a powerful, leathery finish.",,91,22.0,Eagle Glen,Napa,2
8,91,"Bright, light oak shadings dress up this medium-bodied wine, complementing the red cherry and strawberry flavors. It's fresh, fruity and not very tannic—easy to drink and enjoy.",,86,10.0,Belle Ambiance,California Other,5
9,98,"Blended with small amounts of Merlot, Cabernet Franc, Petit Verdot and Malbec, this is an austere, unfussy red made in modest quantities that has crisp red currant and cranberry flavors interwoven with cedar and dried herb.",,86,36.0,Tin Barn,Sonoma,6


In [24]:
df['class'].value_counts()

5    2099
3    2077
8    1692
2    1501
6    1289
7    1213
0    1212
4    1122
1     930
Name: class, dtype: int64

# Predict the Price

In [37]:
df = df[df.price.notnull()]

X = df.drop(['description', 'winery', 'Review', 'price'], 1)
X.Region = X.Region.astype("category")
X.year = X.year.astype("category")
X['class'] = X['class'].astype('category')
X = pd.get_dummies(X)

y = df.price

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

params = {'alpha' : np.logspace(-3, 3, 10)}

model = Ridge(fit_intercept = True)

ridge_cv = GridSearchCV(model, param_grid = params, cv = 10, scoring = 'neg_mean_squared_error')

ridge_cv.fit(X, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [38]:
ridge_cv.cv_results_

{'mean_fit_time': array([ 0.01450603,  0.01744399,  0.01875193,  0.01791725,  0.02036328,
         0.01985409,  0.01814232,  0.0193831 ,  0.01846106,  0.01586735]),
 'mean_score_time': array([ 0.00030172,  0.00010028,  0.00156271,  0.00060148,  0.00099387,
         0.00090184,  0.00060103,  0.00166306,  0.00070076,  0.00010009]),
 'mean_test_score': array([-1052.67809119, -1052.67698801, -1052.67201631, -1052.65175173,
        -1052.5940549 , -1052.51379675, -1052.53318273, -1053.54096711,
        -1057.46612119, -1073.37115356]),
 'mean_train_score': array([-1034.43902176, -1034.43912989, -1034.44138137, -1034.48306646,
        -1034.98058625, -1037.05065385, -1039.52376478, -1042.23480696,
        -1048.3472776 , -1066.73093976]),
 'param_alpha': masked_array(data = [0.001 0.0046415888336127772 0.021544346900318832 0.10000000000000001
  0.46415888336127775 2.154434690031882 10.0 46.415888336127729
  215.44346900318823 1000.0],
              mask = [False False False False False False