# Wine Region Topic Modeling
### Text Analytics Final Project

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

import re

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

In [2]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

## Reading in Data File

In [3]:
df1 = pd.read_csv('winemag_fin.csv', index_col = 0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
df = df1

In [5]:
df = df.drop_duplicates()[['region_1','country','description','points','price']].dropna()
df = df.rename(columns = {'region_1': 'region'})                      

In [6]:
df.region.value_counts().head(30)

Napa Valley                8285
Columbia Valley (WA)       7094
Russian River Valley       5142
California                 4692
Mendoza                    4307
Paso Robles                4000
Willamette Valley          3372
Alsace                     2712
Rioja                      2578
Finger Lakes               2350
Sonoma Coast               2344
Sonoma County              2317
Champagne                  2015
Toscana                    1990
Brunello di Montalcino     1985
Carneros                   1834
Barolo                     1818
Dry Creek Valley           1759
Walla Walla Valley (WA)    1750
Santa Barbara County       1732
Yakima Valley              1672
Sicilia                    1644
Sta. Rita Hills            1538
Alexander Valley           1444
Chianti Classico           1418
Lodi                       1368
Santa Lucia Highlands      1354
Santa Ynez Valley          1327
Central Coast              1299
Ribera del Duero           1228
Name: region, dtype: int64

## Removing Outliers and Grouping Data by Region

In [7]:
Q1 = np.percentile(df['price'], 10,
                   interpolation = 'midpoint')
 
Q3 = np.percentile(df['price'], 90,
                   interpolation = 'midpoint')
IQR = Q3 - Q1
df = df[df.price < (Q3+1.5*IQR)+1]

Q1 = np.percentile(df['points'], 5, interpolation = 'midpoint')#
 
Q3 = np.percentile(df['points'], 95, interpolation = 'midpoint')
IQR = Q3 - Q1
df = df[df.points < (Q3+1.5*IQR)+1]

In [8]:
data1 = pd.DataFrame(df.region.value_counts())
data1.columns = ['counts']
df = df.groupby('region').agg({'country':'first','description':lambda x: ' '.join(x), 
                         'points':'mean', 
                         'price':'mean'})
df = df.join(data1)
df = df.reset_index()

In [9]:
counts_filter = df['counts'] > 1000
df = df[counts_filter]
df.sort_values(by='counts', ascending=False).head(20)

Unnamed: 0,region,country,description,points,price,counts
767,Napa Valley,US,"Soft, supple plum envelopes an oaky structure ...",88.632009,46.891501,7954
292,Columbia Valley (WA),US,"Aromas of cranberry, barrel spice and herb are...",88.680671,27.620753,7093
938,Russian River Valley,US,This wine is put together from multiple vineya...,89.274407,39.859588,5142
164,California,US,"Very deep in color and spicy-smoky in flavor, ...",85.222341,15.392027,4691
657,Mendoza,Argentina,Raw black-cherry aromas are direct and simple ...,86.184315,20.896672,4297
822,Paso Robles,US,This wine from the Geneseo district offers aro...,87.172966,32.451314,3995
1273,Willamette Valley,US,"Tart and snappy, the flavors of lime flesh and...",88.927003,34.321068,3370
23,Alsace,France,This dry and restrained wine offers spice in p...,89.563031,31.750832,2705
907,Rioja,Spain,"Slightly foxy on the nose, with rubbery aromas...",87.306342,27.596729,2507
453,Finger Lakes,US,A wisp of bramble extends a savory tone from n...,86.866809,20.109362,2350


In [34]:
napa_filter = df['region'] == 'Napa Valley'
napa = df[napa_filter]
napa_docs = napa['description'].tolist()

willanmetteValley_filter = df['region'] == 'Willamette Valley'
willanmetteValley = df[willanmetteValley_filter]
willanmetteValley_docs = willanmetteValley['description'].tolist()

fingerlakes_filter = df['region'] == 'Finger Lakes'
fingerlakes = df[fingerlakes_filter]
fingerlakes_docs = fingerlakes['description'].tolist()

alsace_filter = df['region'] == 'Alsace'
alsace = df[alsace_filter]
alsace_docs = alsace['description'].tolist()

mendoza_filter = df['region'] == 'Mendoza'
mendoza = df[mendoza_filter]
mendoza_docs = mendoza['description'].tolist()

columbiaValley_filter = df['region'] == 'Columbia Valley (WA)'
columbiaValley = df[columbiaValley_filter]
columbiaValley_docs = columbiaValley['description'].tolist()

toscana_filter = df['region'] == 'Toscana'
toscana = df[toscana_filter]
toscana_docs = toscana['description'].tolist()

rioja_filter = df['region'] == 'Rioja'
rioja = df[rioja_filter]
rioja_docs = rioja['description'].tolist()

# Topic Modeling

## Napa Valley (CA)

In [35]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['flavor','wine','cherry', 'fruit'])

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(napa_docs)):
    napa_docs[idx] = napa_docs[idx].lower()  # Convert to lowercase.
    napa_docs[idx] = tokenizer.tokenize(napa_docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
napa_docs = [[token for token in doc if not token.isnumeric()] for doc in napa_docs]
    
# Remove stopwords.
napa_docs = [[token for token in doc if token not in stop_words] for doc in napa_docs]

# Remove words that are only one character.
napa_docs = [[token for token in doc if len(token) > 1] for doc in napa_docs]

# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
napa_docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in napa_docs]

# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(napa_docs, min_count=10)
for idx in range(len(napa_docs)):
    for token in bigram[napa_docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            napa_docs[idx].append(token)
            
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(napa_docs)
corpus = [dictionary.doc2bow(doc) for doc in napa_docs]

sort_token = sorted(dictionary.items(),key=lambda k:k[0], reverse = False)
unique_token = [token for (ID,token) in sort_token]

import numpy as np
matrix = gensim.matutils.corpus2dense(corpus,num_terms=len(dictionary),dtype = 'int')
matrix = matrix.T #transpose the matrix 

#convert the numpy matrix into pandas data frame
matrix_df = pd.DataFrame(matrix, columns=unique_token)

# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 1
chunksize = 2000
passes = 20
iterations = 100
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

lda = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

lda.print_topics(1) #V matrix, topic matrix
import re
for i,topic in lda.print_topics(1):
    print(color.BOLD + (f'Top 10 words for Napa Valley (CA)') + color.END)
    print(", ".join(re.findall('".*?"',topic)))
    print('\n')

<IPython.core.display.Javascript object>

[1mTop 10 words for Napa Valley (CA)[0m
"flavor", "tannin", "blackberry", "oak", "cabernet", "dry", "black", "rich", "currant", "finish"




## Willamette Valley (OR)

In [12]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['wine', 'flavor', 'fruit', 'cherry','finish'])

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(willanmetteValley_docs)):
    willanmetteValley_docs[idx] = willanmetteValley_docs[idx].lower()  # Convert to lowercase.
    willanmetteValley_docs[idx] = tokenizer.tokenize(willanmetteValley_docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
willanmetteValley_docs = [[token for token in doc if not token.isnumeric()] for doc in willanmetteValley_docs]
    
# Remove stopwords.
willanmetteValley_docs = [[token for token in doc if token not in stop_words] for doc in willanmetteValley_docs]

# Remove words that are only one character.
willanmetteValley_docs = [[token for token in doc if len(token) > 1] for doc in willanmetteValley_docs]

# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
willanmetteValley_docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in willanmetteValley_docs]

# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(willanmetteValley_docs, min_count=10)
for idx in range(len(willanmetteValley_docs)):
    for token in bigram[willanmetteValley_docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            willanmetteValley_docs[idx].append(token)
            
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary1 = Dictionary(willanmetteValley_docs)
corpus1 = [dictionary1.doc2bow(doc) for doc in willanmetteValley_docs]

sort_token1 = sorted(dictionary1.items(),key=lambda k:k[0], reverse = False)
unique_token1 = [token.encode('utf8') for (ID,token) in sort_token1]

import numpy as np
matrix1 = gensim.matutils.corpus2dense(corpus1,num_terms=len(dictionary1),dtype = 'int')
matrix1 = matrix1.T #transpose the matrix 

#convert the numpy matrix into pandas data frame
matrix_df1 = pd.DataFrame(matrix1, columns=unique_token1)

# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 1
chunksize = 2000
passes = 20
iterations = 100
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary1[0]  # This is only to "load" the dictionary.
id2word1 = dictionary1.id2token

lda = LdaModel(
    corpus=corpus1,
    id2word=id2word1,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

lda.print_topics(1) #V matrix, topic matrix
import re
for i,topic in lda.print_topics(1):
    print(color.BOLD + (f'Top 10 words for Willamette Valley (OR)') + color.END)
    print(", ".join(re.findall('".*?"',topic)))
    print('\n')

<IPython.core.display.Javascript object>

[1mTop 10 words for Willamette Valley (OR)[0m
"flavor", "pinot", "light", "tart", "vineyard", "oak", "apple", "hint", "tannin", "barrel"




## Columbia Valley (Washington)

In [13]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['wine', 'flavor', 'fruit', 'cherry','finish'])

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(columbiaValley_docs)):
    columbiaValley_docs[idx] = columbiaValley_docs[idx].lower()  # Convert to lowercase.
    columbiaValley_docs[idx] = tokenizer.tokenize(columbiaValley_docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
columbiaValley_docs = [[token for token in doc if not token.isnumeric()] for doc in columbiaValley_docs]
    
# Remove stopwords.
columbiaValley_docs = [[token for token in doc if token not in stop_words] for doc in columbiaValley_docs]

# Remove words that are only one character.
columbiaValley_docs = [[token for token in doc if len(token) > 1] for doc in columbiaValley_docs]

# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
columbiaValley_docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in columbiaValley_docs]

# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(columbiaValley_docs, min_count=10)
for idx in range(len(columbiaValley_docs)):
    for token in bigram[columbiaValley_docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            columbiaValley_docs[idx].append(token)
            
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary6 = Dictionary(columbiaValley_docs)
corpus6 = [dictionary6.doc2bow(doc) for doc in columbiaValley_docs]

sort_token6 = sorted(dictionary6.items(),key=lambda k:k[0], reverse = False)
unique_token6 = [token.encode('utf8') for (ID,token) in sort_token6]

import numpy as np
matrix6 = gensim.matutils.corpus2dense(corpus6,num_terms=len(dictionary6),dtype = 'int')
matrix6 = matrix6.T #transpose the matrix 

#convert the numpy matrix into pandas data frame
matrix_df6 = pd.DataFrame(matrix6, columns=unique_token6)

# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 1
chunksize = 2000
passes = 20
iterations = 100
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary6[0]  # This is only to "load" the dictionary.
id2word6 = dictionary6.id2token

lda = LdaModel(
    corpus=corpus6,
    id2word=id2word6,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

lda.print_topics(1) #V matrix, topic matrix
import re
for i,topic in lda.print_topics(1):
    print(color.BOLD + (f'Top 10 words for Columbia Valley (WA)') + color.END)
    print(", ".join(re.findall('".*?"',topic)))
    print('\n')

<IPython.core.display.Javascript object>

[1mTop 10 words for Columbia Valley (WA)[0m
"flavor", "aroma", "cabernet", "blend", "tannin", "herb", "black", "spice", "merlot", "red"




## Finger Lakes (NY)

In [14]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['flavor', 'wine'])

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(fingerlakes_docs)):
    fingerlakes_docs[idx] = fingerlakes_docs[idx].lower()  # Convert to lowercase.
    fingerlakes_docs[idx] = tokenizer.tokenize(fingerlakes_docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
fingerlakes_docs = [[token for token in doc if not token.isnumeric()] for doc in fingerlakes_docs]
    
# Remove stopwords.
fingerlakes_docs = [[token for token in doc if token not in stop_words] for doc in fingerlakes_docs]

# Remove words that are only one character.
fingerlakes_docs = [[token for token in doc if len(token) > 1] for doc in fingerlakes_docs]

# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
fingerlakes_docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in fingerlakes_docs]

# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(fingerlakes_docs, min_count=10)
for idx in range(len(fingerlakes_docs)):
    for token in bigram[fingerlakes_docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            fingerlakes_docs[idx].append(token)
            
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary2 = Dictionary(fingerlakes_docs)
corpus2 = [dictionary2.doc2bow(doc) for doc in fingerlakes_docs]

sort_token2 = sorted(dictionary2.items(),key=lambda k:k[0], reverse = False)
unique_token2 = [token.encode('utf8') for (ID,token) in sort_token2]

import numpy as np
matrix2 = gensim.matutils.corpus2dense(corpus2,num_terms=len(dictionary2),dtype = 'int')
matrix2 = matrix2.T #transpose the matrix 

#convert the numpy matrix into pandas data frame
matrix_df2 = pd.DataFrame(matrix2, columns=unique_token2)

# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 1
chunksize = 2000
passes = 20
iterations = 100
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary2[0]  # This is only to "load" the dictionary.
id2word2 = dictionary2.id2token

lda = LdaModel(
    corpus=corpus2,
    id2word=id2word2,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

lda.print_topics(1) #V matrix, topic matrix
import re
for i,topic in lda.print_topics(1):
    print(color.BOLD + (f'Top 10 words for Finger Lakes (NY)') + color.END)
    print(", ".join(re.findall('".*?"',topic)))
    print('\n')

<IPython.core.display.Javascript object>

[1mTop 10 words for Finger Lakes (NY)[0m
"finish", "palate", "flavor", "acidity", "note", "dry", "apple", "cherry", "fresh", "riesling"




## Toscana (Italy)

In [15]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['wine', 'flavor','fruit', 'cherry'])

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(toscana_docs)):
    toscana_docs[idx] = toscana_docs[idx].lower()  # Convert to lowercase.
    toscana_docs[idx] = tokenizer.tokenize(toscana_docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
toscana_docs = [[token for token in doc if not token.isnumeric()] for doc in toscana_docs]
    
# Remove stopwords.
toscana_docs = [[token for token in doc if token not in stop_words] for doc in toscana_docs]

# Remove words that are only one character.
toscana_docs = [[token for token in doc if len(token) > 1] for doc in toscana_docs]

# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
toscana_docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in toscana_docs]

# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(toscana_docs, min_count=10)
for idx in range(len(toscana_docs)):
    for token in bigram[toscana_docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            toscana_docs[idx].append(token)
            
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary3 = Dictionary(toscana_docs)
corpus3 = [dictionary3.doc2bow(doc) for doc in toscana_docs]

sort_token3 = sorted(dictionary3.items(),key=lambda k:k[0], reverse = False)
unique_token3 = [token.encode('utf8') for (ID,token) in sort_token3]

import numpy as np
matrix3 = gensim.matutils.corpus2dense(corpus3,num_terms=len(dictionary3),dtype = 'int')
matrix3 = matrix3.T #transpose the matrix 

#convert the numpy matrix into pandas data frame
matrix_df3 = pd.DataFrame(matrix3, columns=unique_token3)

# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 1
chunksize = 2000
passes = 20
iterations = 100
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary3[0]  # This is only to "load" the dictionary.
id2word3 = dictionary3.id2token

lda = LdaModel(
    corpus=corpus3,
    id2word=id2word3,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

lda.print_topics(1) #V matrix, topic matrix
import re
for i,topic in lda.print_topics(1):
    print(color.BOLD + (f'Top 10 words for Toscana (Italy)') + color.END)
    print(", ".join(re.findall('".*?"',topic)))
    print('\n')

<IPython.core.display.Javascript object>

[1mTop 10 words for Toscana (Italy)[0m
"aroma", "black", "tannin", "cabernet", "blend", "sangiovese", "palate", "spice", "merlot", "sauvignon"




## Alsace (France)

In [16]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['wine', 'flavor','fruit', 'cherry','finish'])

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(alsace_docs)):
    alsace_docs[idx] = alsace_docs[idx].lower()  # Convert to lowercase.
    alsace_docs[idx] = tokenizer.tokenize(alsace_docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
alsace_docs = [[token for token in doc if not token.isnumeric()] for doc in alsace_docs]
    
# Remove stopwords.
alsace_docs = [[token for token in doc if token not in stop_words] for doc in alsace_docs]

# Remove words that are only one character.
alsace_docs = [[token for token in doc if len(token) > 1] for doc in alsace_docs]

# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
alsace_docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in alsace_docs]

# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(alsace_docs, min_count=10)
for idx in range(len(alsace_docs)):
    for token in bigram[alsace_docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            alsace_docs[idx].append(token)
            
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary4 = Dictionary(alsace_docs)
corpus4 = [dictionary4.doc2bow(doc) for doc in alsace_docs]

sort_token4 = sorted(dictionary4.items(),key=lambda k:k[0], reverse = False)
unique_token4 = [token.encode('utf8') for (ID,token) in sort_token4]

import numpy as np
matrix4 = gensim.matutils.corpus2dense(corpus4,num_terms=len(dictionary4),dtype = 'int')
matrix4 = matrix4.T #transpose the matrix 

#convert the numpy matrix into pandas data frame
matrix_df4 = pd.DataFrame(matrix4, columns=unique_token4)

# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 1
chunksize = 2000
passes = 20
iterations = 100
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary4[0]  # This is only to "load" the dictionary.
id2word4 = dictionary4.id2token

lda = LdaModel(
    corpus=corpus4,
    id2word=id2word4,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

lda.print_topics(1) #V matrix, topic matrix
import re
for i,topic in lda.print_topics(1):
    print(color.BOLD + (f'Top 10 words for Alsace (France)') + color.END)
    print(", ".join(re.findall('".*?"',topic)))
    print('\n')

<IPython.core.display.Javascript object>

[1mTop 10 words for Alsace (France)[0m
"palate", "dry", "drink", "note", "nose", "ripe", "apple", "freshness", "fresh", "flavor"




## Mendoza (Argentina)

In [17]:
#Tokenize the documents.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['wine', 'flavor','fruit', 'cherry','finish'])

#Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(mendoza_docs)):
    mendoza_docs[idx] = mendoza_docs[idx].lower()  # Convert to lowercase.
    mendoza_docs[idx] = tokenizer.tokenize(mendoza_docs[idx])  # Split into words.

#Remove numbers, but not words that contain numbers.
mendoza_docs = [[token for token in doc if not token.isnumeric()] for doc in mendoza_docs]
    
#Remove stopwords.
mendoza_docs = [[token for token in doc if token not in stop_words] for doc in mendoza_docs]

#Remove words that are only one character.
mendoza_docs = [[token for token in doc if len(token) > 1] for doc in mendoza_docs]

# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
mendoza_docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in mendoza_docs]

# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(mendoza_docs, min_count=10)
for idx in range(len(mendoza_docs)):
    for token in bigram[mendoza_docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            mendoza_docs[idx].append(token)
            
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary5 = Dictionary(mendoza_docs)
corpus5 = [dictionary5.doc2bow(doc) for doc in mendoza_docs]

sort_token5 = sorted(dictionary5.items(),key=lambda k:k[0], reverse = False)
unique_token5 = [token.encode('utf8') for (ID,token) in sort_token5]

import numpy as np
matrix5 = gensim.matutils.corpus2dense(corpus5,num_terms=len(dictionary5),dtype = 'int')
matrix5 = matrix5.T #transpose the matrix 

#convert the numpy matrix into pandas data frame
matrix_df5 = pd.DataFrame(matrix5, columns=unique_token5)

# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 1
chunksize = 2000
passes = 20
iterations = 100
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary5[0]  # This is only to "load" the dictionary.
id2word5 = dictionary5.id2token

lda = LdaModel(
    corpus=corpus5,
    id2word=id2word5,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

lda.print_topics(1) #V matrix, topic matrix
import re
for i,topic in lda.print_topics(1):
    print(color.BOLD + (f'Top 10 words for Mendoza (Argentina)') + color.END)
    print(", ".join(re.findall('".*?"',topic)))
    print('\n')

<IPython.core.display.Javascript object>

[1mTop 10 words for Mendoza (Argentina)[0m
"flavor", "aroma", "palate", "berry", "plum", "feel", "nose", "black", "blackberry", "note"




### Rioja (Spain)

In [18]:
#Tokenize the documents.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['wine', 'flavor','fruit', 'cherry','finish'])

#Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(rioja_docs)):
    rioja_docs[idx] = rioja_docs[idx].lower()  # Convert to lowercase.
    rioja_docs[idx] = tokenizer.tokenize(rioja_docs[idx])  # Split into words.

#Remove numbers, but not words that contain numbers.
rioja_docs = [[token for token in doc if not token.isnumeric()] for doc in rioja_docs]
    
#Remove stopwords.
rioja_docs = [[token for token in doc if token not in stop_words] for doc in rioja_docs]

#Remove words that are only one character.
rioja_docs = [[token for token in doc if len(token) > 1] for doc in rioja_docs]

rioja# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
rioja_docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in rioja_docs]

# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(rioja_docs, min_count=10)
for idx in range(len(rioja_docs)):
    for token in bigram[rioja_docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            rioja_docs[idx].append(token)
            
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary7 = Dictionary(rioja_docs)
corpus7 = [dictionary7.doc2bow(doc) for doc in rioja_docs]

sort_token7 = sorted(dictionary7.items(),key=lambda k:k[0], reverse = False)
unique_token7 = [token.encode('utf8') for (ID,token) in sort_token7]

import numpy as np
matrix7 = gensim.matutils.corpus2dense(corpus7,num_terms=len(dictionary7),dtype = 'int')
matrix7 = matrix7.T #transpose the matrix 

#convert the numpy matrix into pandas data frame
matrix_df7 = pd.DataFrame(matrix7, columns=unique_token7)

# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 1
chunksize = 2000
passes = 20
iterations = 100
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary5[0]  # This is only to "load" the dictionary.
id2word7 = dictionary7.id2token

lda = LdaModel(
    corpus=corpus7,
    id2word=id2word7,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

lda.print_topics(1) #V matrix, topic matrix
import re
for i,topic in lda.print_topics(1):
    print(color.BOLD + (f'Top 10 words for Rioja (Spain)') + color.END)
    print(", ".join(re.findall('".*?"',topic)))
    print('\n')

<IPython.core.display.Javascript object>

[1mTop 10 words for Rioja (Spain)[0m
"flavor", "aroma", "berry", "plum", "palate", "feel", "drink", "oak", "red", "note"




## Best Bang for your Buck Wineries

In [36]:
df2 = df1

In [37]:
df2 = df2.drop_duplicates()[['winery','region_1','country','points','price']].dropna()
df = df2.rename(columns = {'region_1': 'region'})    

In [38]:
df = df[(df["region"]=='Finger Lakes') | (df["region"]=='Alsace')  | (df["region"]=='Rioja') | (df["region"]=='Willamette Valley') | (df["region"]=='Mendoza') | (df["region"]=='Columbia Valley (WA)') | (df["region"]=='Toscana') | (df["region"]=='Napa Valley')]

In [39]:
df

Unnamed: 0,winery,region,country,points,price
2,Rainstorm,Willamette Valley,US,87,14.0
4,Sweet Cheeks,Willamette Valley,US,87,65.0
7,Trimbach,Alsace,France,87,24.0
9,Jean-Baptiste Adam,Alsace,France,87,27.0
10,Kirkland Signature,Napa Valley,US,87,19.0
...,...,...,...,...,...
149617,Standing Stone,Finger Lakes,US,84,13.0
149626,Beringer,Napa Valley,US,84,16.0
149627,Marc Kreydenweiss,Alsace,France,84,21.0
149630,Pine Ridge,Napa Valley,US,84,27.0


In [40]:
Q1 = np.percentile(df['price'], 10,
                   interpolation = 'midpoint')
 
Q3 = np.percentile(df['price'], 90,
                   interpolation = 'midpoint')
IQR = Q3 - Q1
df = df[df.price < (Q3+1.5*IQR)+1]

Q1 = np.percentile(df['points'], 5, interpolation = 'midpoint')#
 
Q3 = np.percentile(df['points'], 95, interpolation = 'midpoint')
IQR = Q3 - Q1
df = df[df.points < (Q3+1.5*IQR)+1]

In [41]:
wine = df.groupby('winery').agg({'region':'first','country':'first',
                         'points':'mean', 
                         'price':'mean'})
df2 = wine
wine = wine.reset_index()
df2 = df2.reset_index()
df2 = df2.rename(columns = {'region': 'r2','country': 'c2', 'points': 'norm_points', "price": 'norm_price'})

### Normalizing Variables, Creating Wine Calc Variable

In [42]:
cols_to_norm = ['norm_points','norm_price']
df2[cols_to_norm] = df2[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [43]:
wine = wine.merge(df2,on='winery', how='left')

In [44]:
best_wine_fin = wine
best_wine_fin['wine_calc'] = (best_wine_fin['norm_points']-best_wine_fin['norm_price'])
best_wine_fin = wine[['winery','region','country','points','norm_points','price','norm_price','wine_calc']]
best_wine_fin = best_wine_fin.set_index('winery')
#best_wine_fin['countyGroup'] = best_wine_fin['countyGroup'].replace({1.0: 'Napa County', 2.0: 'Sonoma County', 3.0 : 'Central Coast', 4.0: 'Northern California', 5.0 : 'Washington', 6.0 : 'Oregon'})

In [45]:
def highlight_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['background-color: lightgreen' if v else '' for v in is_max]

In [46]:
def highlight_min(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_min = s == s.min()
    return ['background-color: red' if v else '' for v in is_min]

In [53]:
best_wine = best_wine_fin.sort_values(by='wine_calc', ascending=False).head(5)
best_wine.style.set_caption("Top Bang for Your Buck Wineries").apply(highlight_max, subset = best_wine.columns[-1])

Unnamed: 0_level_0,region,country,points,norm_points,price,norm_price,wine_calc
winery,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Julien Schaal,Alsace,France,94.75,0.819444,32.5,0.202128,0.617317
Kevin White,Columbia Valley (WA),US,94.0,0.777778,28.0,0.170213,0.607565
Philippe-Lorraine,Napa Valley,US,92.333333,0.685185,21.666667,0.125296,0.55989
Proteus,Willamette Valley,US,93.0,0.722222,28.0,0.170213,0.552009
Bestheim,Alsace,France,92.5,0.694444,25.0,0.148936,0.545508


In [52]:
worst_wine = best_wine_fin.sort_values(by='wine_calc', ascending=True).head(5)
worst_wine.style.set_caption("Worst Bang for Your Buck Wineries").apply(highlight_min, subset = best_wine.columns[-1])

Unnamed: 0_level_0,region,country,points,norm_points,price,norm_price,wine_calc
winery,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Better Half,Napa Valley,US,86.0,0.333333,130.0,0.893617,-0.560284
Concrete,Napa Valley,US,81.0,0.055556,75.0,0.503546,-0.447991
Horned Toad,Napa Valley,US,83.0,0.166667,85.0,0.574468,-0.407801
Calla Lily,Napa Valley,US,88.0,0.444444,120.0,0.822695,-0.378251
Pont de Chevalier,Napa Valley,US,90.0,0.555556,135.0,0.929078,-0.373522
