In [4]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import re
import requests
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import nltk
# nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ejfel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ejfel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from sklearn.cluster import KMeans

In [5]:
with open('coffee_words.pickle','rb') as read_file:
    coffee = pickle.load(read_file)
with open('coffee_ratings.pickle','rb') as read_file:
    ratings = pickle.load(read_file)
with open('combined.pickle','rb') as read_file:
    combined = pickle.load(read_file)
with open('df.pickle','rb') as read_file:
    df = pickle.load(read_file)

In [7]:
df = pd.DataFrame()
df['Roaster'] = coffee['Roaster']
df['TextA'] = coffee.Review + coffee.Notes + coffee.TLDR
df['TextB'] = df.TextA.str.replace('[^ ]+\.[^ ]+','',regex=True)
df['Text'] = df.TextB.str.replace(r'Visit.*\n?','',regex=True)
df.drop(columns=['TextA','TextB'],inplace=True)
df

Unnamed: 0,Roaster,Text
0,Jackrabbit Java,"Yeasty, richly sweet-savory. Fresh-baked bread..."
1,Jackrabbit Java,"Balanced, sweet-toned, floral. Tea rose, cocoa..."
2,Red Rooster Coffee Roaster,"Delicate, deep; complex. Pomegranate, macadami..."
3,Paradise Roasters,"Very sweet, floral-toned. Freesia, pink grapef..."
4,Kakalove Cafe,"Opulent, richly sweet-tart-savory. Black curra..."
...,...,...
5954,The Coffee Beanery,A light-medium-roasted blend with power: The a...
5955,Starbucks Coffee,The rest of the taste profile plays peek-a-boo...
5956,Peerless Coffee,"Given the medium roast, the carbon notes here ..."
5957,Gevalia,"For such a relatively light roast, not particu..."


In [9]:
documents = df.Text
  
# raw documents to tf-idf matrix: 
vectorizer = TfidfVectorizer(stop_words='english', 
                             use_idf=True, 
                             smooth_idf=True)
# SVD to reduce dimensionality: 
svd_model = TruncatedSVD(n_components=100,         #// num dimensions
                         algorithm='randomized',
                         n_iter=10)
# pipeline of tf-idf + SVD, fit to and applied to documents:
svd_transformer = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])
svd_matrix = svd_transformer.fit_transform(documents)
# svd_matrix can later be used to compare documents, compare words, or compare queries with documents

In [14]:
!pip install gensim

Collecting gensim
  Downloading gensim-3.8.3-cp38-cp38-win_amd64.whl (24.2 MB)
Collecting smart-open>=1.8.1
  Downloading smart_open-4.2.0.tar.gz (119 kB)
Collecting Cython==0.29.14
  Downloading Cython-0.29.14-cp38-cp38-win_amd64.whl (1.7 MB)
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py): started
  Building wheel for smart-open (setup.py): finished with status 'done'
  Created wheel for smart-open: filename=smart_open-4.2.0-py3-none-any.whl size=109637 sha256=2b5bfd7dc24d2dc3b4b2154e8e71b77f8bb6a14476bfd1b16acca7e0286b197e
  Stored in directory: c:\users\ejfel\appdata\local\pip\cache\wheels\24\f6\ea\70a0761bdfaeacff66662751fe71920e25c4c43d97098a3886
Successfully built smart-open
Installing collected packages: smart-open, Cython, gensim
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.21
    Uninstalling Cython-0.29.21:
      Successfully uninstalled Cython-0.29.21
Successfully installed Cython-0.29.14 gensim-3.

In [19]:

from gensim.corpora import Dictionary
# from gensim.corpora.Dictionary import load_from_text, doc2bow
from gensim.corpora import MmCorpus
from gensim.models.ldamodel import LdaModel
document = df.Text
# load id->word mapping (the dictionary)
id2word = Dictionary.load_from_text('wiki_en_wordids.txt')
# load corpus iterator
mm = MmCorpus('wiki_en_tfidf.mm')
# extract 100 LDA topics, updating once every 10,000
lda = LdaModel(corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=10000, passes=1)
# use LDA model: transform new doc to bag-of-words, then apply lda
doc_bow = Dictionary.doc2bow(document.split())
doc_lda = lda[doc_bow]
# doc_lda is vector of length num_topics representing weighted presence of each topic in the doc

FileNotFoundError: [Errno 2] No such file or directory: 'wiki_en_wordids.txt'

## LDA from new site

In [20]:
vectorizer = CountVectorizer(min_df=10, stop_words = 'english',max_df=3200)
doc_word = vectorizer.fit_transform(df.Text.str.replace(r'\d+','',regex=True))

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# vectorize the corpus
count_vectorizer = CountVectorizer(min_df=10, max_df=3200, ngram_range=(1,1), stop_words='english')
tfidf_vectorizer = TfidfVectorizer(min_df=10, max_df=3200, ngram_range=(1,1), stop_words='english')

# calculate the feature matrix
feature_matrix = count_vectorizer.fit_transform(df.Text.str.replace(r'\d+','',regex=True))
tfidf_feature_matrix = tfidf_vectorizer.fit_transform(df.Text.str.replace(r'\d+','',regex=True))


In [24]:
from sklearn.decomposition import LatentDirichletAllocation

# Instantiate the LDA model
lda_model = LatentDirichletAllocation(n_components=2, max_iter=100, learning_method='online', random_state=43,
                                     batch_size=128, evaluate_every=-1, n_jobs=-1)

# fit transform the feature matrix
lda_output = lda_model.fit_transform(feature_matrix)

# display the lda_output and its shape
display(lda_output)
display(lda_output.shape)

array([[0.82106008, 0.17893992],
       [0.83884816, 0.16115184],
       [0.9878912 , 0.0121088 ],
       ...,
       [0.0141855 , 0.9858145 ],
       [0.01430916, 0.98569084],
       [0.01325895, 0.98674105]])

(5959, 2)

In [None]:
from sklearn.model_selection import GridSearchCV

# Define Search Param
search_params = {'n_components': [2, 3, 4, 5, 10, 15, 20, 25], 'learning_decay': [.5, .7, .9]}

# Init the model
lda = LatentDirichletAllocation()

# Init Grid Search class
model = GridSearchCV(lda, search_params)

model.fit(feature_matrix)
best_lda_model = model.best_estimator_
print("Best model's params: ", model.best_params_)
print("Best log likelihood score: ", model.best_score_)
print("Model perplexity: ", best_lda_model.perplexity(feature_matrix))

In [None]:
df_cv_results = pd.DataFrame(model.cv_results_)
df_cv_results.to_csv("LDAGridSearchResults.csv", header=True, index=False, encoding='utf-8')

In [None]:
sns.pointplot(x="param_n_components", y="mean_test_score", hue="param_learning_decay", data=df_cv_results)


In [None]:
best_lda_model

In [None]:
# Create a document to topic matrix
lda_output = best_lda_model.transform(feature_matrix)
# column names
topicnames = ['Topic_' + str(i) for i in range(best_lda_model.n_components)]

# index names
docnames = ['Doc_' + str(i) for i in range(len(clean_reviews_text))]

# create a dataframe
df_document_topic = pd.DataFrame(np.round(lda_output,2), columns=topicnames, index=docnames)

df_document_topic.head()

In [None]:
# dominant topic
df_document_topic['dominant_topic'] = np.argmax(df_document_topic.values, axis=1)
df_document_topic.head()

In [None]:
sns.countplot(df_document_topic.dominant_topic)

In [3]:
combined.head()

Unnamed: 0,roaster,origin,roast_level,group,rating,aroma,body,flavor,aftertaste,acidity
0,Jackrabbit Java,Costa Rica,Medium-Light,1,93,9.0,9.0,9.0,8.0,8.0
1,Jackrabbit Java,"Nyamasheke District, Rwanda",Medium-Light,0,92,9.0,8.0,9.0,8.0,8.0
2,Red Rooster Coffee Roaster,"Los Naranjos, La Argentina, Huila Department, ...",Light,0,96,9.0,9.0,10.0,9.0,9.0
3,Paradise Roasters,"Huila, Colombia",Light,0,95,9.0,9.0,9.0,9.0,9.0
4,Kakalove Cafe,"Antioquia Department, Colombia",Medium-Light,1,95,9.0,9.0,9.0,9.0,9.0


In [84]:
combined.loc[(combined.aroma == 'NR')|(combined.aroma == 'NA'),'aroma'] = '-999'
combined.aroma = combined.aroma.astype(float)
combined.aroma = combined.aroma.round(0)

combined.loc[(combined.body == 'NR')|(combined.body == 'NA'),'body'] = '-999'
combined.body = combined.body.astype(float)
combined.body = combined.body.round(0)

combined.loc[(combined.flavor == 'NR')|(combined.flavor == 'NA'),'flavor'] = '-999'
combined.flavor = combined.flavor.astype(float)
combined.flavor = combined.flavor.round(0)

combined.aftertaste.fillna('-999',inplace=True)
combined.aftertaste = combined.aftertaste.astype(float)
combined.aftertaste = combined.aftertaste.round(0)

combined.loc[(combined.acidity == 'NR')|(combined.acidity == 'NA')|(combined.acidity == 'na')|(combined.acidity == 'n/a'),'acidity'] = '-999'
combined.loc[(combined.acidity == 'Very Low'),'acidity'] = '1'
combined.loc[(combined.acidity == 'Low'),'acidity'] = '3'
combined.loc[(combined.acidity == 'Moderate'),'acidity'] = '5'
combined.acidity.fillna(-999,inplace=True)
combined.acidity = combined.acidity.astype(float)
combined.acidity = combined.acidity.round(0)

combined.dropna(subset=['rating','aroma','body','flavor','aftertaste','acidity'],axis=0,inplace=True)
# pd.get_dummies(combined,columns=['group'])
combined.head()

  res_values = method(rvalues)


Unnamed: 0,roaster,origin,roast_level,group,rating,aroma,body,flavor,aftertaste,acidity
0,Jackrabbit Java,Costa Rica,Medium-Light,1,93,9.0,9.0,9.0,8.0,8.0
1,Jackrabbit Java,"Nyamasheke District, Rwanda",Medium-Light,0,92,9.0,8.0,9.0,8.0,8.0
2,Red Rooster Coffee Roaster,"Los Naranjos, La Argentina, Huila Department, ...",Light,0,96,9.0,9.0,10.0,9.0,9.0
3,Paradise Roasters,"Huila, Colombia",Light,0,95,9.0,9.0,9.0,9.0,9.0
4,Kakalove Cafe,"Antioquia Department, Colombia",Medium-Light,1,95,9.0,9.0,9.0,9.0,9.0
