# Topic Modeling

In [8]:
#imports
import pandas as pd
import numpy as np
import os
import re
import dill
import sys
from scipy.stats import uniform, randint
from string import punctuation
import nltk

# preprocessing packages
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# pipeline tools
from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import FunctionTransformer

#feature selection
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#models
from sklearn.naive_bayes import GaussianNB, MultinomialNB

#metrics
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.metrics import accuracy_score, balanced_accuracy_score

#visualization
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim_models

  and should_run_async(code)


In [9]:
#nltk.download()

  and should_run_async(code)


In [10]:
# directory locations
# The data and model folders live under the parent of the notebook's working
# directory; both folder strings keep their trailing slash because file names
# are appended to them directly.
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
processed_data_folder = f'{parent_directory}/data/wine-com/processed/'
models_folder = f'{parent_directory}/models/'

  and should_run_async(code)


### Load Data

In [11]:
# the processed export is pipe-delimited
df = pd.read_csv(
    os.path.join(processed_data_folder, '1677432096.083379.txt'),
    sep='|',
)

  and should_run_async(code)


In [12]:
# preview the first rows to check that the file parsed into the expected columns
df.head()

  and should_run_async(code)


Unnamed: 0,product_url,product_name,product_variety,product_origin,product_family,user_avg_rating,user_rating_count,winemaker_description,reviewer_name,reviewer_rating,reviewer_text
0,https://www.wine.com/product/proyecto-salvaje-...,Proyecto Salvaje del Moncayo Garnacha 2020,Grenache,"from Navarra, Spain",Red Wine,4.8,19,bright burgundy wine medium depth tobacco wild...,Decanter,92.0,part proyecto garnachas de españa collection s...
1,https://www.wine.com/product/proyecto-salvaje-...,Proyecto Salvaje del Moncayo Garnacha 2020,Grenache,"from Navarra, Spain",Red Wine,4.8,19,bright burgundy wine medium depth tobacco wild...,Wilfred Wong of Wine.com,91.0,commentary 2020 proyecto garnachas salvaje del...
2,https://www.wine.com/product/domaine-du-terme-...,Domaine du Terme Gigondas 2019,Rhone Red Blends,"from Gigondas, Rhone, France",Red Wine,4.0,17,,Wine & Spirits,96.0,spectacular gigondas wine red cherry flavors s...
3,https://www.wine.com/product/domaine-du-terme-...,Domaine du Terme Gigondas 2019,Rhone Red Blends,"from Gigondas, Rhone, France",Red Wine,4.0,17,,Decanter,94.0,straight first sniff clear going special soari...
4,https://www.wine.com/product/scott-harvey-moun...,Scott Harvey Mountain Selection Zinfandel 2019,Zinfandel,"from Amador, Sierra Foothills, California",Red Wine,4.3,39,fruit forward rich full flavors expressing var...,Wine Enthusiast,93.0,fresh smelling full bodied flavor packed wine ...


In [13]:
# list the available column names
df.columns

  and should_run_async(code)


Index(['product_url', 'product_name', 'product_variety', 'product_origin',
       'product_family', 'user_avg_rating', 'user_rating_count',
       'winemaker_description', 'reviewer_name', 'reviewer_rating',
       'reviewer_text'],
      dtype='object')

In [14]:
# (rows, columns) of the loaded frame
df.shape

  and should_run_async(code)


(20988, 11)

### Reduce to Relevant Data

In [15]:
# keep only the product family, reviewer rating and reviewer text columns
relevant_columns = ['product_family', 'reviewer_rating', 'reviewer_text']
review_data = df[relevant_columns]

  and should_run_async(code)


### Missing Data & Data Type Correction

In [16]:
# number of missing values per column
review_data.isna().sum()

  and should_run_async(code)


product_family        0
reviewer_rating    6451
reviewer_text      6494
dtype: int64

In [17]:
# keep only the rows that actually have review text
has_review_text = review_data['reviewer_text'].notna()
review_data = review_data[has_review_text]

  and should_run_async(code)


In [18]:
# Cast the ratings to integers. ``astype`` with a column mapping returns a new
# frame, so no column is assigned on a frame derived from a filtered slice
# (which triggers pandas' SettingWithCopyWarning) and the cell can be re-run.
# NOTE: the cast raises if any rating is still missing at this point.
review_data = review_data.astype({'reviewer_rating': int})

  and should_run_async(code)


In [19]:
# check dtypes and non-null counts after cleaning
review_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14494 entries, 0 to 20987
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   product_family   14494 non-null  object
 1   reviewer_rating  14494 non-null  int64 
 2   reviewer_text    14494 non-null  object
dtypes: int64(1), object(2)
memory usage: 452.9+ KB


  and should_run_async(code)


### Data Assignment & Splitting

In [20]:
# specifying predictive and target features
# The target column is removed from X together with the unused rating, so the
# label can never leak into the features if the column selection is changed.
target_column = 'product_family'
X = review_data.drop(columns=['reviewer_rating', target_column])
y = review_data[target_column].to_numpy()

  and should_run_async(code)


In [21]:
# hold out 20% of the rows as a test set to approximate real-world performance;
# the fixed seed keeps the split reproducible
train_test_parts = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = train_test_parts

  and should_run_async(code)


The data set is split into training and test sets: 80% of the rows are used for training and 20% are held out for testing.

## Naive Bayes

In [22]:
class CovertToList(TransformerMixin):
    """Flatten the given column(s) into a 1-D numpy array of strings.

    Used in this notebook as the first step of the text pipelines, ahead of
    the vectorizer, which receives the flattened array of documents.

    The values of each column are converted with ``str`` and appended one
    column after another. With more than one column the output therefore has
    ``n_rows * n_columns`` entries and no longer lines up row-for-row with
    the target.
    """

    def transform(self, X):
        """Return every value of ``X`` as a string in a single 1-D array.

        Parameters
        ----------
        X : anything accepted by ``pd.DataFrame`` (DataFrame, 2-D array, ...)

        Returns
        -------
        numpy.ndarray
            The stringified values, column by column.
        """
        # work on a DataFrame so columns can be addressed uniformly
        frame = pd.DataFrame(X)
        transformed_data = []
        for col in frame.columns:
            # use a separate local here: rebinding ``X`` itself would break
            # the lookup of every column after the first one
            column_values = frame[col].tolist()
            transformed_data.extend(str(value) for value in column_values)
        return np.array(transformed_data)

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

  and should_run_async(code)


In [23]:
# columns handed to the text branch of the column transformer
text_variables = ['reviewer_text']

# Count Vectorizer pipeline: flatten the text column, then build token counts
cv_transformer = Pipeline(steps=[
    ('convert_to_list', CovertToList()),
    ('count_vectorizer', CountVectorizer()),
])

# only the text columns are kept; every other column of X is dropped
cv_column_transformer = ColumnTransformer(
    transformers=[('text', cv_transformer, text_variables)],
    remainder='drop',
)

# full model: vectorised text -> variance filter -> multinomial Naive Bayes
nb_full_pipeline = Pipeline(steps=[
    ('column_transformer', cv_column_transformer),
    ('near_zero_variance', VarianceThreshold()),
    ('naive_bayes', MultinomialNB()),
])

  and should_run_async(code)


## Hyperparameter tuning Naive Bayes

In [31]:
# Randomized search over the Naive Bayes smoothing parameter and the n-gram
# window of the count vectorizer. scipy's uniform(loc, scale) draws alpha
# from the interval [loc, loc + scale].
search_space = [{
    'naive_bayes__alpha': uniform(loc=0.001, scale=10.0),
    'column_transformer__text__count_vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
}]

# 10 sampled candidates, each scored by accuracy with 5-fold cross-validation
cv_nb = RandomizedSearchCV(
    estimator=nb_full_pipeline,
    param_distributions=search_space,
    n_iter=10,
    cv=5,
    n_jobs=6,
    scoring='accuracy',
    random_state=123,
)

cv_nb.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % cv_nb.best_score_)
print(cv_nb.best_params_)

  and should_run_async(code)


Best parameter (CV score=0.897):
{'column_transformer__text__count_vectorizer__ngram_range': (2, 2), 'naive_bayes__alpha': 0.5977789660956835}


The best parameters for the Naive Bayes model are an alpha value of 0.5977789660956835 and an n-gram range of (2, 2).

### Write Pipeline to File

In [25]:
# Persist the best pipeline found by the search.
cv_nb_best_pipeline = cv_nb.best_estimator_
# create the output folder if needed so the dump does not fail with
# FileNotFoundError when the models directory is missing
os.makedirs(models_folder, exist_ok=True)
with open(models_folder + 'topic_nb_best_cv.pkl', 'wb') as f:
    dill.dump(cv_nb_best_pipeline, f)

  and should_run_async(code)


### Holdout Performance

In [26]:
# accuracy of the tuned count-vectorizer model on the held-out test set
nb_test_predictions = cv_nb.predict(X_test)
accuracy_score(y_test, nb_test_predictions)

  and should_run_async(code)


0.9061745429458434

The Naive Bayes model achieved an accuracy score of 0.91 on the holdout set.

## Naive Bayes - TF-IDF Vectorizer

In [27]:
# TF-IDF pipeline: flatten the text column, then build TF-IDF weighted features
tf_transformer = Pipeline(steps=[
    ('convert_to_list', CovertToList()),
    ('tfidf_vectorizer', TfidfVectorizer()),
])

# only the text columns are kept; every other column of X is dropped
tf_column_transformer = ColumnTransformer(
    transformers=[('text', tf_transformer, text_variables)],
    remainder='drop',
)

# full model: TF-IDF text -> variance filter -> multinomial Naive Bayes
tf_full_pipeline = Pipeline(steps=[
    ('column_transformer', tf_column_transformer),
    ('near_zero_variance', VarianceThreshold()),
    ('naive_bayes', MultinomialNB()),
])

  and should_run_async(code)


## Hyperparameter tuning of Naive Bayes with TF-IDF

In [28]:
# Randomized search over the Naive Bayes smoothing parameter and the n-gram
# window of the TF-IDF vectorizer. scipy's uniform(loc, scale) draws alpha
# from the interval [loc, loc + scale].
search_space = [{
    'naive_bayes__alpha': uniform(loc=0.001, scale=10.0),
    'column_transformer__text__tfidf_vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
}]

# 10 sampled candidates, each scored by accuracy with 5-fold cross-validation
tf_nb = RandomizedSearchCV(
    estimator=tf_full_pipeline,
    param_distributions=search_space,
    n_iter=10,
    cv=5,
    n_jobs=6,
    scoring='accuracy',
    random_state=123,
)
tf_nb.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % tf_nb.best_score_)
print(tf_nb.best_params_)

  and should_run_async(code)


Best parameter (CV score=0.728):
{'column_transformer__text__tfidf_vectorizer__ngram_range': (2, 2), 'naive_bayes__alpha': 0.5977789660956835}


The best parameters for the Naive Bayes model using TF-IDF features are an n-gram range of (2, 2) and an alpha value of 0.5977789660956835.

In [29]:
# Persist the best TF-IDF pipeline found by the search.
tf_nb_best_pipeline = tf_nb.best_estimator_
# create the output folder if needed so the dump does not fail with
# FileNotFoundError when the models directory is missing
os.makedirs(models_folder, exist_ok=True)
with open(models_folder + 'topic_nb_best_tfidf.pkl', 'wb') as f:
    dill.dump(tf_nb_best_pipeline, f)

  and should_run_async(code)


### Holdout Performance

In [30]:
# accuracy of the tuned TF-IDF model on the held-out test set
tf_test_predictions = tf_nb.predict(X_test)
accuracy_score(y_test, tf_test_predictions)

  and should_run_async(code)


0.7419799931010693

The accuracy of the Naive Bayes model using TF-IDF features is 0.74 on the holdout set.

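Both models performed reasonably well. The Naive Bayes model with count features reached a holdout accuracy of 0.91, while the TF-IDF variant reached 0.74. The lower-scoring model might still be preferable for real-world application if the count-based model is overfitted; note, however, that the count-based model's holdout accuracy (0.91) is close to its cross-validation score (0.897), so overfitting is not evident from these results.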