# Sentiment Analysis & Modeling

In [1]:
import pandas as pd
import numpy as np
import os
import re
import dill
from scipy.stats import uniform, randint
from string import punctuation
import nltk

# preprocessing packages
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# pipeline tools
from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

#feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#models
from nltk.sentiment import SentimentIntensityAnalyzer
from xgboost import XGBRegressor

#metrics
from sklearn.metrics import make_scorer, mean_squared_error

#custom classes
from preprocessing import preprocess_text

In [2]:
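# One-time setup: uncomment to download NLTK resources (SentimentIntensityAnalyzer needs the 'vader_lexicon' resource).
#nltk.download()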

In [3]:
# directory locations, all derived from the notebook's working directory
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
# trailing slashes are kept because later cells append file names directly
processed_data_folder = f'{parent_directory}/data/wine-com/processed/'
models_folder = f'{parent_directory}/models/'

### Load Data

In [4]:
# Pipe-delimited file in the processed-data folder.
# NOTE(review): the file name looks like a Unix timestamp - presumably the processing run; confirm.
df = pd.read_csv(f'{processed_data_folder}1677432096.083379.txt', sep='|')

In [5]:
# Preview the first rows to sanity-check that the pipe-delimited file parsed into the expected columns.
df.head()

Unnamed: 0,product_url,product_name,product_variety,product_origin,product_family,user_avg_rating,user_rating_count,winemaker_description,reviewer_name,reviewer_rating,reviewer_text
0,https://www.wine.com/product/proyecto-salvaje-...,Proyecto Salvaje del Moncayo Garnacha 2020,Grenache,"from Navarra, Spain",Red Wine,4.8,19,bright burgundy wine medium depth tobacco wild...,Decanter,92.0,part proyecto garnachas de españa collection s...
1,https://www.wine.com/product/proyecto-salvaje-...,Proyecto Salvaje del Moncayo Garnacha 2020,Grenache,"from Navarra, Spain",Red Wine,4.8,19,bright burgundy wine medium depth tobacco wild...,Wilfred Wong of Wine.com,91.0,commentary 2020 proyecto garnachas salvaje del...
2,https://www.wine.com/product/domaine-du-terme-...,Domaine du Terme Gigondas 2019,Rhone Red Blends,"from Gigondas, Rhone, France",Red Wine,4.0,17,,Wine & Spirits,96.0,spectacular gigondas wine red cherry flavors s...
3,https://www.wine.com/product/domaine-du-terme-...,Domaine du Terme Gigondas 2019,Rhone Red Blends,"from Gigondas, Rhone, France",Red Wine,4.0,17,,Decanter,94.0,straight first sniff clear going special soari...
4,https://www.wine.com/product/scott-harvey-moun...,Scott Harvey Mountain Selection Zinfandel 2019,Zinfandel,"from Amador, Sierra Foothills, California",Red Wine,4.3,39,fruit forward rich full flavors expressing var...,Wine Enthusiast,93.0,fresh smelling full bodied flavor packed wine ...


In [6]:
# List the available columns before narrowing down to the ones used for modelling.
df.columns

Index(['product_url', 'product_name', 'product_variety', 'product_origin',
       'product_family', 'user_avg_rating', 'user_rating_count',
       'winemaker_description', 'reviewer_name', 'reviewer_rating',
       'reviewer_text'],
      dtype='object')

In [7]:
# (rows, columns) of the raw frame, as a baseline for the row counts after cleaning.
df.shape

(20988, 11)

### Reduce to Relevant Data

In [8]:
# keep only the columns used in the rest of the notebook
review_data = df.loc[:, ['product_family', 'reviewer_rating', 'reviewer_text']]

### Missing Data & Data Type Correction

In [9]:
# count missing values per column
review_data.isna().sum()

product_family        0
reviewer_rating    6451
reviewer_text      6494
dtype: int64

In [10]:
# Drop rows missing either the review text or the rating: the text is the model input and
# the rating is cast to int in the next step, which raises on NaN.
review_data = review_data.dropna(subset = ['reviewer_text', 'reviewer_rating'])

In [11]:
# Cast the rating to int by building a new frame instead of assigning into a column of a
# frame that was derived by slicing/dropna (the pattern that triggers SettingWithCopyWarning).
review_data = review_data.astype({'reviewer_rating': int})

In [12]:
# Confirm non-null counts and dtypes after the cleaning steps.
review_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14494 entries, 0 to 20987
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   product_family   14494 non-null  object
 1   reviewer_rating  14494 non-null  int32 
 2   reviewer_text    14494 non-null  object
dtypes: int32(1), object(2)
memory usage: 396.3+ KB


### Data Assignment & Splitting

In [13]:
# predictors: every column except the rating; target: the rating, kept as a one-column DataFrame
X = review_data.drop('reviewer_rating', axis='columns')
y = review_data.loc[:, ['reviewer_rating']]

In [14]:
# hold out 20% of the rows (fixed seed) to approximate real-world performance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

## Rule-based Approach

### NLTK - VADER

The first approach this analysis implements is a rule-based approach using VADER via NLTK's SentimentIntensityAnalyzer. Because this approach does not require labeled training data, it provides a useful baseline against which the other models can be compared.

In [19]:
class SIATransformer(TransformerMixin):
    """Replace each text element with its VADER compound sentiment score.

    Every element of the input is passed to
    ``SentimentIntensityAnalyzer.polarity_scores`` and replaced by the
    ``'compound'`` entry of the result. The output is a float array with the
    same shape as the input.
    """

    def __init__(self):
        self.sia = SentimentIntensityAnalyzer()
        self.sia_transformer_lambda = lambda x: self.sia.polarity_scores(x)['compound']

    def transform(self, X):
        """Return the compound score of every element of ``X`` as a float array."""
        # otypes is set explicitly: without it np.vectorize infers the output dtype by
        # calling the function on the first element and raises ValueError on size-0 input.
        return np.vectorize(self.sia_transformer_lambda, otypes=[float])(X)

    def fit(self, X, y=None):
        """No-op: the analyzer is created in ``__init__`` and nothing is learned from ``X``."""
        return self
    
class SIAScaler(TransformerMixin):
    """Linearly rescale scores so that -1 maps to 0 and 1 maps to 100.

    ``predict`` is provided in addition to ``transform`` so the scaler can be
    the final step of a pipeline that is queried with ``predict``.
    """

    def __init__(self):
        # (x + 1) * 100 / 2, kept as an attribute for backward compatibility
        self.sia_scaler_lambda = lambda x: (x - (-1))*100/(2)

    def transform(self, X):
        """Return the rescaled values as a float array with the same shape as ``X``."""
        # The formula is plain arithmetic, so it is applied to the whole array at once;
        # this also works for empty input, where np.vectorize (without otypes) raises.
        return self.sia_scaler_lambda(np.asarray(X, dtype=float))

    def fit(self, X, y=None):
        """No-op: the scaling constants are fixed."""
        return self

    def predict(self, X):
        """Same as ``transform``."""
        return self.transform(X)

In [27]:
# specifying column transformer fields
text_variables = ['reviewer_text']

# text pipeline: clean the raw review text with the project's preprocess_text step
text_transformer = Pipeline(steps=[('text_cleaner', preprocess_text())])

# full pipeline: select/clean the text column -> VADER compound score -> rescale
sia_pipeline = Pipeline(steps=[
    ('column_transformer', ColumnTransformer(transformers=[('text', text_transformer, text_variables)],
                                             remainder='drop')),
    ('sia_transformer', SIATransformer()),
    ('sia_scaler', SIAScaler()),
])

sia_pipeline.fit(X_train, y_train)

Pipeline(steps=[('column_transformer',
                 ColumnTransformer(transformers=[('text',
                                                  Pipeline(steps=[('text_cleaner',
                                                                   <preprocessing.preprocess_text object at 0x000001FC78987A30>)]),
                                                  ['reviewer_text'])])),
                ('sia_transformer',
                 <__main__.SIATransformer object at 0x000001FC789874C0>),
                ('sia_scaler',
                 <__main__.SIAScaler object at 0x000001FC79186190>)])

Since the SIA model's values are static (the VADER steps learn nothing during `fit`), sentiment projection could be performed on the entire available dataset; the evaluation below uses the holdout set.

### VADER Performance

In [28]:
# Root-mean-squared error of the rescaled VADER scores against the holdout ratings.
np.sqrt(mean_squared_error(y_test, sia_pipeline.predict(X_test)))

16.85189886947674

### Write Pipeline to File

In [29]:
sia_model_path = models_folder + 'sia_pipeline.pkl'

# Create the models folder on first use so the write does not fail on a fresh checkout.
os.makedirs(models_folder, exist_ok=True)

# Serialize the fitted pipeline with dill.
# NOTE(review): presumably dill is used because the pipeline holds lambdas and notebook-defined classes - confirm.
with open(sia_model_path, 'wb') as f:
    dill.dump(sia_pipeline, f)

## Machine Learning Approach

### XGBoost - Count Vectorizer

In [30]:
class CovertToList(TransformerMixin):
    """Turn the first column of a DataFrame into a plain list of strings.

    NOTE(review): the class name is presumably a typo for ``ConvertToList``;
    it is left unchanged because other cells reference it by this name.
    """

    def fit(self, X, y=None):
        """No-op: the transformer is stateless."""
        return self

    def transform(self, X):
        """Return ``str(value)`` for every value in the first column of ``X``."""
        first_column = X.iloc[:, 0]
        return [str(value) for value in first_column.tolist()]

In [31]:
# specifying column transformer fields
text_variables = ['reviewer_text']

# text pipeline: first column -> list of strings -> unigram/bigram counts
text_transformer = Pipeline([('convert_to_list', CovertToList()),
                             ('count_vectorizer', CountVectorizer(ngram_range = (1, 2)))])

# The 'near_zero_variance' step must exist because the randomized search tunes
# 'near_zero_variance__threshold'; without it the search raises an invalid-parameter error.
# NOTE(review): tree_method='gpu_hist' needs a GPU-enabled XGBoost build; newer XGBoost
# releases replace it with tree_method='hist' plus device='cuda' - confirm the installed version.
cv_full_pipeline = Pipeline([('column_transformer', ColumnTransformer([('text', text_transformer, text_variables)],
                                                                      remainder = 'drop')),
                             ('near_zero_variance', VarianceThreshold()),
                             ('xgBoost', XGBRegressor(objective = 'reg:squarederror',
                                                      tree_method = 'gpu_hist'))])

In [33]:
# Distributions sampled by the randomized search. Every '<step>__<param>' key must name a
# step of cv_full_pipeline. scipy's uniform(loc, scale) samples from [loc, loc + scale] and
# randint(low, high) from the integers low..high-1.
search_space = [{
    'near_zero_variance__threshold': uniform(0.0, 0.2),
    'xgBoost__n_estimators': randint(100, 2000),
    'xgBoost__learning_rate': uniform(0.01, 0.3),
    'xgBoost__max_depth': randint(3, 10),
    'xgBoost__colsample_bytree': uniform(0.3, 0.7),
    'xgBoost__gamma': uniform(2, 10),
    'xgBoost__reg_alpha': uniform(0.00001, 0.3),
    'xgBoost__reg_lambda': uniform(0.00001, 0.3),
}]

# 500 sampled configurations x 5 folds, scored by negative MSE (higher is better)
cv_xgb = RandomizedSearchCV(
    cv_full_pipeline,
    param_distributions=search_space,
    n_iter=500,
    cv=5,
    n_jobs=6,
    scoring='neg_mean_squared_error',
    random_state=123,
)

cv_xgb.fit(X_train, y_train)

print(f"Best parameter (CV score={cv_xgb.best_score_:.3f}):")
print(cv_xgb.best_params_)

Best parameter (CV score=-2.257):
{'near_zero_variance__threshold': 0.0003445833203715276, 'xgBoost__colsample_bytree': 0.882867360015585, 'xgBoost__gamma': 4.603327846714448, 'xgBoost__learning_rate': 0.11984504160563986, 'xgBoost__max_depth': 5, 'xgBoost__n_estimators': 1043, 'xgBoost__reg_alpha': 0.11224791868521704, 'xgBoost__reg_lambda': 0.14346660640816958}


### Write Pipeline to File

In [38]:
# Persist the best pipeline found by the count-vectorizer search.
# NOTE(review): the NameError about 'pickle' recorded under this cell does not match the
# code here, which uses dill; it looks like stale output - re-run the cell to refresh it.
cv_xgb_best_pipeline = cv_xgb.best_estimator_

# Create the models folder on first use so the write does not fail on a fresh checkout.
os.makedirs(models_folder, exist_ok=True)
with open(models_folder + 'sentiment_xgb_best_cv.pkl', 'wb') as f:
    dill.dump(cv_xgb_best_pipeline, f)

NameError: name 'pickle' is not defined

### Holdout Performance

In [36]:
# Root-mean-squared error of the tuned count-vectorizer model on the holdout set.
np.sqrt(mean_squared_error(y_test, cv_xgb.predict(X_test)))

1.4807609960520562

### XGBoost - TF-IDF Vectorizer

In [39]:
# text pipeline: first column -> list of strings -> unigram/bigram TF-IDF weights
tf_text_transformer = Pipeline(steps=[
    ('convert_to_list', CovertToList()),
    ('tfidf_vectorizer', TfidfVectorizer(ngram_range=(1, 2))),
])

# full pipeline: vectorized review text -> XGBoost regressor
tf_full_pipeline = Pipeline(steps=[
    ('column_transformer', ColumnTransformer(transformers=[('text', tf_text_transformer, text_variables)],
                                             remainder='drop')),
    ('xgBoost', XGBRegressor(objective='reg:squarederror',
                             tree_method='gpu_hist')),
])

In [None]:
# Distributions sampled by the randomized search over the XGBoost step. scipy's
# uniform(loc, scale) samples from [loc, loc + scale] and randint(low, high) from the
# integers low..high-1.
search_space = [{
    'xgBoost__n_estimators': randint(100, 2000),
    'xgBoost__learning_rate': uniform(0.01, 0.3),
    'xgBoost__max_depth': randint(3, 10),
    'xgBoost__colsample_bytree': uniform(0.3, 0.7),
    'xgBoost__gamma': uniform(2, 10),
    'xgBoost__reg_alpha': uniform(0.00001, 0.3),
    'xgBoost__reg_lambda': uniform(0.00001, 0.3),
}]

# 500 sampled configurations x 5 folds, scored by negative MSE (higher is better)
tf_xgb = RandomizedSearchCV(
    tf_full_pipeline,
    param_distributions=search_space,
    n_iter=500,
    cv=5,
    n_jobs=6,
    scoring='neg_mean_squared_error',
    random_state=123,
)
tf_xgb.fit(X_train, y_train)

print(f"Best parameter (CV score={tf_xgb.best_score_:.3f}):")
print(tf_xgb.best_params_)



In [37]:
# Root-mean-squared error of the tuned TF-IDF model on the holdout set.
# NOTE(review): the value recorded under this cell (~2.17e17) carries execution count 37,
# which precedes the TF-IDF pipeline cell (39) and the not-yet-executed search cell, so it
# was not produced by the tf_xgb defined in this notebook as written; re-run after fitting.
np.sqrt(mean_squared_error(y_test, tf_xgb.predict(X_test)))

2.1704606157795706e+17

In [None]:
# Persist the best pipeline found by the TF-IDF search.
tf_xgb_best_pipeline = tf_xgb.best_estimator_

# Create the models folder on first use so the write does not fail on a fresh checkout.
os.makedirs(models_folder, exist_ok=True)
with open(models_folder + 'sentiment_xgb_best_tfidf.pkl', 'wb') as f:
    dill.dump(tf_xgb_best_pipeline, f)