#### Notebook Summary
This notebook contains exploration of the various engineered features to understand whether these behavioral summary features have relationships with our target variable. This was part of our initial feature engineering and exploration. 

In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics

In [2]:
columns = ["user_id", "product_id", "date", "full_review", "rating", "label", "word_list",
           "lemmatized_word_list", "num_words", "num_verbs", "avg_word_len", "emotiveness_ratio",
           "num_positive_words", "num_negative_words", "sentiment"]
preprocessed_data = pd.read_csv("../../data/preprocessing and features for modeling/review_features_02.txt",
                                delimiter="\t",
                                names=columns)

In [3]:
# Add count of previous reviews before the current onw
preprocessed_data['previous_review_count'] = preprocessed_data.sort_values(['user_id', 'date']).groupby(['user_id']).cumcount()

In [4]:
# Add total amount of reviews and avg sentiment by this reviewer
user_id_counts = preprocessed_data[['user_id', 'sentiment']].groupby('user_id').agg({'sentiment': ['mean', 'count']})
user_id_counts.columns = ['avg_user_sentiment', 'total_user_reviews']
preprocessed_data = preprocessed_data.merge(user_id_counts, on='user_id')

In [5]:
# Add total amount of reviews and avg sentiment for this business
product_id_counts = preprocessed_data[['product_id', 'sentiment']].groupby('product_id').agg({'sentiment': ['mean', 'count']})
product_id_counts.columns = ['avg_business_sentiment', 'total_business_reviews']
preprocessed_data = preprocessed_data.merge(product_id_counts, on='product_id')

In [6]:
preprocessed_data

Unnamed: 0,user_id,product_id,date,full_review,rating,label,word_list,lemmatized_word_list,num_words,num_verbs,avg_word_len,emotiveness_ratio,num_positive_words,num_negative_words,sentiment,previous_review_count,avg_user_sentiment,total_user_reviews,avg_business_sentiment,total_business_reviews
0,5044,0,2014-11-16,"Drinks were bad, the hot chocolate was watered...",1.0,-1,"['drinks', 'bad', 'hot', 'chocolate', 'watered...","['drink', 'bad', 'hot', 'chocolate', 'water', ...",17,2,4.882353,0.416667,1,5,-0.235294,0,-0.235294,1,0.123241,88
1,5045,0,2014-09-08,This was the worst experience I've ever had a ...,1.0,-1,"['worst', 'experience', 'ive', 'ever', 'casual...","['bad', 'experience', 'ive', 'ever', 'casual',...",118,21,5.533898,0.430380,4,12,-0.067797,0,-0.067797,1,0.123241,88
2,5046,0,2013-10-06,This is located on the site of the old Spruce ...,3.0,-1,"['located', 'site', 'old', 'spruce', 'st', 'vi...","['locate', 'site', 'old', 'spruce', 'st', 'vid...",24,3,5.125000,0.500000,4,1,0.125000,2,0.062500,4,0.123241,88
3,5047,0,2014-11-30,I enjoyed coffee and breakfast twice at Toast ...,5.0,-1,"['enjoyed', 'coffee', 'breakfast', 'twice', 't...","['enjoy', 'coffee', 'breakfast', 'twice', 'toa...",129,15,5.651163,0.476744,20,4,0.124031,0,0.124031,1,0.123241,88
4,5048,0,2014-08-28,I love Toast! The food choices are fantastic -...,5.0,-1,"['love', 'toast', 'food', 'choices', 'fantasti...","['love', 'toast', 'food', 'choice', 'fantastic...",79,11,5.354430,0.500000,12,0,0.151899,0,0.151899,1,0.123241,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608593,259494,4877,2014-12-10,This is a new restaurant in Newark internation...,1.0,1,"['new', 'restaurant', 'newark', 'international...","['new', 'restaurant', 'newark', 'international...",42,8,4.976190,0.428571,7,1,0.142857,0,0.142857,1,0.142857,1
608594,260401,4176,2011-06-07,"If you appreciate a good sub roll, Slack's has...",5.0,1,"['appreciate', 'good', 'sub', 'roll', 'slacks'...","['appreciate', 'good', 'sub', 'roll', 'slack',...",16,2,4.937500,0.454545,7,1,0.375000,0,0.375000,1,0.277244,2
608595,260402,4176,2008-07-14,Best Philly Hoagies and Cheesesteaks on this s...,5.0,1,"['best', 'philly', 'hoagies', 'cheesesteaks', ...","['best', 'philly', 'hoagy', 'cheesesteaks', 's...",39,2,5.384615,0.583333,7,0,0.179487,0,0.179487,1,0.277244,2
608596,265185,2984,2014-12-18,The food here is amazing! Authentic Haitian ...,5.0,1,"['food', 'amazing', 'authentic', 'haitian', 'c...","['food', 'amaze', 'authentic', 'haitian', 'cui...",30,3,6.233333,0.764706,7,0,0.233333,0,0.233333,1,0.150758,2


In [7]:
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [9]:
orig_features = ['num_words', 'num_verbs', 'avg_word_len', 'emotiveness_ratio',
                 'num_positive_words', 'num_negative_words', 'sentiment', 'rating']
new_features = ['previous_review_count', 'avg_user_sentiment', 'total_user_reviews',
                'avg_business_sentiment', 'total_business_reviews']
label_col = 'label'

In [10]:
%%time
# Baseline model without any direct representation of words or word embeddings

# Using a linear kernel for now as I have seen pretty good results for linear SVM with text classification
# Using LinearSVC for better performance on large data sets
baseline_svm = LinearSVC()

# Best practice scale data when using geometric based models like SVM where magnitude is important
scaler = StandardScaler()

# Best practice to split the data before scaling, so that the scaler is fit on only the training data
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data[orig_features + new_features],
                                                    preprocessed_data[label_col].values,
                                                    test_size=.3,
                                                    random_state=24)

# Minority class is significantly less than majority class, so we should use SMOTE to upsample the data
resampler = SMOTE(random_state=24, k_neighbors=3)

# Using imblearn pipeline to manage the scaling and resampling logic in a simple way
svm_pipe = Pipeline([('scaler', scaler),
                     ('upsampler', resampler),
                     ('svc', baseline_svm)])

svm_pipe.fit(X_train, y_train)

CPU times: total: 4min 27s
Wall time: 4min 29s




In [11]:
# All columns
preds = svm_pipe.predict(X_test)
print(metrics.classification_report(y_test, preds))
metrics.confusion_matrix(y_test, preds)

              precision    recall  f1-score   support

          -1       0.22      0.87      0.35     24121
           1       0.96      0.54      0.69    158459

    accuracy                           0.58    182580
   macro avg       0.59      0.70      0.52    182580
weighted avg       0.87      0.58      0.64    182580



array([[20995,  3126],
       [73677, 84782]], dtype=int64)

In [12]:
# Just old columns
svm_pipe.fit(X_train[orig_features], y_train)

Feature names unseen at fit time:
- avg_business_sentiment
- avg_user_sentiment
- previous_review_count
- total_business_reviews
- total_user_reviews



ValueError: X has 13 features, but StandardScaler is expecting 8 features as input.

In [13]:
preds = svm_pipe.predict(X_test[orig_features])
print(metrics.classification_report(y_test, preds))
metrics.confusion_matrix(y_test, preds)

              precision    recall  f1-score   support

          -1       0.18      0.67      0.28     24121
           1       0.91      0.53      0.67    158459

    accuracy                           0.55    182580
   macro avg       0.54      0.60      0.48    182580
weighted avg       0.82      0.55      0.62    182580



array([[16051,  8070],
       [74401, 84058]], dtype=int64)

The new columns added (avg_user_sentiment, avg_business_sentiment, total_user_reviews, total_business_reviews, previous_review_count) definitely appear to have some valuable information that helps in identifying the fake reviews. By including these features we improve both the precision and the recall. We still have an issue where we are over-identifying the true reviews as fake, but we are trending in the right direction. Let's try combining these with the TF-IDF representation and see if we can improve over the previous model.

In [None]:
sns.histplot()

In [15]:
from sklearn.compose import ColumnTransformer

tfidf_combined_svm = LinearSVC()
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data[orig_features + new_features + ['lemmatized_word_list']],
                                                    preprocessed_data[label_col].values,
                                                    test_size=.3,
                                                    random_state=24)

summary_pipeline = Pipeline([('scaler', StandardScaler())])
tfidf_pipeline = Pipeline([('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,1)))])
column_processor = ColumnTransformer([('summary', summary_pipeline, orig_features + new_features),
                                      ('tfidf', tfidf_pipeline, 'lemmatized_word_list')])


tfidf_combined_svm_pipe = Pipeline([('preprocessing', column_processor),
                                    ('upsampler', resampler),
                                    ('svc', tfidf_combined_svm)])


print(X_train.shape)
print(X_test.shape)
tfidf_combined_svm_pipe

(426018, 14)
(182580, 14)


In [16]:
%%time
tfidf_combined_svm_pipe.fit(X_train, y_train)

CPU times: total: 11min 48s
Wall time: 11min 56s




In [17]:
tfidf_combined_preds = tfidf_combined_svm_pipe.predict(X_test)

print(metrics.classification_report(y_test, tfidf_combined_preds))
metrics.confusion_matrix(y_test, tfidf_combined_preds)

              precision    recall  f1-score   support

          -1       0.27      0.60      0.38     24121
           1       0.93      0.76      0.83    158459

    accuracy                           0.74    182580
   macro avg       0.60      0.68      0.60    182580
weighted avg       0.84      0.74      0.77    182580



array([[ 14545,   9576],
       [ 38514, 119945]], dtype=int64)

In [20]:
%%time
# Trying to use an alternative to SMOTE
tfidf_combined_svm = LinearSVC(class_weight='balanced')
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data[orig_features + new_features + ['lemmatized_word_list']],
                                                    preprocessed_data[label_col].values,
                                                    test_size=.3,
                                                    random_state=24)

summary_pipeline = Pipeline([('scaler', StandardScaler())])
tfidf_pipeline = Pipeline([('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,1)))])
column_processor = ColumnTransformer([('summary', summary_pipeline, orig_features + new_features),
                                      ('tfidf', tfidf_pipeline, 'lemmatized_word_list')])


tfidf_combined_svm_pipe = Pipeline([('preprocessing', column_processor),
                                    ('svc', tfidf_combined_svm)])


print(X_train.shape)
print(X_test.shape)
tfidf_combined_svm_pipe.fit(X_train, y_train)

tfidf_combined_preds = tfidf_combined_svm_pipe.predict(X_test)

print(metrics.classification_report(y_test, tfidf_combined_preds))
metrics.confusion_matrix(y_test, tfidf_combined_preds)

(426018, 14)
(182580, 14)




              precision    recall  f1-score   support

          -1       0.27      0.64      0.38     24121
           1       0.93      0.74      0.82    158459

    accuracy                           0.72    182580
   macro avg       0.60      0.69      0.60    182580
weighted avg       0.84      0.72      0.76    182580

CPU times: total: 4min 37s
Wall time: 4min 40s


array([[ 15550,   8571],
       [ 41963, 116496]], dtype=int64)

In [28]:
punctuation = ['.', ',', '!', '?', ':', ';']


for p in punctuation:
    preprocessed_data[f'{p}_count'] = preprocessed_data.full_review.str.count(f'\\{p}')


In [31]:
punctuation_features = ['._count', ',_count', '!_count', '?_count', ':_count', ';_count']

In [32]:
%%time
# Trying to use an alternative to SMOTE with punctuation features
tfidf_combined_svm = LinearSVC(class_weight='balanced')
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data[orig_features + new_features + punctuation_features + ['lemmatized_word_list']],
                                                    preprocessed_data[label_col].values,
                                                    test_size=.3,
                                                    random_state=24)

summary_pipeline = Pipeline([('scaler', StandardScaler())])
tfidf_pipeline = Pipeline([('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,1)))])
column_processor = ColumnTransformer([('summary', summary_pipeline, orig_features + new_features),
                                      ('tfidf', tfidf_pipeline, 'lemmatized_word_list')])


tfidf_combined_svm_pipe = Pipeline([('preprocessing', column_processor),
                                    ('svc', tfidf_combined_svm)])


print(X_train.shape)
print(X_test.shape)
tfidf_combined_svm_pipe.fit(X_train, y_train)

tfidf_combined_preds = tfidf_combined_svm_pipe.predict(X_test)

print(metrics.classification_report(y_test, tfidf_combined_preds))
metrics.confusion_matrix(y_test, tfidf_combined_preds)

(426018, 20)
(182580, 20)




              precision    recall  f1-score   support

          -1       0.27      0.65      0.38     24121
           1       0.93      0.73      0.82    158459

    accuracy                           0.72    182580
   macro avg       0.60      0.69      0.60    182580
weighted avg       0.84      0.72      0.76    182580

CPU times: total: 4min 36s
Wall time: 4min 40s


array([[ 15579,   8542],
       [ 42147, 116312]], dtype=int64)

In [34]:
preprocessed_data.columns

Index(['user_id', 'product_id', 'date', 'full_review', 'rating', 'label',
       'word_list', 'lemmatized_word_list', 'num_words', 'num_verbs',
       'avg_word_len', 'emotiveness_ratio', 'num_positive_words',
       'num_negative_words', 'sentiment', 'previous_review_count',
       'avg_user_sentiment', 'total_user_reviews', 'avg_business_sentiment',
       'total_business_reviews', '._count', ',_count', '!_count', '?_count',
       ':_count', ';_count'],
      dtype='object')

In [35]:
preprocessed_data

Unnamed: 0,user_id,product_id,date,full_review,rating,label,word_list,lemmatized_word_list,num_words,num_verbs,...,avg_user_sentiment,total_user_reviews,avg_business_sentiment,total_business_reviews,._count,",_count",!_count,?_count,:_count,;_count
0,5044,0,2014-11-16,"Drinks were bad, the hot chocolate was watered...",1.0,-1,"['drinks', 'bad', 'hot', 'chocolate', 'watered...","['drink', 'bad', 'hot', 'chocolate', 'water', ...",17,2,...,-0.235294,1,0.123241,88,2,3,0,0,0,0
1,5045,0,2014-09-08,This was the worst experience I've ever had a ...,1.0,-1,"['worst', 'experience', 'ive', 'ever', 'casual...","['bad', 'experience', 'ive', 'ever', 'casual',...",118,21,...,-0.067797,1,0.123241,88,15,9,0,1,0,0
2,5046,0,2013-10-06,This is located on the site of the old Spruce ...,3.0,-1,"['located', 'site', 'old', 'spruce', 'st', 'vi...","['locate', 'site', 'old', 'spruce', 'st', 'vid...",24,3,...,0.062500,4,0.123241,88,5,1,0,0,0,0
3,5047,0,2014-11-30,I enjoyed coffee and breakfast twice at Toast ...,5.0,-1,"['enjoyed', 'coffee', 'breakfast', 'twice', 't...","['enjoy', 'coffee', 'breakfast', 'twice', 'toa...",129,15,...,0.124031,1,0.123241,88,15,16,2,0,0,0
4,5048,0,2014-08-28,I love Toast! The food choices are fantastic -...,5.0,-1,"['love', 'toast', 'food', 'choices', 'fantasti...","['love', 'toast', 'food', 'choice', 'fantastic...",79,11,...,0.151899,1,0.123241,88,6,2,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608593,259494,4877,2014-12-10,This is a new restaurant in Newark internation...,1.0,1,"['new', 'restaurant', 'newark', 'international...","['new', 'restaurant', 'newark', 'international...",42,8,...,0.142857,1,0.142857,1,8,0,3,0,0,1
608594,260401,4176,2011-06-07,"If you appreciate a good sub roll, Slack's has...",5.0,1,"['appreciate', 'good', 'sub', 'roll', 'slacks'...","['appreciate', 'good', 'sub', 'roll', 'slack',...",16,2,...,0.375000,1,0.277244,2,3,1,0,0,0,0
608595,260402,4176,2008-07-14,Best Philly Hoagies and Cheesesteaks on this s...,5.0,1,"['best', 'philly', 'hoagies', 'cheesesteaks', ...","['best', 'philly', 'hoagy', 'cheesesteaks', 's...",39,2,...,0.179487,1,0.277244,2,8,3,2,0,0,0
608596,265185,2984,2014-12-18,The food here is amazing! Authentic Haitian ...,5.0,1,"['food', 'amazing', 'authentic', 'haitian', 'c...","['food', 'amaze', 'authentic', 'haitian', 'cui...",30,3,...,0.233333,1,0.150758,2,7,4,8,0,0,0
