In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics


from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [2]:
# Load the labelled training features; the file is tab-separated despite the .csv extension.
df = pd.read_csv(
    '../../data/yelp_training/yelp_zip_processed_features.csv',
    sep='\t',
)

In [3]:
# Display the loaded frame to eyeball its columns and a few rows.
df

Unnamed: 0,user_id,business_id,rating,num_word,num_noun,num_verb,num_adj,num_adv,num_personal_pronoun,avg_word_len,...,user_content_similarity,positive_reviews,negative_reviews,positive_review_ratio,negative_review_ratio,avg_business_sentiment,avg_business_rating,total_business_reviews,lemma,label
0,5044,0,1.0,35,9,2,5,2,1,5.000000,...,0.000000e+00,0.0,1.0,0.00,1.0,0.141963,3.613636,88,drink bad hot chocolate water latte burn taste...,-1
1,5045,0,1.0,241,54,31,16,15,7,5.561905,...,0.000000e+00,0.0,1.0,0.00,1.0,0.141963,3.613636,88,bad experience casual coffee light fare place ...,-1
2,5046,0,3.0,49,11,3,9,2,1,5.130435,...,4.074681e-02,1.0,0.0,0.25,0.0,0.141963,3.613636,88,locate site old spruce st video mild cofee goo...,-1
3,5047,0,5.0,216,56,33,19,14,5,5.941176,...,4.440892e-16,1.0,0.0,1.00,0.0,0.141963,3.613636,88,enjoy coffee breakfast twice toast recent visi...,-1
4,5048,0,5.0,146,31,23,15,9,8,5.507246,...,0.000000e+00,1.0,0.0,1.00,0.0,0.141963,3.613636,88,love toast food choice fantastic love serve br...,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608593,259494,4877,1.0,84,15,13,6,3,9,5.162162,...,2.220446e-16,0.0,1.0,0.00,1.0,0.162162,1.000000,1,new restaurant newark international airport ga...,1
608594,260401,4176,5.0,28,7,4,4,2,1,5.400000,...,2.220446e-16,1.0,0.0,1.00,0.0,0.204762,5.000000,2,appreciate good sub roll slack bestreet good s...,1
608595,260402,4176,5.0,66,14,8,9,2,3,5.514286,...,2.220446e-16,1.0,0.0,1.00,0.0,0.204762,5.000000,2,good philly hoagie cheesesteak delaware river ...,1
608596,265185,2984,5.0,47,14,6,8,4,3,6.392857,...,0.000000e+00,1.0,0.0,1.00,0.0,0.160714,5.000000,2,food amazing authentic haitian cuisine absolut...,1


In [4]:
# Column groups used to select model inputs from the review DataFrame.
# (The per-group comments below are inferred from the column names only.)
numeric_feature_columns = [
    # star rating plus word / part-of-speech counts
    'rating', 'num_word', 'num_noun', 'num_verb', 'num_adj', 'num_adv',
    'num_personal_pronoun', 'avg_word_len',
    # text-derived scores
    'lexical_diversity', 'sentiment', 'typo_ratio', 'emotiveness_ratio',
    'num_positive_words', 'num_negative_words', 'num_clauses',
    # user-level aggregates
    'previous_user_reviews', 'avg_user_sentiment', 'avg_user_rating',
    'total_user_reviews', 'user_content_similarity',
    # positive / negative review counts and ratios
    'positive_reviews', 'negative_reviews', 'positive_review_ratio',
    'negative_review_ratio',
    # business-level aggregates
    'avg_business_sentiment', 'avg_business_rating', 'total_business_reviews',
]

# The single free-text column.
text_feature_columns = ['lemma']

# Identifier columns; kept separate from the numeric and text feature lists.
id_columns = ['user_id', 'business_id']

In [10]:
# Does any row lack lemmatised text?
df['lemma'].isna().any()

True

In [15]:
# Build the baseline model: scaled numeric features + TF-IDF of the lemmatised text,
# SMOTE upsampling of the training data, then a linear SVM.

# One seed shared by every stochastic step in this cell, so a re-run rebuilds the same model.
RANDOM_STATE = 24

# Rows without lemmatised text are excluded before modelling.
null_lemma = df['lemma'].isna()
filtered_df = df[~null_lemma]

# LinearSVC's solver can shuffle samples (dual coordinate descent), so it is seeded like the
# splitter and the resampler; previously it was the only unseeded component in this cell.
tfidf_combined_svm = LinearSVC(max_iter=2000, random_state=RANDOM_STATE)

# 70/30 hold-out split over the numeric + text feature columns.
X_train, X_test, y_train, y_test = train_test_split(
    filtered_df[numeric_feature_columns + text_feature_columns],
    filtered_df['label'].values,
    test_size=.3,
    random_state=RANDOM_STATE,
)

resampler = SMOTE(random_state=RANDOM_STATE, k_neighbors=3)

# Per-column-group preprocessing: standardise the numeric block, TF-IDF (unigrams) the text.
summary_pipeline = Pipeline([('scaler', StandardScaler())])
tfidf_pipeline = Pipeline([('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1, 1)))])
column_processor = ColumnTransformer([
    ('summary', summary_pipeline, numeric_feature_columns),
    # 'lemma' is passed as a scalar string (not a list) so the vectorizer receives a
    # 1-D column of documents rather than a 2-D frame.
    ('tfidf', tfidf_pipeline, 'lemma'),
])

# imblearn's Pipeline is used (rather than sklearn's) because it accepts a sampler step.
tfidf_combined_svm_pipe = Pipeline([
    ('preprocessing', column_processor),
    ('upsampler', resampler),
    ('svc', tfidf_combined_svm),
])

print(X_train.shape)
print(X_test.shape)
tfidf_combined_svm_pipe

(425924, 28)
(182539, 28)


In [12]:
%%time
# Fit the whole pipeline (preprocessing, upsampler, svc) on the training split only.
tfidf_combined_svm_pipe.fit(X_train, y_train)

CPU times: total: 13min 7s
Wall time: 13min 10s




In [13]:
%%time
tfidf_combined_preds = tfidf_combined_svm_pipe.predict(X_test)

print(metrics.classification_report(y_test, tfidf_combined_preds))
metrics.confusion_matrix(y_test, tfidf_combined_preds)

              precision    recall  f1-score   support

          -1       0.28      0.63      0.38     24077
           1       0.93      0.75      0.83    158462

    accuracy                           0.73    182539
   macro avg       0.60      0.69      0.61    182539
weighted avg       0.84      0.73      0.77    182539

CPU times: total: 7.77 s
Wall time: 7.79 s


array([[ 15148,   8929],
       [ 39627, 118835]], dtype=int64)

In [17]:
%%time
# Retraining the model with all of our training data, and then saving the model
tfidf_combined_svm_pipe.fit(filtered_df[numeric_feature_columns + text_feature_columns], filtered_df['label'].values)

CPU times: total: 31min 51s
Wall time: 32min 6s




In [2]:
import joblib

In [19]:
# Serialise the fitted pipeline next to the notebook; joblib returns the written file name(s).
joblib.dump(
    tfidf_combined_svm_pipe,
    'svm_baseline.pkl',
)

['svm_baseline.pkl']

In [4]:
# Load the full (unlabelled) review set; note that `df` now refers to this tab-separated
# file rather than the training features loaded earlier.
df = pd.read_csv(
    '../../data/yelp_all/filtered_preprocessed_dataset.tsv',
    sep='\t',
)

# Restore the saved pipeline. joblib.load unpickles arbitrary objects, so only load
# files produced by this project.
model = joblib.load('svm_baseline.pkl')

In [5]:
# Column groups used to select model inputs for prediction; the list mirrors the
# columns the pipeline was fitted on, in the same order.
# (The per-group comments below are inferred from the column names only.)
numeric_feature_columns = [
    # star rating plus word / part-of-speech counts
    'rating', 'num_word', 'num_noun', 'num_verb', 'num_adj', 'num_adv',
    'num_personal_pronoun', 'avg_word_len',
    # text-derived scores
    'lexical_diversity', 'sentiment', 'typo_ratio', 'emotiveness_ratio',
    'num_positive_words', 'num_negative_words', 'num_clauses',
    # user-level aggregates
    'previous_user_reviews', 'avg_user_sentiment', 'avg_user_rating',
    'total_user_reviews', 'user_content_similarity',
    # positive / negative review counts and ratios
    'positive_reviews', 'negative_reviews', 'positive_review_ratio',
    'negative_review_ratio',
    # business-level aggregates
    'avg_business_sentiment', 'avg_business_rating', 'total_business_reviews',
]

# The single free-text column.
text_feature_columns = ['lemma']

# Identifier columns; kept separate from the numeric and text feature lists.
id_columns = ['user_id', 'business_id']

In [9]:
# Rows with a missing 'lemma' are excluded; every remaining row is scored by the loaded model.
null_lemma = df['lemma'].isnull()
filtered_df = df.loc[~null_lemma]

feature_columns = numeric_feature_columns + text_feature_columns
all_predictions = model.predict(filtered_df[feature_columns])

In [12]:
# Pair each scored review's id with its predicted label (same row filter as used for prediction).
predicted_reviews = pd.DataFrame({
    'review_id': df.loc[~null_lemma, 'review_id'],
    'review_label': all_predictions,
})
predicted_reviews

Unnamed: 0,review_id,review_label
0,KU_O5udG6zpxOg-VcAEodg,1
1,jHmqmoEI-78BGHFJaDKlhQ,1
2,vwIXZHod-jQmGFvx0wCqSg,1
3,SP32nOhRm-KRAjYMPgf_MQ,1
4,fvu5n5shkAJDbQjulKNuqw,1
...,...,...
6990275,HjdD3aJDTJh11JcUh3HbuQ,-1
6990276,fazoFJC-mT7LhEQ47M2dfw,-1
6990277,SxburYok8OeDoHFRFoDcyA,-1
6990278,Flk8ITfq6Qhtb0JdPzE3Sg,-1


In [17]:
# Write the id/label pairs as a comma-separated file without the DataFrame index.
predicted_reviews.to_csv(
    '../../data/yelp_all/reviews_with_predicted_label_baselinesvm.csv',
    sep=',',
    index=False,
)

In [14]:
# Share of scored reviews assigned to each predicted label.
predicted_reviews['review_label'].value_counts() / len(all_predictions)

 1    0.707642
-1    0.292358
Name: review_label, dtype: float64

The model predicts that roughly 30% of the reviews are fake. This is in line with our test results, so the model is behaving consistently, which is a good diagnostic check. However, the true fake review ratio is only around 12% in our training set. It is likely, though unverifiable, that the true fake review rate on this full academic dataset is also lower than what the model predicts, so this over-prediction of fake reviews is an important problem that we can continue to work on in our model.