In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [50]:
"""
Load the data and set up a 70/30 train/test split
~91k train reviews, ~39k test reviews
"""
# Read the review table and discard any row that has a missing value.
reviews = pd.read_csv("Data/classifier_data.csv").dropna()

# Fixed seed keeps the 70/30 partition identical from run to run.
train, test = train_test_split(reviews, random_state=12, test_size=0.30)

# Features are the 'Review' column, targets the 'Real' column.
train_x, train_y = train['Review'], train['Real']
test_x, test_y = test['Review'], test['Real']

In [55]:
"""
Dummy classifier used as a baseline for comparison.
Its accuracy is about 50%, which is expected because roughly half of the data is fake reviews.
"""
from sklearn.dummy import DummyClassifier

# strategy="stratified" predicts by drawing labels at random according to the
# training class distribution, so the score varies between runs unless the
# seed is pinned. Reuse the seed from the train/test split for reproducibility.
baseline = DummyClassifier(strategy="stratified", random_state=12)
baseline.fit(train_x, train_y)

# Baseline accuracy in percent: share of test rows whose drawn label matches the truth.
np.mean(baseline.predict(test_x) == test_y)*100

49.67675731144177

In [57]:
"""
Vectorize the reviews so that they can be fed to a classifier.
TF-IDF weighting reduces the impact of words that are common across reviews.
"""
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Three chained stages: raw token counts -> TF-IDF re-weighting -> multinomial Naive Bayes.
pipe = Pipeline(steps=[
    ('count_vect', CountVectorizer()),
    ('tfidf_transformer', TfidfTransformer()),
    ('nbClassifier', MultinomialNB()),
])

# Train every stage on the training reviews; fit() hands back the pipeline,
# so keeping it as the cell's last expression displays the fitted configuration.
pipe.fit(train_x, train_y)

Pipeline(memory=None,
         steps=[('count_vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf_transformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('nbClassifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [64]:
# Labels predicted by the fitted pipeline for the held-out test reviews.
preds = pipe.predict(test_x)

In [65]:
# Per-class precision / recall / F1 of the predictions against the true test labels.
# print() is used so the report's line breaks render as a table.
print(classification_report(test_y, preds))


              precision    recall  f1-score   support

           0       0.64      0.51      0.57     19420
           1       0.60      0.71      0.65     19560

    accuracy                           0.61     38980
   macro avg       0.62      0.61      0.61     38980
weighted avg       0.62      0.61      0.61     38980



In [67]:
"""
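TODO(review): this cell is an empty placeholder. Its recorded output is a
ModuleNotFoundError for 'keras', so the code that was meant to live here
(presumably a Keras-based model) is missing. Restore it, with keras installed,
or delete the cell.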
"""


ModuleNotFoundError: No module named 'keras'

Unnamed: 0,Review,Real
3633,Charged me ridiculous charge too much for the ...,0
42146,Wow So delicious the perfect omelette and the ...,0
126599,Really like the place and the food be great Th...,1
52504,Best burger have ever have Quality come out to...,0
69374,Food be good but place be dirty do not go in t...,1
...,...,...
40315,I like bubble tea but I do not like their smoo...,0
19786,Tropical Avenue have be a bite disappoint The ...,0
124584,Why be you close today have Tuesday and have I...,1
104358,Definitely one of the well all you can eat sus...,1
