In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw reviews table; the path is relative to the notebook's working directory.
dataset_path = "Data/fake reviews dataset.csv"
df = pd.read_csv(dataset_path)

In [6]:
# Clean up the category names: map each raw dataset label to a readable one.
old_cat = ['Home_and_Kitchen_5', 'Sports_and_Outdoors_5', 'Electronics_5',
       'Movies_and_TV_5', 'Tools_and_Home_Improvement_5',
       'Pet_Supplies_5', 'Kindle_Store_5', 'Books_5', 'Toys_and_Games_5',
       'Clothing_Shoes_and_Jewelry_5']
new_cat = ['Home and Kitchen', 'Sports and Outdoors', 'Electronics',
        'Movies and TV', 'Tools and Home Improvement',
        'Pet Supplies', 'Kindle Store', 'Books', 'Toys and Games',
        'Clothing Shoes and Jewelry']
# Assign the result back instead of calling an in-place method on df['category']:
# the chained in-place form operates on a temporary Series and does not update
# df once pandas Copy-on-Write is active.
df['category'] = df['category'].replace(dict(zip(old_cat, new_cat)))

# Keep one row per distinct review text (.copy() so the column assignment below
# writes to an independent frame), then treat the whitespace-and-dots
# placeholder text as missing and drop rows without review text.
df = df.drop_duplicates(subset='text_').copy()
df['text_'] = df['text_'].replace(".   .                   ", np.nan)
df = df.dropna(subset=['text_'])

# Remaining missing values per column (columns still carry their original names here)
print(df.isnull().sum())

# Rename columns so that they are more descriptive
colnames_dict = {"label": "type", "text_": "comments"}
df = df.rename(columns=colnames_dict)

# Map the type column to numeric/boolean values:
  # OR (Original reviews, presumably human-created and authentic) = 0
  # CG (Computer-generated fake reviews) = 1

type_dict_map = {'OR': 0 ,'CG':1}
# .map() turns any label missing from the dict into NaN without complaint, so
# fail loudly if the data contains a label this notebook does not know about.
unknown_labels = set(df['type'].dropna().unique()) - set(type_dict_map)
if unknown_labels:
    raise ValueError(f"Unexpected review labels: {sorted(unknown_labels)}")
df['type'] = df['type'].map(type_dict_map)
df.head()

category    0
rating      0
label       0
text_       0
dtype: int64


Unnamed: 0,category,rating,type,comments
0,Home and Kitchen,5.0,1,"Love this! Well made, sturdy, and very comfor..."
1,Home and Kitchen,5.0,1,"love it, a great upgrade from the original. I..."
2,Home and Kitchen,5.0,1,This pillow saved my back. I love the look and...
3,Home and Kitchen,1.0,1,"Missing information on how to use it, but it i..."
4,Home and Kitchen,5.0,1,Very nice set. Good quality. We have had the s...


In [7]:

# Predictors are the star rating and the review text; the target is the 0/1 review type.
feature_var = ['rating', 'comments']
target_var = 'type'
X, y = df[feature_var], df[target_var]

# Shuffled 70/30 split with a fixed seed so the partition is repeatable.
split_parts = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=101)
X_train, X_test = split_parts[0], split_parts[1]
# The targets are kept as (n, 1) column arrays.
y_train = split_parts[2].values.reshape(-1, 1)
y_test = split_parts[3].values.reshape(-1, 1)

for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(28287, 2)
(12124, 2)
(28287, 1)
(12124, 1)


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB

# Define the pipeline: TF-IDF features from the review text, then Gaussian Naive Bayes.
# The two steps are driven individually below (rather than through pipeline.fit)
# because the classifier is fed dense arrays while the vectorizer emits a sparse matrix.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # TF-IDF vectorizer
    ('classifier', GaussianNB())    # Gaussian Naive Bayes classifier
])
tfidf_step = pipeline.named_steps['tfidf']
nb_step = pipeline.named_steps['classifier']

# Use 1-D label arrays for both fitting and scoring.
y_train_flat = y_train.ravel()
y_test_flat = y_test.ravel()

# Fit on the training comments only. Densify the training matrix ONCE and reuse
# it for both fit and score: the dense copy is large (documents x vocabulary).
X_train_tfidf = tfidf_step.fit_transform(X_train['comments'])
X_train_dense = X_train_tfidf.toarray()
nb_step.fit(X_train_dense, y_train_flat)
train_accuracy = nb_step.score(X_train_dense, y_train_flat)
del X_train_dense  # release the dense copy before the test matrix is densified

# Evaluate on the held-out comments using the vocabulary learned from training.
X_test_tfidf = tfidf_step.transform(X_test['comments'])
test_accuracy = nb_step.score(X_test_tfidf.toarray(), y_test_flat)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)


Training Accuracy: 0.8698341994555803
Testing Accuracy: 0.7523094688221709


In [14]:
# Ask for user input and validate it before touching the model.
rating_text = input("Enter the rating (1-5): ")
try:
    rating = int(rating_text)
except ValueError:
    raise ValueError(
        f"Rating must be a whole number from 1 to 5, got {rating_text!r}"
    ) from None
if not 1 <= rating <= 5:
    raise ValueError(f"Rating must be between 1 and 5, got {rating}")

comment = input("Enter the comment: ")
if not comment.strip():
    # An empty comment would be scored with no text to go on.
    raise ValueError("Comment must not be empty")

# Create a DataFrame with the user input.
# Only the comment text is passed to the model below; the rating is recorded
# here but is not one of the classifier's features.
user_data = pd.DataFrame({'rating': [rating],
                          'comments': [comment]})

# Transform the user input using the already-fitted TF-IDF vectorizer
user_data_tfidf = pipeline.named_steps['tfidf'].transform(user_data['comments'])

# Predict the output using the trained classifier (it is given a dense array)
prediction = pipeline.named_steps['classifier'].predict(user_data_tfidf.toarray())

# Print the prediction (0 = original review, anything else = computer-generated)
if prediction[0] == 0:
    print("Original review (OR)")
else:
    print("Computer-generated fake review (CG)")


Original review (OR)


<br>
<br>
<br>
<br>
<br>
Below is backup code that did not really work.


In [None]:
# Build a TF-IDF representation of the comments.
# The vectorizer is fitted ONCE over the whole column so every row shares the
# same vocabulary and IDF weights; fitting a new vectorizer per row gives each
# comment its own one-document vocabulary, so the vectors are not comparable.
df['comments'] = df['comments'].astype(str)

vectorizer = TfidfVectorizer()
# The text column is left untouched; the features live in a separate sparse
# matrix whose i-th row corresponds to the i-th row of df.
# NOTE(review): this fits on every row of df. When the features feed a model
# that is evaluated on held-out data, fit on the training split only.
comments_tfidf = vectorizer.fit_transform(df['comments'])

assert comments_tfidf.shape[0] == len(df), "expected one TF-IDF row per comment"
comments_tfidf.shape


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer


def _to_dense(matrix):
    """Return the dense array form of a sparse document-term matrix."""
    return matrix.toarray()


# The classifier cannot be fitted on the raw X_train frame because its
# 'comments' column holds text. Vectorise the comments first, then densify,
# then classify, all inside a single pipeline so fit/predict take raw text.
# Only the comment text is used here; the rating column is not a feature.
model = make_pipeline(
    CountVectorizer(),
    FunctionTransformer(_to_dense, accept_sparse=True),
    GaussianNB(),
)

train_text = X_train['comments']
test_text = X_test['comments']
# Flatten the labels to 1-D, the shape scikit-learn estimators expect for y.
train_labels = np.ravel(y_train)
test_labels = np.ravel(y_test)

# Train the classifier
model.fit(train_text, train_labels)

# Predictions on the training set
train_preds = model.predict(train_text)

# Predictions on the testing set
test_preds = model.predict(test_text)

# Evaluate the model
train_accuracy = accuracy_score(train_labels, train_preds)
test_accuracy = accuracy_score(test_labels, test_preds)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)
