In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("Data/fake reviews dataset.csv")

In [3]:
df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [4]:
df.shape

(40432, 4)

In [5]:
df['category'].unique()

array(['Home_and_Kitchen_5', 'Sports_and_Outdoors_5', 'Electronics_5',
       'Movies_and_TV_5', 'Tools_and_Home_Improvement_5',
       'Pet_Supplies_5', 'Kindle_Store_5', 'Books_5', 'Toys_and_Games_5',
       'Clothing_Shoes_and_Jewelry_5'], dtype=object)

In [6]:
# Clean up the category names
old_cat = ['Home_and_Kitchen_5', 'Sports_and_Outdoors_5', 'Electronics_5',
       'Movies_and_TV_5', 'Tools_and_Home_Improvement_5',
       'Pet_Supplies_5', 'Kindle_Store_5', 'Books_5', 'Toys_and_Games_5',
       'Clothing_Shoes_and_Jewelry_5']
new_cat = ['Home and Kitchen', 'Sports and Outdoors', 'Electronics',
        'Movies and TV', 'Tools and Home Improvement',
        'Pet Supplies', 'Kindle Store', 'Books', 'Toys and Games',
        'Clothing Shoes and Jewelry']
df['category'].replace(to_replace=old_cat, value=new_cat, inplace=True)

In [7]:
df['label'].unique()

array(['CG', 'OR'], dtype=object)

In [8]:
# Remove any duplicate rows, then check for NA values for each column
df = df.drop_duplicates(subset='text_')
df['text_'].replace(".   .                   ", np.nan, inplace=True)
df.dropna(subset=['text_'], inplace=True)

print(df.isnull().sum())

category    0
rating      0
label       0
text_       0
dtype: int64


In [9]:
print(df.iloc[37895])


category                           Clothing Shoes and Jewelry
rating                                                    3.0
label                                                      CG
text_       I have a regular boyfriend and he likes the lo...
Name: 37915, dtype: object


In [10]:
# Rename columns so that they are more descriptive
colnames_dict = {"label": "type", "text_": "comments"}
df.rename(columns=colnames_dict, inplace=True)

df.head()

Unnamed: 0,category,rating,type,comments
0,Home and Kitchen,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home and Kitchen,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home and Kitchen,5.0,CG,This pillow saved my back. I love the look and...
3,Home and Kitchen,1.0,CG,"Missing information on how to use it, but it i..."
4,Home and Kitchen,5.0,CG,Very nice set. Good quality. We have had the s...


In [11]:
# Distribution of Type column
df['type'].value_counts()

OR    20215
CG    20196
Name: type, dtype: int64

In [12]:
# Distribution of Category column
df['category'].value_counts()

Kindle Store                  4727
Books                         4369
Pet Supplies                  4251
Home and Kitchen              4055
Electronics                   3988
Sports and Outdoors           3943
Tools and Home Improvement    3858
Clothing Shoes and Jewelry    3844
Toys and Games                3791
Movies and TV                 3585
Name: category, dtype: int64

In [13]:
# Map the type column to numeric/boolean values:
  # OR (Original reviews, presumably human-created and authentic) = 0
  # CG (Computer-generated fake reviews) = 1

type_dict_map = {'OR': 0 ,'CG':1}
df['type'] = df['type'].map(type_dict_map)

In [14]:
df_dummies = pd.get_dummies(df['category'])

df = pd.concat([df, df_dummies], axis=1)
print(df.columns)

Index(['category', 'rating', 'type', 'comments', 'Books',
       'Clothing Shoes and Jewelry', 'Electronics', 'Home and Kitchen',
       'Kindle Store', 'Movies and TV', 'Pet Supplies', 'Sports and Outdoors',
       'Tools and Home Improvement', 'Toys and Games'],
      dtype='object')


In [15]:
df.head()
print(df.iloc[37900])


category                                             Clothing Shoes and Jewelry
rating                                                                      1.0
type                                                                          1
comments                      I paid for this one and it arrived on time. I ...
Books                                                                         0
Clothing Shoes and Jewelry                                                    1
Electronics                                                                   0
Home and Kitchen                                                              0
Kindle Store                                                                  0
Movies and TV                                                                 0
Pet Supplies                                                                  0
Sports and Outdoors                                                           0
Tools and Home Improvement              

In [16]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

#Transform
df['comments'] = df['comments'].astype(str)

vectorizer = TfidfVectorizer()
# print(df['comments'])
for index, row in df.iterrows():
    # Transform the comment into a TF-IDF array
    comment = row['comments']
    # Check if the comment is empty or contains only stop words
    

    # Transform the comment into a TF-IDF array
    comment_tfidf = vectorizer.fit_transform([comment]).toarray()
    # Replace the original comment with its TF-IDF array
    df.at[index, 'comments'] = comment_tfidf

# response = vectorizer.fit_transform(df['comments'])
# df['comments'] = response.getnnz()


In [17]:
print(df['comments'])

0        [[0.25, 0.25, 0.25, 0.5, 0.25, 0.25, 0.25, 0.2...
1        [[0.2672612419124244, 0.2672612419124244, 0.26...
2        [[0.24253562503633297, 0.24253562503633297, 0....
3        [[0.24253562503633297, 0.24253562503633297, 0....
4        [[0.21320071635561041, 0.21320071635561041, 0....
                               ...                        
40427    [[0.05968155326844538, 0.02984077663422269, 0....
40428    [[0.028513297425610054, 0.028513297425610054, ...
40429    [[0.03140371465106639, 0.03140371465106639, 0....
40430    [[0.026046613053531162, 0.026046613053531162, ...
40431    [[0.025691749776935398, 0.025691749776935398, ...
Name: comments, Length: 40411, dtype: object


In [None]:
# Iterate over the DataFrame
for index, row in df.iterrows():
    # Check the type of the comments
    if isinstance(row['comments'], str):
        # Print the index and the row with untransformed comments
        print(f"First row with untransformed comments found at index {index}")
        print(row)
        break


In [31]:

target_var  = 'type'
feature_var = [ 'rating', 'comments', 'Books',
       'Clothing Shoes and Jewelry', 'Electronics', 'Home and Kitchen',
       'Kindle Store', 'Movies and TV', 'Pet Supplies', 'Sports and Outdoors',
       'Tools and Home Improvement', 'Toys and Games']
X = df[feature_var]
y = df[target_var]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=101)


In [32]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(28287, 12)
(12124, 12)
(28287,)
(12124,)


In [29]:
print(X_train.head(1))

                                               comments
7569  [[0.05590169943749474, 0.05590169943749474, 0....


In [33]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()
print("error1")
# Train the classifier
nb_classifier.fit(X_train, y_train)
print("error2")

# Predictions on the training set
train_preds = nb_classifier.predict(X_train)
print("error3")

# Predictions on the testing set
test_preds = nb_classifier.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, train_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)


error1


ValueError: setting an array element with a sequence.

In [None]:
# Sample
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]
x_vectorizer = TfidfVectorizer()
x_transformed = x_vectorizer.fit_transform(corpus)
print(x_vectorizer.vocabulary_)


In [None]:
all_feature_names = x_vectorizer.get_feature_names_out()
index = x_vectorizer.vocabulary_.get('thor') # Get index of the word 'thor'
print(index) 
x_vectorizer.idf_[index] # Get IDF score for the word 'thor'

In [None]:
for word in all_feature_names:
    index = x_vectorizer.vocabulary_.get(word)
    print(f"Word: {word}, IDF: {x_vectorizer.idf_[index]}")

In [None]:
x_transformed.toarray()[:2]

In [None]:
# Raina Update

## Transform the data into a matrix of TF-IDF features
vectorizer2 = TfidfVectorizer(max_features=5000)
x_transformed_output = vectorizer2.fit_transform(df['comments'])

## Print the transformed output (output => 'feature name': index)
print(vectorizer2.vocabulary_)

In [None]:
#Chris Update
vectorizer = TfidfVectorizer(max_features=5000)
response = vectorizer.fit_transform(df['comments'])
print(len(vectorizer.vocabulary_))
print(vectorizer.vocabulary_)
print(response)

In [None]:
feature_var = ['category', 'rating']
sns.pairplot(df[feature_var])