In [1]:
# Fetch the project repository; the cells below read their input datasets
# from the 'GeneratedReviewsDetection/Data/' directory it creates.
!git clone https://github.com/congruiyS2023/GeneratedReviewsDetection.git

Cloning into 'GeneratedReviewsDetection'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (93/93), done.[K
remote: Total 98 (delta 28), reused 8 (delta 0), pack-reused 0[K
Receiving objects: 100% (98/98), 26.82 MiB | 17.32 MiB/s, done.
Resolving deltas: 100% (28/28), done.


In [2]:
# Preprocess the English reviews: build a shuffled CSV of extracted dataset reviews (label 0) and generated reviews (label 1)

import pandas as pd
import gzip
import random
import json


def extract_reviews(input_file, num_reviews=500, min_overall_rating=4.0, max_overall_rating=5.0,  verified='TRUE'):
    """Collect the first matching reviews from a gzipped JSON-lines file.

    Each non-blank line of ``input_file`` is parsed as one JSON object. A
    review is kept when its ``overall`` rating (0.0 if absent) lies in
    ``[min_overall_rating, max_overall_rating]`` and its ``verified`` flag
    matches ``verified``.

    Args:
        input_file: Path to a gzip-compressed file with one JSON object per line.
        num_reviews: Stop reading once this many reviews have been collected.
        min_overall_rating: Inclusive lower bound on the ``overall`` rating.
        max_overall_rating: Inclusive upper bound on the ``overall`` rating.
        verified: Required verification status. Booleans and their string
            spellings (``'TRUE'``, ``'true'``, ``'FALSE'``, ...) are treated
            alike, so the string default also matches records whose flag is
            a JSON boolean.

    Returns:
        A list of the parsed review dicts, in file order. It may hold fewer
        than ``num_reviews`` entries if the file runs out of matches.
    """
    def _as_flag(value):
        # Map True/False and 'TRUE'/'FALSE'-style strings onto a plain bool so
        # that a string argument can be compared against a JSON boolean.
        if isinstance(value, str):
            return value.strip().lower() == 'true'
        return bool(value)

    wanted_flag = _as_flag(verified)
    reviews = []

    with gzip.open(input_file, 'rt', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing inside json.loads.
                continue
            review_data = json.loads(line)
            rating = review_data.get('overall', 0.0)
            if not (min_overall_rating <= rating <= max_overall_rating):
                continue
            # A record without a 'verified' key counts as unverified.
            if _as_flag(review_data.get('verified', False)) != wanted_flag:
                continue
            reviews.append(review_data)
            if len(reviews) == num_reviews:
                break

    return reviews

# Specify input files
input_file1 = 'GeneratedReviewsDetection/Data/English Reviews/Luxury_Beauty_5.json.gz'
input_file2 = 'GeneratedReviewsDetection/Data/English Reviews/Software_5.json.gz'
generated_reviews_file = 'GeneratedReviewsDetection/Data/English Reviews/generated_product_reviews.csv'

# One seed drives both the pandas sample and the shuffle, so a fresh
# "Restart & Run All" writes the same CSV every time.
SEED = 42
# Number of reviews requested per (product category, rating band) group.
REVIEWS_PER_GROUP = 500

# Extract high-rated (4.0-5.0) reviews. The verified flag is passed as a
# boolean, exactly like the low-rated calls below: relying on the string
# default 'TRUE' would compare a str against a bool (True == 'TRUE' is False).
# NOTE(review): assumes the dataset stores 'verified' as a JSON boolean - confirm.
luxury_beauty_reviews_1 = extract_reviews(input_file1, num_reviews=REVIEWS_PER_GROUP, verified=True)
software_reviews_1 = extract_reviews(input_file2, num_reviews=REVIEWS_PER_GROUP, verified=True)

# Extract low-rated (0.0-3.0) reviews so both rating bands are represented.
luxury_beauty_reviews_2 = extract_reviews(input_file1, num_reviews=REVIEWS_PER_GROUP, min_overall_rating=0.0, max_overall_rating=3.0,  verified=True)
software_reviews_2 = extract_reviews(input_file2, num_reviews=REVIEWS_PER_GROUP, min_overall_rating=0.0, max_overall_rating=3.0,  verified=True)
additional_reviews = pd.read_csv(generated_reviews_file).sample(n=2000, random_state=SEED)

# Surface short extractions instead of silently writing a skewed dataset.
for group_name, group in (
    ('luxury_beauty_reviews_1', luxury_beauty_reviews_1),
    ('luxury_beauty_reviews_2', luxury_beauty_reviews_2),
    ('software_reviews_1', software_reviews_1),
    ('software_reviews_2', software_reviews_2),
):
    if len(group) < REVIEWS_PER_GROUP:
        print(f"Warning: {group_name} holds {len(group)} reviews, expected {REVIEWS_PER_GROUP}")

# Combine all reviews: extracted dataset reviews get label 0, rows from the
# generated-reviews CSV get label 1.
all_reviews = luxury_beauty_reviews_1 + luxury_beauty_reviews_2 + software_reviews_1 + software_reviews_2
combined_reviews = [
    {'review': review.get('reviewText', ''), 'label': 0}
    for review in all_reviews
]
combined_reviews.extend(
    {'review': generated_text, 'label': 1}
    for generated_text in additional_reviews['Review']
)

# Shuffle the combined reviews with a dedicated, seeded generator so the row
# order is reproducible and the global `random` state is left untouched.
random.Random(SEED).shuffle(combined_reviews)

# Write the combined reviews to a CSV file
csv_file = 'labeled_english_reviews.csv'
pd.DataFrame(combined_reviews).to_csv(csv_file, index=False)

print(f"Shuffled reviews with additional reviews are written to {csv_file}")

Shuffled reviews with additional reviews are written to labeled_english_reviews.csv


In [4]:
# Preprocess the Chinese reviews: sample rows from the original-reviews CSV
# (label 0) and from the generated-reviews CSV (label 1), then write them
# out as one shuffled, labeled CSV.
file_path = 'GeneratedReviewsDetection/Data/Chinese Reviews/original_test_chinese_reviews.csv'
generated_chinese_reviews_file = 'GeneratedReviewsDetection/Data/Chinese Reviews/generatedReviews.csv'

# One seed drives both the pandas samples and the shuffle, so a fresh
# "Restart & Run All" writes the same CSV every time.
SEED = 42

all_reviews = pd.read_csv(file_path).sample(n=2000, random_state=SEED)
additional_reviews = pd.read_csv(generated_chinese_reviews_file).sample(n=2000, random_state=SEED)

# Build one record per review; both CSVs are read through their 'review' column.
combined_reviews = [
    {'review': original_text, 'label': 0}
    for original_text in all_reviews['review']
]
combined_reviews.extend(
    {'review': generated_text, 'label': 1}
    for generated_text in additional_reviews['review']
)

# Shuffle the combined reviews with a dedicated, seeded generator so the row
# order is reproducible and the global `random` state is left untouched.
random.Random(SEED).shuffle(combined_reviews)

# Write the combined reviews to a CSV file
csv_file = 'labeled_chinese_reviews.csv'
pd.DataFrame(combined_reviews).to_csv(csv_file, index=False)

print(f"Shuffled reviews with additional reviews are written to {csv_file}")

Shuffled reviews with additional reviews are written to labeled_chinese_reviews.csv
