In [None]:
import json
import os

# Location and name of the raw product-metadata JSONL file.
folder = "."
file_name = "product_meta_data_products_meta_Movies_and_TV.jsonl"
file_path = os.path.join(folder, file_name)

# The cleaned copy is written into the same folder, with a "preprocessed_" prefix.
preprocessed_file_name = "preprocessed_" + file_name
preprocessed_file_path = os.path.join(folder, preprocessed_file_name)

def convert_to_string_if_json(value):
    """Re-serialise a JSON container string; pass every other input through.

    Args:
        value: Any object.

    Returns:
        ``json.dumps`` of the decoded value when ``value`` is a string that
        parses as a JSON list or object. Otherwise ``value`` itself,
        unchanged: non-strings, strings that are not valid JSON, and strings
        that hold a JSON scalar (number, string, boolean, null).
    """
    if not isinstance(value, str):
        return value

    try:
        decoded = json.loads(value)
    except json.JSONDecodeError:
        return value

    # Only containers are rewritten; scalar JSON text keeps its original form.
    return json.dumps(decoded) if isinstance(decoded, (list, dict)) else value

def is_valid_float(value):
    """Return True when ``value`` converts to a finite float.

    ``float()`` also accepts the spellings "nan", "inf" and "infinity", and
    maps overflowing literals such as "1e400" to infinity. Those are rejected
    here because ``json.dumps`` writes them as ``NaN`` / ``Infinity``, which
    are not valid JSON tokens.

    Args:
        value: Any object; typically the raw price text.

    Returns:
        bool: True only if ``float(value)`` succeeds and the result is neither
        NaN nor infinite.
    """
    try:
        number = float(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: an int too large to be represented as a float.
        return False
    # NaN fails every comparison; +/-inf fails one side of the chain.
    return float("-inf") < number < float("inf")

def _stringify_field(value):
    """Flatten one decoded JSON value into the form written to the output file."""
    if value is None:
        return None
    if isinstance(value, dict):
        # json.dumps rather than str(): str() yields Python repr (single
        # quotes, True/None) that cannot be parsed back as JSON.
        return json.dumps(value)
    if isinstance(value, list):
        if any(isinstance(item, (dict, list)) for item in value):
            # Lists holding nested containers are kept as one JSON array string.
            return json.dumps(value)
        return ", ".join(map(str, value))
    return str(value)


def preprocess_jsonl_file(file_path, preprocessed_file_path):
    """Rewrite a JSONL file with every field flattened to a string.

    For each input line that decodes to a JSON object:

    * lists of scalars are joined with ", "; dicts and lists that contain
      dicts/lists are JSON-encoded; other non-null values go through ``str``;
      nulls stay null;
    * ``price`` becomes a float when ``is_valid_float`` accepts its text and
      None otherwise (the key is added when missing);
    * ``images`` and ``details`` are passed through
      ``convert_to_string_if_json`` (missing keys default to "").

    Blank lines are ignored. Lines that are not valid JSON, or that decode to
    something other than an object, are reported and skipped.

    Args:
        file_path: Path of the JSONL file to read.
        preprocessed_file_path: Path of the JSONL file to write; its parent
            directory is created if necessary.

    Returns:
        None. Progress and errors are printed. A missing input file and any
        other exception raised while reading or writing are reported rather
        than propagated.
    """
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    output_dir = os.path.dirname(preprocessed_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    try:
        with open(file_path, 'r', encoding='utf-8') as input_file, open(preprocessed_file_path, 'w', encoding='utf-8') as output_file:
            for line in input_file:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) are not records.
                    continue

                try:
                    json_obj = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Skipped a malformed line: {line}")
                    continue

                if not isinstance(json_obj, dict):
                    # Valid JSON but not an object: skip this line instead of
                    # letting a TypeError abort the whole file.
                    print(f"Skipped a malformed line: {line}")
                    continue

                json_obj = {key: _stringify_field(value) for key, value in json_obj.items()}

                # After flattening, price is either None or a string.
                price = json_obj.get("price")
                if price is not None and is_valid_float(price):
                    json_obj["price"] = float(price)
                else:
                    json_obj["price"] = None

                json_obj["images"] = convert_to_string_if_json(json_obj.get("images", ""))
                json_obj["details"] = convert_to_string_if_json(json_obj.get("details", ""))

                output_file.write(json.dumps(json_obj) + "\n")

        print(f"Preprocessing completed. Preprocessed file saved as: {preprocessed_file_path}")

    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Run the metadata clean-up on the configured input/output paths.
preprocess_jsonl_file(file_path=file_path, preprocessed_file_path=preprocessed_file_path)


Preprocessing completed. Preprocessed file saved as: .\preprocessed_product_meta_data_products_meta_Movies_and_TV.jsonl


# REVIEWS

In [None]:
import os
import json

# Directory scanned for raw "initial-loads_*.jsonl" review files.
# Set the REVIEWS_DATA_DIR environment variable to use a different location;
# when it is unset or empty, the original author's local path is used, so
# existing runs behave exactly as before.
folder = os.environ.get("REVIEWS_DATA_DIR") or r"C:\Users\vingu\OneDrive - The University of Texas at Austin\UT Documents\CS329E-Product-Project\AmazonProductReviews\project3\local_data"

def convert_to_list_if_json(value):
    """Coerce ``value`` into a list.

    Args:
        value: Any object.

    Returns:
        list:
            * a list is returned as-is (same object);
            * a string holding JSON yields the decoded list, or the decoded
              value wrapped in a one-element list when it is not a list;
            * a string that is not JSON yields ``[stripped text]``, or ``[]``
              when it is blank;
            * anything else yields ``[]``.
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return []

    try:
        decoded = json.loads(value)
    except json.JSONDecodeError:
        text = value.strip()
        return [text] if text else []

    return decoded if isinstance(decoded, list) else [decoded]

def fix_images_field(images_value):
    """Normalise the ``images`` field into a list of strings.

    Args:
        images_value: The raw field value: a list, a dict, a string
            (optionally holding JSON), or anything else.

    Returns:
        list[str]:
            * dict (given directly or decoded from a JSON string): a
              one-element list holding its JSON encoding;
            * list (given directly or decoded from a JSON string): one string
              per item. Nested dicts/lists are JSON-encoded, other items go
              through ``str``;
            * string holding a JSON scalar: ``[original string]``;
            * string that is not JSON: ``[stripped text]``, or ``[]`` if blank;
            * anything else: ``[]``.
    """
    def _as_text(item):
        # Nested containers are JSON-encoded so they match how a top-level
        # dict is handled; str() would emit Python repr (single quotes,
        # True/None) that JSON consumers cannot parse.
        if isinstance(item, (dict, list)):
            return json.dumps(item)
        return str(item)

    if isinstance(images_value, str):
        try:
            parsed_value = json.loads(images_value)
        except json.JSONDecodeError:
            stripped = images_value.strip()
            return [stripped] if stripped else []
        if not isinstance(parsed_value, (dict, list)):
            # JSON scalar: keep the original text untouched.
            return [images_value]
        images_value = parsed_value

    if isinstance(images_value, dict):
        return [json.dumps(images_value)]
    if isinstance(images_value, list):
        return [_as_text(item) for item in images_value]
    return []

def convert_to_string_if_json(value):
    """Return ``value`` as a string, JSON-encoding dicts and lists.

    Args:
        value: Any object.

    Returns:
        str: ``json.dumps(value)`` for a dict or list, so the text can be
        parsed back as JSON; the string itself for a string (whether or not
        it holds JSON); ``str(value)`` for everything else. Note that None
        therefore becomes the text "None".
    """
    if isinstance(value, (dict, list)):
        # Lists are encoded the same way as dicts; str() would produce Python
        # repr rather than JSON.
        return json.dumps(value)
    if isinstance(value, str):
        return value
    return str(value)

def is_valid_float(value):
    """Return True when ``value`` converts to a finite float.

    ``float()`` also accepts the spellings "nan", "inf" and "infinity", and
    maps overflowing literals such as "1e400" to infinity. Those are rejected
    here because ``json.dump`` writes them as ``NaN`` / ``Infinity``, which
    are not valid JSON tokens.

    Args:
        value: Any object; typically the raw price text.

    Returns:
        bool: True only if ``float(value)`` succeeds and the result is neither
        NaN nor infinite.
    """
    try:
        number = float(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: an int too large to be represented as a float.
        return False
    # NaN fails every comparison; +/-inf fails one side of the chain.
    return float("-inf") < number < float("inf")

def convert_to_boolean(value):
    """Interpret ``value`` as a boolean flag.

    Args:
        value: Any object.

    Returns:
        bool:
            * a bool is returned unchanged;
            * a number is True only when it equals 1, mirroring the accepted
              string "1";
            * a string is True when, ignoring case and surrounding
              whitespace, it is "true", "yes" or "1";
            * everything else (including None and unrecognised text) is False.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        # Without this branch the number 1 fell through to the final
        # `return False`, although the string "1" was treated as True.
        return value == 1
    if isinstance(value, str):
        return value.strip().lower() in ('true', 'yes', '1')
    return False

def preprocess_json(file_path):
    """Clean one reviews JSONL file and save it as ``reviews_<topic>.jsonl``.

    The output is written to the input's directory. ``<topic>`` is the input
    file name with "reviews_", "initial-loads_" and ".jsonl" removed.

    For each input line that decodes to a JSON object:

    * ``images`` -> ``fix_images_field`` (default "[]" when missing);
    * ``videos`` -> ``convert_to_list_if_json`` (default "[]");
    * ``details`` -> ``convert_to_string_if_json`` (default "{}");
    * a string ``timestamp`` becomes an int, or 0 if it cannot be parsed;
    * ``price`` becomes None when missing/null or when it is a string that
      ``is_valid_float`` rejects, and a float when it is a string that it
      accepts; non-string prices are left untouched;
    * ``verified_purchase`` -> ``convert_to_boolean`` (default False);
    * ``helpful_vote`` becomes an int, or 0 when missing or not convertible.

    Blank lines are ignored. Lines that are not valid JSON, or that decode to
    something other than an object, are reported and skipped.

    Records are streamed to a temporary "<output>.tmp" file that replaces the
    final path only after the whole input has been processed. Memory use no
    longer grows with file size, a failed run leaves no partial output, and
    an input whose name already equals the output name is still handled.

    Args:
        file_path: Path of the reviews JSONL file to read.

    Returns:
        None. The saved path is printed.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    file_name = os.path.basename(file_path)
    topic = file_name.replace("reviews_", "").replace("initial-loads_", "").replace(".jsonl", "")
    new_file_name = f"reviews_{topic}.jsonl"
    new_file_path = os.path.join(os.path.dirname(file_path), new_file_name)
    temp_file_path = new_file_path + ".tmp"

    try:
        with open(file_path, 'r', encoding='utf-8') as source, \
                open(temp_file_path, 'w', encoding='utf-8') as sink:
            for line in source:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) are not records.
                    continue

                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Skipped a malformed line: {line}")
                    continue

                if not isinstance(record, dict):
                    # Valid JSON but not an object: record.get() would raise
                    # and abort the whole file.
                    print(f"Skipped a malformed line: {line}")
                    continue

                record["images"] = fix_images_field(record.get("images", "[]"))
                record["videos"] = convert_to_list_if_json(record.get("videos", "[]"))
                record["details"] = convert_to_string_if_json(record.get("details", "{}"))

                if 'timestamp' in record and isinstance(record['timestamp'], str):
                    try:
                        record['timestamp'] = int(record['timestamp'])
                    except ValueError:
                        record['timestamp'] = 0

                price = record.get("price")
                if price is None:
                    record["price"] = None
                elif isinstance(price, str):
                    record["price"] = float(price) if is_valid_float(price) else None

                record["verified_purchase"] = convert_to_boolean(record.get("verified_purchase", False))

                try:
                    record['helpful_vote'] = int(record.get('helpful_vote', 0))
                except (ValueError, TypeError, OverflowError):
                    # OverflowError: int() of an infinite float.
                    record['helpful_vote'] = 0

                json.dump(record, sink)
                sink.write('\n')

        # Both files are closed here; publish the finished output in one step.
        os.replace(temp_file_path, new_file_path)
    except BaseException:
        # Clean up the partial temp file, then let the error propagate.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)
        raise

    print(f"Preprocessed file saved as: {new_file_path}")

# Clean every raw "initial-loads_*.jsonl" file found in the data folder.
for file_name in os.listdir(folder):
    is_raw_export = file_name.startswith("initial-loads_") and file_name.endswith(".jsonl")
    if not is_raw_export:
        continue
    file_path = os.path.join(folder, file_name)
    preprocess_json(file_path)


Preprocessed file saved as: C:\Users\vingu\OneDrive - The University of Texas at Austin\UT Documents\CS329E-Product-Project\AmazonProductReviews\project3\local_data\reviews_Appliances.jsonl
Preprocessed file saved as: C:\Users\vingu\OneDrive - The University of Texas at Austin\UT Documents\CS329E-Product-Project\AmazonProductReviews\project3\local_data\reviews_Automotive.jsonl
Preprocessed file saved as: C:\Users\vingu\OneDrive - The University of Texas at Austin\UT Documents\CS329E-Product-Project\AmazonProductReviews\project3\local_data\reviews_Books.jsonl
Preprocessed file saved as: C:\Users\vingu\OneDrive - The University of Texas at Austin\UT Documents\CS329E-Product-Project\AmazonProductReviews\project3\local_data\reviews_Digital_Music.jsonl
Preprocessed file saved as: C:\Users\vingu\OneDrive - The University of Texas at Austin\UT Documents\CS329E-Product-Project\AmazonProductReviews\project3\local_data\reviews_Gift_Cards.jsonl
Preprocessed file saved as: C:\Users\vingu\OneDrive 