In [4]:
import os
import pandas as pd
import json
import traceback

# Source folder holding the raw chunk files, and destination folder for the
# preprocessed JSON produced from them
input_dir, output_dir = "datasets/original_chunks/", "preprocessed_chunks/"

# Create the destination folder up front; a no-op when it is already there
os.makedirs(output_dir, exist_ok=True)

# Helper functions to safely handle problematic fields
def safe_join(x):
    """Collapse a possibly-list field into one space-separated string.

    Args:
        x: The raw field value. A list is joined with single spaces after
            converting each element with ``str()``.

    Returns:
        The joined string for a list, ``None`` for a missing value (``None``
        or float NaN), and ``str(x)`` for anything else.
    """
    # Treat None like NaN: str(None) would otherwise leak the literal text
    # "None" into the data instead of marking the value as missing.
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return None
    if isinstance(x, list):
        return " ".join(str(item) for item in x)
    return str(x)

def safe_split(x):
    """Split a string on whitespace; every non-string input yields an empty list."""
    return x.split() if isinstance(x, str) else []

def extract_field(details, field_name):
    """Fetch field_name from a details value that is either a dict or a list.

    For a dict, the value stored under field_name is returned. For a list,
    the value from the first dict element that contains field_name is
    returned; non-dict elements are skipped. In every other case, including
    when the key is absent, the result is None.
    """
    if isinstance(details, dict):
        return details.get(field_name)
    if isinstance(details, list):
        return next(
            (entry[field_name] for entry in details
             if isinstance(entry, dict) and field_name in entry),
            None,
        )
    return None

def _nan_to_none(value):
    """Return None for a float NaN; return any other value unchanged."""
    if isinstance(value, float) and pd.isna(value):
        return None
    return value


def _write_records(records, output_file_path):
    """Announce the save, then write records to output_file_path as indented UTF-8 JSON."""
    print(f"Saving to {output_file_path}...")
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)


def safe_process_chunk(chunk_file):
    """Preprocess one JSON-lines chunk and write the result as a JSON file.

    The chunk is read from ``input_dir`` and the output is written to
    ``output_dir`` as ``amazon_preprocessed_<chunk stem>.json``. Processing
    flattens the ``author``, ``images`` and ``details`` columns into scalar
    columns, keeps only rows whose ``main_category`` equals ``'Books'`` (when
    that column exists), selects the final columns, and drops rows missing
    any of the required fields that are present.

    Args:
        chunk_file: File name (not a path) of the chunk inside ``input_dir``.

    Returns:
        True when an output file was written after processing, including the
        case where no books remained and an empty list was written. False
        when the read raised ``EmptyDataError`` (an empty list is still
        written), when the file could not be read or parsed, or when any
        processing step raised; the traceback is printed in the latter two
        cases.
    """
    print(f"Processing {chunk_file}...")
    input_file_path = os.path.join(input_dir, chunk_file)
    output_file_name = f"amazon_preprocessed_{os.path.splitext(chunk_file)[0]}.json"
    output_file_path = os.path.join(output_dir, output_file_name)

    # Step 1: Import the dataset
    print("Reading JSON file...")
    try:
        df = pd.read_json(input_file_path, lines=True)
    except pd.errors.EmptyDataError:
        # NOTE(review): kept from the original; whether read_json raises this
        # for an empty file may depend on the pandas version -- confirm.
        # Must come before the ValueError handler below, which would
        # otherwise shadow it.
        print(f"File {chunk_file} is empty, creating empty json file")
        _write_records([], output_file_path)
        return False
    except (ValueError, OSError):
        # An unreadable or malformed chunk is reported like any other
        # failure instead of aborting the caller's loop over all chunks.
        print(f"Error in chunk {chunk_file}:")
        print(traceback.format_exc())
        return False

    print(f"Initial dataframe shape: {df.shape}")

    try:
        # Step 2: Process nested fields first
        print("Processing nested fields...")
        # Handle author field
        if 'author' in df.columns:
            print("Processing author field...")
            df['author_name'] = df['author'].apply(lambda x: x.get('name') if isinstance(x, dict) else None)
            df['author_avatar'] = df['author'].apply(lambda x: x.get('avatar') if isinstance(x, dict) else None)
            df['author_about'] = df['author'].apply(lambda x: safe_join(x.get('about', [])) if isinstance(x, dict) else None)

        # Handle images field: take the 'large' URL of the first image whose
        # variant is 'MAIN'
        if 'images' in df.columns:
            print("Processing images field...")
            df['main_image_url'] = df['images'].apply(
                lambda imgs: next((img.get('large') for img in (imgs if isinstance(imgs, list) else [])
                                    if isinstance(img, dict) and img.get('variant') == 'MAIN'), None)
            )

        # Extract ISBNs and Publisher from 'details', trying each key spelling in turn
        if 'details' in df.columns:
            print("Extracting ISBNs and Publisher from details...")
            df['ISBN 10'] = df['details'].apply(lambda x: extract_field(x, 'ISBN-10') or extract_field(x, 'ISBN10') or extract_field(x, 'ISBN 10'))
            df['ISBN 13'] = df['details'].apply(lambda x: extract_field(x, 'ISBN-13') or extract_field(x, 'ISBN13') or extract_field(x, 'ISBN 13'))
            df['publisher'] = df['details'].apply(lambda x: extract_field(x, 'Publisher'))
            df = df.drop(columns=['details'])

        # Step 3: Filter for Books category
        print("Filtering for Books category...")
        if 'main_category' in df.columns:
            books_mask = df['main_category'].fillna('').astype(str).eq('Books')
            df_books = df[books_mask].copy()
            print(f"After Books filtering: {df_books.shape}")
            if df_books.empty:
                print("No books found in this chunk, creating empty json file")
                _write_records([], output_file_path)
                return True
        else:
            print("main_category column not found, skipping filtering and keeping original dataframe.")
            df_books = df.copy()

        # Step 4: Select and rename columns
        print("Selecting final columns...")
        keep_columns = [
            'title', 'subtitle',
            'author_name', 'author_avatar', 'author_about',
            'categories',
            'publisher', 'ISBN 10', 'ISBN 13', 'main_image_url'
        ]
        existing_columns = [col for col in keep_columns if col in df_books.columns]
        df_books = df_books[existing_columns].copy()

        # Step 5: Drop rows with missing values in required fields
        print("Dropping rows with missing values...")
        required_fields = ['title', 'author_name', 'ISBN 10', 'ISBN 13']
        existing_required = [f for f in required_fields if f in df_books.columns]
        if existing_required:
            df_books = df_books.dropna(subset=existing_required).copy()
            print(f"After dropping missing values: {df_books.shape}")
        else:
            print("No required fields found in dataframe, skipping dropna")

        # Step 6: Restore structure
        print("Restoring final structure...")
        # Optional columns may still hold float NaN, which json.dump would
        # emit as the bare token NaN (not valid JSON); write null instead.
        result = [
            {key: _nan_to_none(value) for key, value in record.items()}
            for record in df_books.to_dict(orient='records')
        ]

        # Save the results
        _write_records(result, output_file_path)

        print(f"Successfully processed {chunk_file}")
        return True

    except Exception:
        # Top-level boundary for one chunk: report and signal failure so the
        # caller can continue with the remaining chunks.
        print(f"Error in chunk {chunk_file}:")
        print(traceback.format_exc())
        return False

# Main processing loop
# os.listdir returns names in arbitrary order; sort so every run processes
# the chunks in the same sequence.
chunk_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.jsonl'))
if not chunk_files:
    print(f"No JSONL files found in {input_dir}")
else:
    # safe_process_chunk returns a falsy value for a chunk that was empty or
    # failed; collect those so they are not lost in the per-chunk log.
    failed_chunks = []
    for chunk_file in chunk_files:
        if not safe_process_chunk(chunk_file):
            failed_chunks.append(chunk_file)
    if failed_chunks:
        print(f"{len(failed_chunks)} of {len(chunk_files)} chunks were empty or failed: {', '.join(failed_chunks)}")

print("Processing complete!")


Processing chunk_0000.jsonl...
Reading JSON file...
Initial dataframe shape: (100000, 16)
Processing nested fields...
Processing author field...
Processing images field...
Extracting ISBNs and Publisher from details...
Filtering for Books category...
After Books filtering: (90714, 22)
Selecting final columns...
Dropping rows with missing values...
After dropping missing values: (60236, 10)
Restoring final structure...
Saving to preprocessed_chunks/amazon_preprocessed_chunk_0000.json...
Successfully processed chunk_0000.jsonl
Processing chunk_0001.jsonl...
Reading JSON file...
Initial dataframe shape: (100000, 16)
Processing nested fields...
Processing author field...
Processing images field...
Extracting ISBNs and Publisher from details...
Filtering for Books category...
After Books filtering: (91200, 22)
Selecting final columns...
Dropping rows with missing values...
After dropping missing values: (58228, 10)
Restoring final structure...
Saving to preprocessed_chunks/amazon_preprocess