In [2]:
import json
import glob
from pathlib import Path
import pandas as pd
from typing import Dict, List, Optional

def load_gutenberg_dataset(filepath: str) -> pd.DataFrame:
    """
    Load the Gutenberg dataset from a JSON file into a DataFrame.

    The parsed JSON is passed straight to ``pd.DataFrame``. A summary of the
    resulting frame (``info()`` report, column names, record count) is
    printed to stdout.

    Args:
        filepath: Path to the Gutenberg dataset JSON file

    Returns:
        DataFrame containing the Gutenberg dataset

    Raises:
        Exception: Any error raised while opening, parsing or converting the
            file is reported on stdout and then re-raised unchanged.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            gutenberg_data = json.load(f)

        # Convert to DataFrame and ensure it's in the right format
        df = pd.DataFrame(gutenberg_data)

        # Print detailed information about the dataset
        print("Gutenberg dataset info:")
        # DataFrame.info() writes its report to stdout itself and returns
        # None, so wrapping it in print() would emit a stray "None" line.
        df.info()
        print("\nGutenberg dataset columns:", df.columns.tolist())
        print(f"\nNumber of records in Gutenberg dataset: {len(df)}")

        return df
    except Exception as e:
        # Report for the notebook user, then let the caller see the failure.
        print(f"Error loading Gutenberg dataset: {e}")
        raise

def load_amazon_chunks(chunks_directory: str) -> pd.DataFrame:
    """
    Load and concatenate all Amazon dataset chunks from a directory.

    Every file matching ``amazon_preprocessed_chunk_*.json`` in
    ``chunks_directory`` is parsed, in sorted filename order. A chunk may be:

    * a list of records, which are appended as-is;
    * a dict with a ``'books'`` key, whose value is used in place of the dict;
    * any other dict, which is treated as a single record.

    Diagnostic information about each chunk and about the combined frame is
    printed to stdout.

    Args:
        chunks_directory: Directory containing Amazon dataset chunk files

    Returns:
        DataFrame containing the combined Amazon dataset

    Raises:
        FileNotFoundError: If no chunk file matches the pattern.
        Exception: Any other error raised while reading or parsing a chunk is
            reported on stdout and then re-raised unchanged.
    """
    try:
        pattern = str(Path(chunks_directory) / "amazon_preprocessed_chunk_*.json")
        # glob returns matches in arbitrary, filesystem-dependent order; sort
        # so rows are concatenated in chunk order and reruns are reproducible.
        chunk_files = sorted(glob.glob(pattern))

        if not chunk_files:
            raise FileNotFoundError(f"No Amazon chunk files found in {chunks_directory}")

        all_chunks = []
        for chunk_file in chunk_files:
            # Only the parsing needs the file handle; release it before the
            # diagnostics run.
            with open(chunk_file, 'r', encoding='utf-8') as f:
                chunk_data = json.load(f)

            # Verify the structure of each chunk
            print(f"\nProcessing chunk file: {chunk_file}")
            if isinstance(chunk_data, dict):
                print(f"Keys in chunk file: {chunk_data.keys()}")
                chunk_data = chunk_data.get('books', chunk_data)

            # Verify categories exist in the chunk data. The sample is purely
            # diagnostic, so skip it when the first element is not a mapping
            # instead of failing on .keys().
            if isinstance(chunk_data, list) and chunk_data:
                sample_book = chunk_data[0]
                if isinstance(sample_book, dict):
                    print(f"Sample book keys: {sample_book.keys()}")
                    if 'categories' in sample_book:
                        print(f"Sample categories: {sample_book['categories']}")

            all_chunks.extend(chunk_data if isinstance(chunk_data, list) else [chunk_data])

        df = pd.DataFrame(all_chunks)

        # Print detailed information about the Amazon dataset
        print("\nAmazon dataset info:")
        # DataFrame.info() prints its own report and returns None; calling it
        # bare avoids a stray "None" line in the output.
        df.info()
        print("\nAmazon dataset columns:", df.columns.tolist())
        print(f"\nNumber of records in Amazon dataset: {len(df)}")

        # Verify categories column
        if 'categories' in df.columns:
            print("\nSample of categories from Amazon dataset:")
            print(df['categories'].head())
            print(f"\nNumber of books with categories: {df['categories'].notna().sum()}")
        else:
            print("\nWARNING: 'categories' column not found in Amazon dataset!")

        return df
    except Exception as e:
        # Report for the notebook user, then let the caller see the failure.
        print(f"Error loading Amazon chunks: {e}")
        raise

def merge_datasets(gutenberg_df: pd.DataFrame, amazon_df: pd.DataFrame) -> pd.DataFrame:
    """
    Left-join Amazon metadata onto the Gutenberg dataset by normalised title.

    Both frames are passed through ``clean_titles`` to obtain a
    ``cleaned_title`` join key. The Amazon fields listed in
    ``amazon_columns`` that actually exist are then merged onto the Gutenberg
    rows. Missing Amazon columns are reported and skipped instead of raising.

    Before merging, Amazon rows with a blank or missing key are ignored, and
    only the first Amazon row per key is kept. This way each Gutenberg row
    appears exactly once in the result. Diagnostics are printed to stdout.

    Args:
        gutenberg_df: DataFrame containing Gutenberg dataset
        amazon_df: DataFrame containing Amazon dataset

    Returns:
        Merged DataFrame with one row per Gutenberg row, plus whichever
        Amazon fields were available (including categories when present).
        The temporary ``cleaned_title`` column is removed.

    Raises:
        KeyError: Propagated from ``clean_titles`` when either frame lacks a
            ``title`` column.
    """
    # Verify input data
    print("\nPre-merge verification:")
    print(f"Gutenberg dataset shape: {gutenberg_df.shape}")
    print(f"Amazon dataset shape: {amazon_df.shape}")
    print("\nAmazon dataset columns before merge:", amazon_df.columns.tolist())

    # Clean titles in both datasets
    gutenberg_df = clean_titles(gutenberg_df)
    amazon_df = clean_titles(amazon_df)

    # Verify cleaned titles
    print("\nSample of cleaned titles from both datasets:")
    print("\nGutenberg cleaned titles:")
    print(gutenberg_df[['title', 'cleaned_title']].head())
    print("\nAmazon cleaned titles:")
    print(amazon_df[['title', 'cleaned_title']].head())

    # Store the columns we want to merge
    amazon_columns = ['cleaned_title', 'subtitle', 'publisher', 'ISBN 10', 'ISBN 13', 'main_image_url', 'categories']
    print("\nVerifying Amazon columns before merge:")
    available_columns = []
    for col in amazon_columns:
        if col in amazon_df.columns:
            print(f"Column '{col}' exists in Amazon dataset")
            available_columns.append(col)
        else:
            print(f"WARNING: Column '{col}' missing from Amazon dataset!")

    # Select only the columns that exist, so the warning above is not
    # followed by a KeyError. 'cleaned_title' is always present because
    # clean_titles() has just created it.
    amazon_subset = amazon_df[available_columns]

    # pandas matches null keys against each other, and titles made only of
    # punctuation collapse to ''. Neither is a real title match, so exclude
    # those Amazon rows from the join.
    key = amazon_subset['cleaned_title']
    amazon_subset = amazon_subset[key.notna() & (key != '')]

    # Duplicate keys on the right side of a left join multiply the matching
    # left rows. Keep the first Amazon entry per title so the Gutenberg row
    # count is preserved.
    amazon_subset = amazon_subset.drop_duplicates(subset='cleaned_title', keep='first')

    ignored = len(amazon_df) - len(amazon_subset)
    if ignored:
        print(f"\nIgnoring {ignored} Amazon rows with blank or duplicate cleaned titles")

    # Perform the merge
    merged_df = pd.merge(
        gutenberg_df,
        amazon_subset,
        on='cleaned_title',
        how='left'
    )

    # Verify merge results
    print("\nPost-merge verification:")
    print(f"Merged dataset shape: {merged_df.shape}")
    print("\nMerged dataset columns:", merged_df.columns.tolist())

    # Check categories specifically
    if 'categories' in merged_df.columns:
        print("\nCategories in merged dataset:")
        print(f"Number of books with categories: {merged_df['categories'].notna().sum()}")
        print("\nSample of books with categories:")
        sample = merged_df[merged_df['categories'].notna()].head(3)
        for _, row in sample.iterrows():
            print(f"\nTitle: {row['title']}")
            print(f"Categories: {row['categories']}")
    else:
        print("\nWARNING: 'categories' column not present in merged dataset!")

    # Remove the cleaning column
    merged_df = merged_df.drop('cleaned_title', axis=1)

    return merged_df

def clean_titles(df: pd.DataFrame, title_column: str = 'title') -> pd.DataFrame:
    """
    Return a copy of ``df`` with an added ``cleaned_title`` column.

    The cleaned title is the lower-cased title in which every character that
    is neither a word character nor whitespace is replaced by a space, runs
    of whitespace are collapsed to a single space, and leading/trailing
    whitespace is removed. The input frame is not modified.

    Args:
        df: DataFrame holding the titles
        title_column: Name of the column containing the raw titles

    Returns:
        Copy of ``df`` with the ``cleaned_title`` column set

    Raises:
        KeyError: If ``title_column`` is not a column of ``df``.
    """
    if title_column not in df.columns:
        available_columns = df.columns.tolist()
        raise KeyError(f"Column '{title_column}' not found. Available columns are: {available_columns}")

    result = df.copy()

    # Each normalisation step is named so the pipeline reads top to bottom.
    lowered = result[title_column].str.lower()
    without_punctuation = lowered.str.replace(r'[^\w\s]', ' ', regex=True)
    single_spaced = without_punctuation.str.replace(r'\s+', ' ', regex=True)
    result['cleaned_title'] = single_spaced.str.strip()

    return result

def main(gutenberg_path: str, amazon_chunks_dir: str, output_path: str):
    """
    Run the whole pipeline: load both datasets, merge, save and re-check.

    Progress and diagnostics are printed at every stage. After the merged
    frame is written as indented JSON records, the file is read back and its
    first record inspected. Any failure during that read-back is printed
    rather than raised.

    Args:
        gutenberg_path: Path to the Gutenberg dataset JSON file
        amazon_chunks_dir: Directory containing Amazon dataset chunks
        output_path: Path where the merged dataset should be saved
    """
    # Stage 1: load the inputs
    print("Loading Gutenberg dataset...")
    gutenberg_books = load_gutenberg_dataset(gutenberg_path)

    print("\nLoading Amazon dataset chunks...")
    amazon_books = load_amazon_chunks(amazon_chunks_dir)

    # Stage 2: merge
    print("\nMerging datasets...")
    combined = merge_datasets(gutenberg_books, amazon_books)

    # Stage 3: inspect the in-memory result before it is written
    print("\nFinal dataset verification:")
    print("\nColumns in final dataset:", combined.columns.tolist())
    if 'categories' in combined.columns:
        has_categories = combined['categories'].notna()
        preview = combined[has_categories].head(3)
        print("\nSample of categories in final dataset:")
        for _, book in preview.iterrows():
            print(f"\nTitle: {book['title']}")
            print(f"Categories: {book['categories']}")

    # Stage 4: write to disk
    print("\nSaving merged dataset...")
    combined.to_json(output_path, orient='records', indent=4)

    # Stage 5: read the file back as a best-effort sanity check
    print("\nVerifying saved file...")
    try:
        saved_records = json.loads(Path(output_path).read_text(encoding='utf-8'))
        first_record = saved_records[0]
        print("\nSample book from saved file:")
        print(f"Available fields: {list(first_record.keys())}")
        if 'categories' not in first_record:
            print("WARNING: 'categories' not found in saved data!")
        else:
            print(f"Categories: {first_record.get('categories')}")
    except Exception as err:
        print(f"Error verifying saved file: {str(err)}")

    print(f"\nMerge complete! Results saved to {output_path}")
    # A row counts as matched when the Amazon 'ISBN 10' field is populated.
    matched_count = combined['ISBN 10'].notna().sum()
    print(f"Successfully matched {matched_count} out of {len(combined)} books")

if __name__ == "__main__":
    # Default input/output locations used when the file is run as a script.
    run_config = {
        "gutenberg_path": "datasets/preprocessed_gutenberg.json",
        "amazon_chunks_dir": "preprocessed_chunks",
        "output_path": "merged_dataset.json",
    }
    main(**run_config)


Loading Gutenberg dataset...
Gutenberg dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74594 entries, 0 to 74593
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        74594 non-null  object
 1   authors      74594 non-null  object
 2   issued       74594 non-null  object
 3   language     74594 non-null  object
 4   subjects     74594 non-null  object
 5   bookshelves  74594 non-null  object
dtypes: object(6)
memory usage: 3.4+ MB
None

Gutenberg dataset columns: ['title', 'authors', 'issued', 'language', 'subjects', 'bookshelves']

Number of records in Gutenberg dataset: 74594

Loading Amazon dataset chunks...

Processing chunk file: preprocessed_chunks\amazon_preprocessed_chunk_0000.json
Sample book keys: dict_keys(['title', 'subtitle', 'author_name', 'author_avatar', 'author_about', 'categories', 'publisher', 'ISBN 10', 'ISBN 13', 'main_image_url'])
Sample categories: ['Books', 'Literature 