In [6]:
from pathlib import Path
import re
import json
import os
import pandas as pd
from tqdm import tqdm
from pprint import pprint

In [22]:
def check_json_files(base_dir):
    """Collect a summary record from every ``.json`` file under ``base_dir``.

    The directory tree is walked recursively in sorted order, so the returned
    list has the same order on every run and on every file system.

    Args:
        base_dir (str): Root directory to search for ``.json`` files.

    Returns:
        list[dict]: One dict per readable JSON object with the keys
            ``"title"``, ``"category"``, ``"published_date"`` (each defaulting
            to ``""`` when missing) and ``"sections"`` (defaulting to ``[]``).
            Files that cannot be decoded, or whose top-level value is not a
            JSON object, are reported with ``print`` and skipped.
    """
    records = []

    # Walk through the directory structure
    for root, dirs, files in os.walk(base_dir):
        # Sorting ``dirs`` in place controls the order in which os.walk
        # descends; together with sorted(files) this makes the result order
        # deterministic instead of dependent on directory listing order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".json"):
                continue

            file_path = os.path.join(root, file)
            with open(file_path, "r", encoding="utf-8") as f:
                try:
                    data = json.load(f)
                except json.JSONDecodeError:
                    print(f"Error decoding JSON in file: {file_path}")
                    continue

            # A valid JSON file may hold a list or scalar at the top level;
            # ``.get`` only exists on dicts, so skip anything else rather
            # than aborting the whole scan.
            if not isinstance(data, dict):
                print(f"Skipping non-object JSON in file: {file_path}")
                continue

            # Safely extract fields, handling missing keys
            records.append({
                "title": data.get("title", ""),
                "category": data.get("category", ""),
                "published_date": data.get("published_date", ""),
                "sections": data.get("sections", []),
            })

    return records

In [23]:
# Read every JSON article found under articles/raw into a list of dicts,
# then build a DataFrame with one row per article.
records = check_json_files("articles/raw")
df = pd.DataFrame(records)

In [24]:
# Show the first record in full to inspect the nested "sections" entries.
pprint(records[0], indent=4,width=100)

{   'category': 'Expert review',
    'published_date': '19 December 2014',
    'sections': [   {   'content': 'The standard Fiat 500 has cute retro appeal but the hot Abarth '
                                   '500 version makes it even more desirable. An aggressive body '
                                   'kit, go-faster stripes and cool alloy wheels add a dose of '
                                   'masculinity to help it stand out from the crowd. The iconic '
                                   'scorpion badges hark back to Fiat’s previous hot Abarth cars, '
                                   'while the two-tone red and white colour schemes look the '
                                   'business.',
                        'expert_rating': 'Expert rating: 4/5',
                        'section_title': 'How good does it look?'},
                    {   'content': 'The Abarth 500 is very similar to the standard Fiat 500 '
                                   'inside, with a high, dash-

In [25]:
# (rows, columns) of the frame built from the loaded records.
df.shape

(1554, 4)

In [26]:
# Number of articles per value of the "category" field.
df["category"].value_counts()

category
Expert review       1439
Long-term review     115
Name: count, dtype: int64

### Extract car details from review filenames

In [27]:
# Single-token car makes recognised at the start of a filename.
# NOTE(review): "abart" is listed next to "abarth"; presumably some source
# filenames use that spelling -- confirm before removing it.
_SINGLE_WORD_MAKES = frozenset({
    "abart", "abarth", "alpine", "ariel", "audi", "bmw", "byd", "bentley",
    "bugatti", "cupra", "caterham", "chevrolet", "chrysler", "citroen",
    "ds", "dacia", "dodge", "ferrari", "fiat", "fisker", "ford", "genesis",
    "honda", "hyundai", "ineos", "infiniti", "isuzu", "jaguar", "jeep",
    "kia", "lamborghini", "leapmotor", "lexus", "lotus", "maserati",
    "mazda", "mclaren", "mercedes", "mg", "micro", "mini", "mitsubishi",
    "nio", "nissan", "omoda", "perodua", "peugeot", "polestar", "porsche",
    "proton", "renault", "saab", "seat", "skoda", "skywell", "smart",
    "ssangyong", "subaru", "suzuki", "tesla", "toyota", "vauxhall",
    "volkswagen", "volvo", "xpeng", "zeekr",
})

# Makes spelled as two hyphen-joined tokens; checked before the single-token
# list so that e.g. "ds-automobiles" wins over "ds".
_MULTI_WORD_MAKES = frozenset({
    "mercedes-benz", "alfa-romeo", "aston-martin", "ds-automobiles",
    "gwm-ora", "land-rover", "range-rover", "rolls-royce",
})

# Tokens dropped from the model name.
_MODEL_EXCLUDE_WORDS = frozenset({
    "review", "reviews", "test", "drive", "preview", "long", "term",
    "final", "report", "second", "third", "fourth", "fifth", "first",
    "edition", "vignale", "expert",
})

# Tokens reported as body type (they are NOT removed from the model name).
_BODY_TYPES = frozenset({
    "hatchback", "estate", "saloon", "suv", "coupe", "convertible",
    "mpv", "pickup", "4x4", "hybrid", "electric", "hatch", "sport",
})

_LONG_TERM_REVIEWS_DIR = "articles/raw/long_term_reviews"

# A "-YYYY" token limited to 19xx/20xx so that four-digit model numbers such
# as "3008" are not mistaken for a year.  The trailing boundary is a lookahead
# so that two adjacent tokens ("-2008-2019") can both be matched.
_YEAR_TOKEN = re.compile(r"-((?:19|20)\d{2})(?=-|$)")


def parse_car_filename(filename, base_directory=None):
    """Parse car review filename into structured data

    The filename is reduced to its last path component, the ``.json``
    extension and the ``-expert-review`` / ``-long-term-test-review`` markers
    are removed, a leading ``living-with-a-`` / ``living-with-an-`` prefix is
    dropped, and the remaining hyphen-separated tokens are interpreted.

    Args:
        filename (str | Path): The filename or full path to parse
        base_directory (str | Path | None): Optional base directory to
            identify review type.  When it equals
            ``articles/raw/long_term_reviews`` (leading/trailing slashes
            ignored) only ``make`` and ``model`` are returned.

    Returns:
        dict: ``make`` (lower-case, or ``None`` when the leading token(s) are
            not a known make) and ``model`` (all remaining tokens except the
            excluded words, lower-case; body-type and year tokens are kept).
            For anything other than the long-term directory the dict also has
            ``body_type`` (matching tokens joined with ``-``, or ``None``) and
            ``year`` (the last 19xx/20xx token as a string, or ``None``).
    """
    if isinstance(filename, Path):
        filename = str(filename)

    # Clean the input path to just the filename
    filename = filename.replace("\\", "/").split("/")[-1]

    # Remove .json extension and review-type suffixes
    base = filename.replace(".json", "")
    base = base.replace("-expert-review", "").replace("-long-term-test-review", "")

    # Handle "living with" format for long-term reviews
    if base.startswith("living-with-a-"):
        base = base.replace("living-with-a-", "", 1)
    elif base.startswith("living-with-an-"):
        base = base.replace("living-with-an-", "", 1)

    # Split remaining parts
    parts = base.split("-")

    # Attempt to extract make: two-token makes first, then single-token ones.
    make = None
    if len(parts) >= 2:
        candidate = "-".join(parts[:2]).lower()
        if candidate in _MULTI_WORD_MAKES:
            make = candidate
            parts = parts[2:]

    if make is None and parts:
        candidate = parts[0].lower()
        if candidate in _SINGLE_WORD_MAKES:
            make = candidate
            parts = parts[1:]

    # Everything after the make, minus the excluded words, forms the model.
    model_parts = [
        part for part in parts if part.lower() not in _MODEL_EXCLUDE_WORDS
    ]
    model = "-".join(model_parts).lower() if model_parts else None

    # For long term reviews directory, return simplified structure
    if base_directory:
        if isinstance(base_directory, Path):
            base_directory = str(base_directory)
        # Normalise separators so a Windows-style path compares equal too.
        normalized_dir = base_directory.replace("\\", "/").strip("/")
        if normalized_dir == _LONG_TERM_REVIEWS_DIR:
            return {"make": make, "model": model}

    # Extract additional metadata for expert reviews.
    # Take the LAST plausible year token: in names like
    # "peugeot-2008-suv-2019" the earlier token is the model number.
    year_tokens = _YEAR_TOKEN.findall(base)
    year = year_tokens[-1] if year_tokens else None

    # Find body type components
    body_type_parts = [
        part.lower() for part in parts if part.lower() in _BODY_TYPES
    ]
    body_type = "-".join(body_type_parts) if body_type_parts else None

    return {
        "make": make,
        "model": model,
        "body_type": body_type,
        "year": year
    }


In [28]:
def process_review_file(file_path, base_directory):
    """Attach filename-derived car details to one review JSON file, in place.

    The JSON document at ``file_path`` is loaded, the dict returned by
    ``parse_car_filename`` is stored under its ``"car_details"`` key, and the
    same file is rewritten with a two-space indent.

    Args:
        file_path: Path of the review JSON file to update.
        base_directory: Passed through to ``parse_car_filename`` as its
            ``base_directory`` argument.

    Returns:
        bool: Always ``True`` once the file has been rewritten.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        review = json.load(source)

    # Only the final path component is handed to the filename parser.
    _, review_filename = os.path.split(file_path)
    car_details = parse_car_filename(review_filename, base_directory)

    # Store a shallow copy of the parsed details on the document.
    review["car_details"] = dict(car_details)

    # Overwrite the original file with the enriched document.
    with open(file_path, "w", encoding="utf-8") as target:
        json.dump(review, target, indent=2)

    return True

In [29]:
# Directory paths passed to parse_car_filename / process_review_file as
# ``base_directory``.  parse_car_filename compares against the literal
# "articles/raw/long_term_reviews", so base_directory_lr must keep that value.
# NOTE(review): base_directory_lr is not used by any cell visible here --
# confirm whether the long-term reviews are processed elsewhere.
base_directory_lr = "articles/raw/long_term_reviews"
base_directory_er = "articles/raw/expert_review"

In [30]:
base_path = Path(base_directory_er)

# Find all JSON files in the expert_review directory.
# The glob result order is not guaranteed, so sort it: this keeps the
# processing order and json_files[0] identical on every run.
json_files = sorted(base_path.glob('**/*.json'))


# Each file is rewritten in place with an added "car_details" entry.
for file_path in tqdm(json_files, desc="Processing files"):
    process_review_file(file_path, base_directory_er)

Processing files: 100%|██████████| 1439/1439 [00:07<00:00, 182.83it/s]


In [32]:
# Re-read the first processed file to check that "car_details" was written.
file_path = json_files[0]
updated_data = json.loads(file_path.read_text(encoding="utf-8"))

pprint(updated_data, indent=4, width=100)

{   'car_details': {   'body_type': 'hatchback',
                       'make': 'abart',
                       'model': '500-hatchback-2009',
                       'year': '2009'},
    'category': 'Expert review',
    'published_date': '19 December 2014',
    'sections': [   {   'content': 'The standard Fiat 500 has cute retro appeal but the hot Abarth '
                                   '500 version makes it even more desirable. An aggressive body '
                                   'kit, go-faster stripes and cool alloy wheels add a dose of '
                                   'masculinity to help it stand out from the crowd. The iconic '
                                   'scorpion badges hark back to Fiat’s previous hot Abarth cars, '
                                   'while the two-tone red and white colour schemes look the '
                                   'business.',
                        'expert_rating': 'Expert rating: 4/5',
                        'section_title': 