In [2]:
from pathlib import Path
import re
import json
import os
import pandas as pd
from tqdm import tqdm
from pprint import pprint

In [3]:
def check_json_files(base_dir):
    """Load every ``.json`` file under ``base_dir`` into a list of flat records.

    The directory tree is walked recursively in sorted order so the returned
    list (and anything indexed from it, such as ``records[0]``) is the same on
    every run.

    Args:
        base_dir (str): Root directory to search.

    Returns:
        list[dict]: One dict per usable file with the keys ``title``,
        ``category``, ``published_date`` and ``sections``. Keys missing from
        a file default to ``""`` (``[]`` for ``sections``).

    Files that are not valid JSON, or whose top-level value is not a JSON
    object, are reported on stdout and skipped.
    """
    records = []

    for root, dirs, files in os.walk(base_dir):
        # os.walk yields entries in arbitrary order; sorting ``dirs`` in place
        # fixes the descent order and ``sorted(files)`` the per-directory order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".json"):
                continue

            file_path = os.path.join(root, file)
            with open(file_path, "r", encoding="utf-8") as f:
                try:
                    data = json.load(f)
                except json.JSONDecodeError:
                    print(f"Error decoding JSON in file: {file_path}")
                    continue

            # A top-level list/string/number has no .get(); skip it instead of
            # aborting the whole scan with an AttributeError.
            if not isinstance(data, dict):
                print(f"Skipping non-object JSON in file: {file_path}")
                continue

            # Safely extract fields, handling missing keys
            records.append({
                "title": data.get("title", ""),
                "category": data.get("category", ""),
                "published_date": data.get("published_date", ""),
                "sections": data.get("sections", []),
            })

    return records

In [4]:
# Load every JSON file found under articles/raw and tabulate the extracted
# fields (title, category, published_date, sections), one row per file.
records = check_json_files("articles/raw")
df = pd.DataFrame(records)

In [6]:
# Inspect the first loaded record to see the structure of a review
pprint(records[0], indent=4,width=100)

{   'category': 'Expert review',
    'published_date': '19 December 2014',
    'sections': [   {   'content': 'The standard Fiat 500 has cute retro appeal but the hot Abarth '
                                   '500 version makes it even more desirable. An aggressive body '
                                   'kit, go-faster stripes and cool alloy wheels add a dose of '
                                   'masculinity to help it stand out from the crowd. The iconic '
                                   'scorpion badges hark back to Fiat’s previous hot Abarth cars, '
                                   'while the two-tone red and white colour schemes look the '
                                   'business.',
                        'expert_rating': 'Expert rating: 4/5',
                        'section_title': 'How good does it look?'},
                    {   'content': 'The Abarth 500 is very similar to the standard Fiat 500 '
                                   'inside, with a high, dash-

In [7]:
# (rows, columns): one row per loaded review file
df.shape

(1554, 4)

In [8]:
# Number of loaded reviews per category value
df["category"].value_counts()

category
Expert review       1439
Long-term review     115
Name: count, dtype: int64

### Extract car details from reviews

In [9]:
def parse_car_filename(filename, base_directory=None):
    """Parse a car review filename into structured car details.

    The name is reduced to hyphen-separated tokens after dropping the
    directory part, the ``.json`` extension, the ``-expert-review`` /
    ``-long-term-test-review`` markers and a leading ``living-with-a-`` or
    ``living-with-an-`` prefix. The leading token(s) are matched against the
    known makes; the remaining tokens, minus editorial words such as
    "review" or "final", form the model.

    Args:
        filename (str | Path): The filename or full path to parse.
        base_directory (str | Path | None): Optional base directory used to
            identify the review type. When it equals
            ``articles/raw/long_term_reviews`` (ignoring leading/trailing
            slashes) only ``make`` and ``model`` are returned.

    Returns:
        dict: ``make`` and ``model`` (``make`` is ``None`` when the leading
        token is not a known make) and, except for long-term reviews,
        ``body_type`` and ``year`` (each ``None`` when not found).

    Notes:
        * ``model`` keeps any body-type and year tokens, e.g.
          ``"500-hatchback-2009"``.
        * ``year`` is the LAST token that looks like a year (1900-2099),
          ignoring the token directly after the make. That position holds
          the model name, so numeric models such as the Peugeot 2008 or 3008
          are not mistaken for a year.
    """
    if isinstance(filename, Path):
        filename = str(filename)

    # Clean the input path to just the filename
    filename = filename.replace("\\", "/").split("/")[-1]

    # Remove .json extension and review-type suffixes
    base = filename.replace(".json", "")
    base = base.replace("-expert-review", "").replace("-long-term-test-review", "")

    # Handle "living with" format for long-term reviews
    if base.startswith("living-with-a-"):
        base = base.replace("living-with-a-", "", 1)
    elif base.startswith("living-with-an-"):
        base = base.replace("living-with-an-", "", 1)

    # General car makes ("abart" is kept alongside "abarth" as an accepted spelling)
    make_types = {
        "abart", "abarth", "alpine", "ariel", "audi", "bmw", "byd", "bentley",
        "bugatti", "cupra", "caterham", "chevrolet", "chrysler", "citroen",
        "ds", "dacia", "dodge", "ferrari", "fiat", "fisker", "ford", "genesis",
        "honda", "hyundai", "ineos", "infiniti", "isuzu", "jaguar", "jeep",
        "kia", "lamborghini", "leapmotor", "lexus", "lotus", "maserati",
        "mazda", "mclaren", "mercedes", "mg", "micro", "mini", "mitsubishi",
        "nio", "nissan", "omoda", "perodua", "peugeot", "polestar", "porsche",
        "proton", "renault", "saab", "seat", "skoda", "skywell", "smart",
        "ssangyong", "subaru", "suzuki", "tesla", "toyota", "vauxhall",
        "volkswagen", "volvo", "xpeng", "zeekr",
    }

    # Multi-word car makes (every entry is exactly two hyphenated tokens)
    make_types_w_ = {
        "mercedes-benz", "alfa-romeo", "aston-martin", "ds-automobiles",
        "gwm-ora", "land-rover", "range-rover", "rolls-royce",
    }

    # Split remaining parts
    parts = base.split("-")

    # Attempt to extract make: two-token makes take priority so that e.g.
    # "ds-automobiles" and "mercedes-benz" are not cut short to "ds"/"mercedes".
    make = None
    if len(parts) >= 2:
        candidate = "-".join(parts[:2]).lower()
        if candidate in make_types_w_:
            make = candidate
            parts = parts[2:]

    # If no multi-word make found, try single word makes
    if make is None and parts:
        candidate = parts[0].lower()
        if candidate in make_types:
            make = candidate
            parts = parts[1:]

    # Words to exclude from model name
    exclude_words = {
        "review", "reviews", "test", "drive", "preview", "long", "term",
        "final", "report", "second", "third", "fourth", "fifth", "first",
        "edition", "vignale", "expert",
    }

    # Model = every remaining token that is not an editorial word
    model_parts = [part for part in parts if part.lower() not in exclude_words]
    model = "-".join(model_parts).lower() if model_parts else None

    # For long term reviews directory, return simplified structure.
    # str() lets a Path be passed; backslashes are normalised so a Windows
    # path compares equal to the POSIX-style constant.
    if base_directory:
        normalized_dir = str(base_directory).replace("\\", "/").strip("/")
        if normalized_dir == "articles/raw/long_term_reviews":
            return {"make": make, "model": model}

    # Extract the year: scan from the end so a trailing model year wins over
    # a numeric model name, and skip parts[0] (the model-name position).
    year = None
    for part in reversed(parts[1:]):
        if re.fullmatch(r"(?:19|20)[0-9]{2}", part):
            year = part
            break

    # Common body types
    body_types = {
        "hatchback", "estate", "saloon", "suv", "coupe", "convertible",
        "mpv", "pickup", "4x4", "hybrid", "electric", "hatch", "sport",
    }

    # Find body type components (in filename order, lower-cased)
    body_type_parts = [part.lower() for part in parts if part.lower() in body_types]
    body_type = "-".join(body_type_parts) if body_type_parts else None

    return {
        "make": make,
        "model": model,
        "body_type": body_type,
        "year": year
    }


In [11]:
def process_review_file(file_path, base_directory):
    """Annotate one review JSON file, in place, with details parsed from its name.

    Args:
        file_path: Path of the JSON file to read and then overwrite.
        base_directory: Forwarded unchanged to ``parse_car_filename``.

    Returns:
        bool: Always ``True``.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        review = json.load(source)

    # Only the bare file name (no directories) is handed to the parser
    car_details = parse_car_filename(os.path.basename(file_path), base_directory)

    # Store a copy of the parsed details under the "car_details" key
    review["car_details"] = dict(car_details)

    # Overwrite the original file with the augmented document
    with open(file_path, "w", encoding="utf-8") as target:
        json.dump(review, target, indent=2)

    return True

In [12]:
# Directories searched for review JSON files: long-term reviews (lr) and
# expert reviews (er). The long-term path is also the value that
# parse_car_filename compares against to return its simplified result.
base_directory_lr = "articles/raw/long_term_reviews"
base_directory_er = "articles/raw/expert_review"

In [13]:
base_path = Path(base_directory_er)

# Find all JSON files in the expert_review directory. Sorted because glob
# order is not guaranteed, and json_files[0] is inspected afterwards.
json_files = sorted(base_path.glob('**/*.json'))


# NOTE: this rewrites every matched file in place, adding "car_details"
for file_path in tqdm(json_files, desc="Processing Expert Files"):
    process_review_file(file_path, base_directory_er)

Processing Expert Files: 100%|██████████| 1439/1439 [00:18<00:00, 77.31it/s] 


In [14]:
# Expert review files: reload the first processed file to check the result
file_path = json_files[0]
with open(file_path, "r", encoding="utf-8") as f:
    updated_data = json.load(f)

pprint(updated_data, indent=4, width=100)

{   'car_details': {   'body_type': 'hatchback',
                       'make': 'abart',
                       'model': '500-hatchback-2009',
                       'year': '2009'},
    'category': 'Expert review',
    'published_date': '19 December 2014',
    'sections': [   {   'content': 'The standard Fiat 500 has cute retro appeal but the hot Abarth '
                                   '500 version makes it even more desirable. An aggressive body '
                                   'kit, go-faster stripes and cool alloy wheels add a dose of '
                                   'masculinity to help it stand out from the crowd. The iconic '
                                   'scorpion badges hark back to Fiat’s previous hot Abarth cars, '
                                   'while the two-tone red and white colour schemes look the '
                                   'business.',
                        'expert_rating': 'Expert rating: 4/5',
                        'section_title': 

In [15]:
# Long-term review files
base_path_lr = Path(base_directory_lr)

# Find all JSON files in the long_term_reviews directory. Sorted because glob
# order is not guaranteed, and json_files_lr[0] is inspected afterwards.
json_files_lr = sorted(base_path_lr.glob('**/*.json'))


# NOTE: this rewrites every matched file in place, adding "car_details"
for file_path in tqdm(json_files_lr, desc="Processing Long-term files"):
    process_review_file(file_path, base_directory_lr)

Processing Long-term files:  11%|█▏        | 13/115 [00:00<00:00, 117.80it/s]

Processing Long-term files: 100%|██████████| 115/115 [00:02<00:00, 49.65it/s]


In [16]:
# Long-term review files: reload the first processed file to check the result
file_path = json_files_lr[0]
with open(file_path, "r", encoding="utf-8") as f:
    updated_data = json.load(f)

pprint(updated_data, indent=4, width=100)

{   'car_details': {'make': 'audi', 'model': 'a3-e-tron'},
    'category': 'Long-term review',
    'published_date': None,
    'sections': [   {   'content': 'Mileage: 9,774\n'
                                   '\n'
                                   'Costs: £220First report: May 2015Second report: June 2015Third '
                                   'report: July 2015 Plug-in hybrids and city living, are they a '
                                   'match made in heaven? Here at Auto Trader we tried to find out '
                                   'by running the award-winning A3 e-tron Sportback for a '
                                   'three-month period.In that time, as well as subjecting the car '
                                   'to London life, we managed to go to Le Mans to see Audi racing '
                                   'its hybrids against the might of Porsche, Toyota and Nissan, '
                                   'drive the length and breadth of the country on road

### File Text Statistics

In [18]:
def compute_statistics(text):
    """Compute rough size statistics for a given text.

    Args:
        text (str): The text to measure.

    Returns:
        dict: With the keys
            ``num_sentences``: non-blank fragments obtained by splitting on
            ``". "`` (a heuristic; other terminators are not considered);
            ``num_words``: whitespace-separated tokens;
            ``num_characters``: ``len(text)``;
            ``num_tokens``: ``num_characters // 4``, a rough estimate that
            assumes about four characters per token.

        Empty or whitespace-only text yields zero words and zero sentences.
    """
    num_characters = len(text)

    # split() with no argument collapses runs of whitespace and returns []
    # for blank text; split(" ") would count the empty strings as words.
    num_words = len(text.split())

    # Ignore blank fragments so that empty text is not counted as a sentence
    num_sentences = sum(1 for fragment in text.split(". ") if fragment.strip())

    num_tokens = num_characters // 4
    return {
        "num_sentences": num_sentences,
        "num_words": num_words,
        "num_characters": num_characters,
        "num_tokens": num_tokens
    }

def process_category(category_path):
    """Build a table of text statistics for every JSON file under a directory.

    Each ``.json`` file found by walking ``category_path`` is parsed; a dict
    is re-serialised with ``json.dumps`` and any other value is converted
    with ``str`` before being measured by ``compute_statistics``.

    Args:
        category_path (str): Root directory to walk.

    Returns:
        pandas.DataFrame: One row per file holding the statistics plus a
        ``file_name`` column (the bare file name, without directories).
    """
    rows = []

    for dirpath, _subdirs, filenames in os.walk(category_path):
        for name in filenames:
            if not name.endswith(".json"):
                continue

            with open(os.path.join(dirpath, name), "r", encoding="utf-8") as handle:
                payload = json.load(handle)

            # Measure the serialised form of the document, not just its prose
            if isinstance(payload, dict):
                text = json.dumps(payload)
            else:
                text = str(payload)

            row = compute_statistics(text)
            row["file_name"] = name
            rows.append(row)

    return pd.DataFrame(rows)

In [19]:
# Expert review statistics: one row of text statistics per JSON file found
# under the expert review directory
df_er = process_category(base_directory_er)

In [23]:
# Preview five rows; the fixed seed keeps the preview identical across re-runs
df_er.sample(5, random_state=42)

Unnamed: 0,num_sentences,num_words,num_characters,num_tokens,file_name
1362,29,1092,7100,1775,volkswagen-golf-sv-review-hatchback-2018.json
953,26,830,5531,1382,peugeot-207-cc-convertible-2007-expert-review....
952,31,1134,7370,1842,peugeot-2008-review-suv-2019.json
1129,50,1430,9042,2260,seat-tarraco-review-SUV-2018.json
30,29,997,6277,1569,aston-martin-db12-review.json


In [21]:
# Summary statistics of the numeric columns for expert reviews
df_er.describe()

Unnamed: 0,num_sentences,num_words,num_characters,num_tokens
count,1439.0,1439.0,1439.0,1439.0
mean,32.313412,1114.377345,7129.875608,1782.086171
std,8.615236,262.660836,1574.51648,393.627293
min,2.0,95.0,724.0,181.0
25%,26.0,939.0,6038.0,1509.0
50%,31.0,1076.0,6881.0,1720.0
75%,37.0,1264.5,8030.5,2007.5
max,63.0,2168.0,13289.0,3322.0


In [22]:
# Long-term review statistics: one row of text statistics per JSON file found
# under the long-term reviews directory
df_lr = process_category(base_directory_lr)

In [24]:
# Preview five rows; the fixed seed keeps the preview identical across re-runs
df_lr.sample(5, random_state=42)

Unnamed: 0,num_sentences,num_words,num_characters,num_tokens,file_name
9,25,1046,6301,1575,audi-tt-s-roadster-long-term-test-review-secon...
42,102,3352,19780,4945,living-with-a-ds-automobiles-ds-4.json
0,14,867,5405,1351,audi-a3-e-tron-long-term-test-review-final-rep...
113,15,787,5146,1286,volvo-xc90-t8-inscription-long-term-test-revie...
44,55,1517,9391,2347,living-with-a-ford-focus-active-x-estate.json


In [25]:
# Summary statistics of the numeric columns for long-term reviews
df_lr.describe()

Unnamed: 0,num_sentences,num_words,num_characters,num_tokens
count,115.0,115.0,115.0,115.0
mean,48.826087,1630.695652,9909.956522,2477.113043
std,43.766177,1208.040556,7088.369575,1772.074291
min,8.0,509.0,3231.0,807.0
25%,17.0,765.5,4925.0,1231.0
50%,25.0,1016.0,6301.0,1575.0
75%,75.5,2396.0,14229.5,3557.0
max,264.0,8451.0,49616.0,12404.0
