In [57]:
from scrapfly import ScrapflyClient, ScrapeConfig, ScrapflyError
import os

# The Scrapfly API key is read from the SCRAPFLY_KEY environment variable so
# that no credential is stored in the notebook. A missing variable fails fast
# with KeyError instead of sending an unauthenticated request.
# NOTE(review): the key that used to be hard-coded on this line has been
# exposed in the notebook history and should be rotated.
scrapfly = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"], max_concurrency=2)

# Product page to fetch.
url = 'https://www.target.com/p/-/A-24007088'

# Fetch the page through Scrapfly; the option values are kept exactly as before
# (see the Scrapfly SDK documentation for what each ScrapeConfig flag does).
listing_result = scrapfly.scrape(ScrapeConfig(url=url, render_js=True, country="US", asp=True, retry=False, rendering_wait=10000))

# Page body returned by the scrape, as a string of HTML.
listing_html = listing_result.scrape_result['content']

In [58]:
# `os` is imported here because the notebook's shared import cell comes later
# in the document; without this a fresh "Restart & Run All" fails with NameError.
import os

# Directory and file used to keep a local copy of the fetched listing HTML.
# NOTE(review): this is an absolute, machine-specific path - consider a
# configurable data directory before sharing the notebook.
input_dir = r"C:\Users\SudheerRChinthala\circana\Srikanth\Data"
input_file = os.path.join(input_dir, "TargetListing.html")

# Write the HTML as UTF-8 so non-ASCII characters in the page survive the round trip.
with open(input_file, 'w', encoding='utf-8') as f:
    f.write(listing_html)

In [59]:

def format_product_name(name, max_length=30):
    """Turn a product name into a short identifier made of letters, digits and underscores.

    Args:
        name: Product name. Any falsy value (``None``, ``""``) is replaced by
            ``"unknown_product"``.
        max_length: Maximum number of characters kept in the result.

    Returns:
        The name with every run of non-alphanumeric characters collapsed into a
        single underscore, cut to ``max_length`` characters.
    """
    source = name if name else "unknown_product"
    # One underscore per run of characters outside [a-zA-Z0-9], then truncate.
    return re.sub(r'[^a-zA-Z0-9]+', '_', source)[:max_length]

def sanitize_unescaped_quotes_and_load_json_str(s: str, strict=False) -> dict:  # type: ignore
    """Parse JSON text that may contain unescaped double quotes inside string values.

    The text is parsed repeatedly. After each ``JSONDecodeError`` the closest
    double quote before the reported error position is escaped (``"`` becomes
    ``\\"``) and parsing is retried. The loop stops when parsing succeeds or when
    an attempt fails to move the error position forward.

    Limitation: if the unescaped text itself forms valid JSON, the input parses
    "successfully" with the wrong structure and this function cannot detect it.

    Args:
        s: JSON text to parse.
        strict: Passed through to ``json.loads``.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: When the text cannot be repaired, i.e. the error
            position does not advance, or no double quote precedes the error
            (so the failure is not an unescaped-quote problem).
    """
    js_str = s
    # Error position of the previous attempt; starting at 0 means an error at
    # the very first character is never treated as repairable.
    prev_pos = 0
    while True:
        try:
            return json.loads(js_str, strict=strict)
        except json.JSONDecodeError as err:
            if err.pos <= prev_pos:
                # The previous edit did not make progress, so give up.
                raise

            # Closest double quote before the error position.
            prev_quote_index = js_str.rfind('"', 0, err.pos)
            if prev_quote_index == -1:
                # No quote to escape. Slicing with -1 would splice a backslash
                # in front of the last character and corrupt the document, so
                # report the original error instead.
                raise

            # Escape that quote and retry.
            js_str = js_str[:prev_quote_index] + "\\" + js_str[prev_quote_index:]
            prev_pos = err.pos
            
def transform_json(input_data):
    """Flatten a parsed product payload into the record shape built by this notebook.

    Args:
        input_data: Parsed JSON expected to hold the product under
            ``input_data["data"]["product"]``. ``None`` or an unexpected shape
            is tolerated: every field that cannot be resolved becomes ``None``
            and optional sections are skipped.

    Returns:
        dict with the keys ``UPC``, ``keyCat``, ``itemID`` (always ``""``) and
        ``keyval`` - a one-element list holding the flattened attribute dict.
    """
    output_data = {}
    # Labels of fields that were absent or null. They are only collected here;
    # nothing in this function returns or reports them.
    missing_keys = []

    def get_value(dct, path, key_name=None):
        """Follow ``path`` through nested dicts/lists; return None if any step is missing."""
        value = dct
        try:
            for step in path:
                value = value[step]
        except (KeyError, TypeError, IndexError):
            value = None
        if value is None:
            missing_keys.append(key_name or path[-1])
        return value

    product_data = get_value(input_data, ["data", "product"], "product")
    output_data["UPC"] = get_value(product_data, ["item", "primary_barcode"], "upc")
    output_data["keyCat"] = get_value(product_data, ["category", "name"], "keycat")
    output_data["itemID"] = ""
    output_data["keyval"] = []

    transformed_item = {
        "productName": get_value(product_data, ["item", "product_description", "title"], "product_name"),
        "tcin": get_value(product_data, ["tcin"], "tcin"),
        "category": get_value(product_data, ["category", "name"], "Category"),
        # The output key is "ppci" although the source field is "dpci"; kept as-is
        # because consumers of this record may rely on the existing key.
        "ppci": get_value(product_data, ["item", "dpci"], "dpci"),
        "buyUrl": get_value(product_data, ["item", "enrichment", "buy_url"], "buy_url")
    }

    # Image URLs: at most the first 10, stored as image_url1 .. image_url10.
    content_labels = get_value(product_data, ["item", "enrichment", "images", "content_labels"], "content_labels")
    if isinstance(content_labels, list):
        for index in range(min(len(content_labels), 10)):
            image_url_key = f"image_url{index + 1}"
            transformed_item[image_url_key] = get_value(content_labels, [index, "image_url"], image_url_key)

    # Nutrition facts: only the first entry of 'value_prepared_list' is used.
    value_prepared_list = get_value(product_data, ["item", "enrichment", "nutrition_facts", "value_prepared_list"], "value_prepared_list")
    if isinstance(value_prepared_list, list) and value_prepared_list and isinstance(value_prepared_list[0], dict):
        first_entry = value_prepared_list[0]
        transformed_item["description"] = first_entry.get("description", None)
        transformed_item["servingSize"] = first_entry.get("serving_size", None)
        transformed_item["servingSizeUnitOfMeasurement"] = first_entry.get("serving_size_unit_of_measurement", None)
        transformed_item["servingsPerContainer"] = first_entry.get("servings_per_container", None)

        # `or []` also covers an explicit null ("nutrients": null) in the payload.
        for nutrient in first_entry.get("nutrients") or []:
            if not isinstance(nutrient, dict):
                continue
            name = nutrient.get("name", "")
            if not name:
                continue
            transformed_item[f"{name} quantity"] = nutrient.get("quantity", None)
            if "percentage" in nutrient:
                transformed_item[f"{name} percentage"] = nutrient.get("percentage", None)
            transformed_item[f"{name} unit_of_measurement"] = nutrient.get("unit_of_measurement", None)

    # Merchandise classification.
    # NOTE(review): this is the only camelCase key in the lookup paths; every
    # other path uses snake_case - confirm the key name against a real payload.
    merchandise_classification = get_value(product_data, ["item", "merchandiseClassification"], "merchandiseClassification")
    if isinstance(merchandise_classification, dict) and merchandise_classification:
        transformed_item["classId"] = merchandise_classification.get("class_id", None)
        transformed_item["departmentId"] = merchandise_classification.get("department_id", None)
        transformed_item["merchandiseClassificationDepartmentName"] = merchandise_classification.get("department_name", None)

    # Package dimensions: every entry is copied under a "package_dimensions " prefix.
    package_dimensions = get_value(product_data, ["item", "package_dimensions"], "package_dimensions")
    if isinstance(package_dimensions, dict):
        for key, value in package_dimensions.items():
            transformed_item[f"package_dimensions {key}"] = value

    # Brand and product classification.
    transformed_item["primaryBrand"] = get_value(product_data, ["item", "primary_brand", "name"], "primary_brand")
    transformed_item["productClassification"] = get_value(product_data, ["item", "product_classification", "product_type_name"], "product_classification")

    # Bullet descriptions of the form "Label: value" become individual fields.
    bullet_descriptions = get_value(product_data, ["item", "product_description", "bullet_descriptions"], "bullet_descriptions")
    if isinstance(bullet_descriptions, list):
        for desc in bullet_descriptions:
            if not isinstance(desc, str) or ':' not in desc:
                continue
            # Split on the FIRST colon only, so a value that itself contains a
            # colon ("Contains: Milk: whole") is kept whole instead of truncated.
            raw_key, _, raw_value = desc.partition(":")
            # These literals evaluate to "<Bu003e" and "</Bu003e"; presumably they
            # target bold tags whose closing escape lost its backslash upstream
            # - verify against real data.
            key = raw_key.replace("\u003cBu003e", "").replace("\u003c/Bu003e", "").strip()
            transformed_item[key] = raw_value.strip()

    # Additional free-text descriptions.
    transformed_item["downstreamDescription"] = get_value(product_data, ["item", "product_description", "downstream_description"], "downstream_description")
    transformed_item["soft_bulletDescription"] = get_value(product_data, ["item", "product_description", "soft_bullet_description"], "soft_bullet_description")

    # Price details: every entry is copied under a "price " prefix.
    price_details = get_value(product_data, ["price"], "price")
    if isinstance(price_details, dict):
        for key, value in price_details.items():
            transformed_item[f"price {key}"] = value

    output_data["keyval"].append(transformed_item)
    return output_data
    
def extract_product_data(json_string):
    """Locate the product JSON object inside a larger string and parse it.

    The object is found with a regular expression: it must start with
    ``{"data":{"product"`` and end at the first following
    ``"metadata":{"status":200}}``.

    Args:
        json_string: Text that contains the product JSON somewhere inside it.

    Returns:
        The parsed JSON value, or ``None`` (after printing a message) when the
        pattern is not found. If plain parsing fails, the decode error is
        printed and the text is passed to
        ``sanitize_unescaped_quotes_and_load_json_str``; an error raised there
        propagates to the caller.
    """
    pattern = r'(\{"data":\{"product".*?"metadata":\{"status":200\}\})'
    match = re.search(pattern, json_string)

    if not match:
        print("Error: The specified pattern was not found in the input string.")
        return None

    extracted_json_string = match.group(1)
    try:
        return json.loads(extracted_json_string)
    except json.JSONDecodeError as err:
        # Print the actual error (message and position), not the exception class.
        print(err)
        return sanitize_unescaped_quotes_and_load_json_str(extracted_json_string)

In [60]:
from scrapfly import ScrapflyClient, ScrapeConfig, ScrapflyError
from pathlib import Path
import html
import json
import json
import re
import os

def format_data(data):
    """Return ``data`` as a Python object, evaluating it first when it is a string.

    Args:
        data: A ``dict`` or ``list`` (returned unchanged) or a ``str`` holding a
            Python expression (evaluated and its result returned).

    Returns:
        The original dict/list, or the result of evaluating the string.

    Raises:
        TypeError: If ``data`` is not a string, dict, or list.
    """
    if isinstance(data, str):
        # SECURITY NOTE(review): eval() runs arbitrary Python code. Only call
        # this with trusted strings; for scraped or otherwise external text
        # consider json.loads or ast.literal_eval instead.
        return eval(data)
    if isinstance(data, (dict, list)):
        # Already a Python object, nothing to evaluate.
        return data
    raise TypeError("data must be a string, dict, or list")

In [55]:
import jsonpath_ng as jp
import os
from bs4 import BeautifulSoup
import json
from pprint import pprint
import re
import pyparsing

# pyparsing expression that matches C/C++-style comments and suppresses them,
# so transform_string() returns the text with those comments removed.
comment_remover = pyparsing.cpp_style_comment.suppress()
# Quoted strings are ignored while scanning, so comment markers that appear
# inside a string literal (e.g. the "//" of a URL) are left untouched.
comment_remover.ignore(pyparsing.QuotedString('"') | pyparsing.QuotedString("'"))

# Parse the fetched listing HTML and collect every <script> element; the
# script-scanning code further down iterates over `script_tags`.
soup = BeautifulSoup(listing_html, 'html.parser')   
script_tags = soup.find_all('script')
def remove_objects(text):
    """Replace non-literal values of quoted keys in ``text`` with ``{}``.

    Comments are stripped first with the module-level ``comment_remover``.
    Then every ``"key": value`` occurrence matched by the pattern below is
    rewritten to ``"key": {}``, keeping whatever terminated the value
    (``,``, ``}`` or end of string).

    NOTE(review): in the pattern, ``(?:true|false)`` sits inside a character
    class, so it excludes values starting with any of those individual
    characters rather than the words themselves - confirm this is intended
    before relying on the function.
    """
    without_comments = comment_remover.transform_string(text)
    object_value_pattern = r'("[^"]+?"\s*:\s*)([^"\s[{\d(?:true|false)].+?)(,|$|})'
    # Group 1 is the quoted key plus colon, group 3 is the terminator; the
    # value in group 2 is dropped in favour of an empty object.
    return re.sub(
        object_value_pattern,
        lambda found: found.group(1) + "{}" + found.group(3),
        without_comments,
    )


# Find the inline <script> that sets up window.__TGT_DATA__ and pull out the
# string literal passed to JSON.parse("...").
# Reset first so a value left in the kernel by an earlier run is never reused.
cleaned_json_str = None
for script in script_tags:
    if not (script.string and 'Object.defineProperties(window,' in script.string):
        continue

    js_code = script.string
    preprocessed_data = re.sub(r'\\r\\n', '', js_code)  # Remove literal "\r\n" escape sequences
    preprocessed_data = preprocessed_data.replace('\\', '')  # Remove every remaining backslash
    preprocessed_data = preprocessed_data.split('__,__')[0]  # Keep only the text before the first '__,__'
    preprocessed_data = preprocessed_data.replace(';', '')
    preprocessed_data = re.sub(r'[^\x00-\x7F]+', '', preprocessed_data)  # Remove non-ASCII characters

    start_index = preprocessed_data.find('__TGT_DATA__')
    if start_index == -1:
        continue

    # Check the find() result BEFORE adding the prefix length; once the length
    # has been added, a "not found" (-1) can no longer be detected.
    parse_call_index = preprocessed_data.find('JSON.parse("', start_index)
    if parse_call_index == -1:
        continue
    json_start = parse_call_index + len('JSON.parse("')
    json_end = preprocessed_data.find('")', json_start)
    if json_end == -1:
        continue

    json_str = preprocessed_data[json_start:json_end]
    cleaned_json_str = json_str.replace('\\"', '"')

if cleaned_json_str is None:
    # Fail with a clear message instead of a NameError further down.
    raise ValueError("No <script> with a __TGT_DATA__ JSON.parse(\"...\") payload was found in listing_html")

json_content = cleaned_json_str.replace('\\', '')
# The backslashes were stripped above, so \uXXXX escapes now appear as a bare
# 'u' followed by four hex digits; turn those back into characters.
# NOTE(review): this also rewrites ordinary text where 'u' happens to be
# followed by four hex digits - verify on real data.
json_content = re.sub(r'u([0-9A-Fa-f]{4})', lambda x: chr(int(x.group(1), 16)), json_content)
json_content = re.sub(r'<.*?>', '', json_content)  # Remove anything that looks like an HTML tag
json_content = html.unescape(json_content)  # Decode HTML entities such as &amp;

# Parse the product JSON, then flatten it into the output record.
result = extract_product_data(json_content)
result = transform_json(result)

In [61]:
result

{'UPC': '048500201800',
 'keyCat': None,
 'itemID': '',
 'keyval': [{'productName': 'Starbucks Iced Pumpkin Spice Latte Espresso Beverage - 40 fl oz',
   'tcin': '24007088',
   'category': None,
   'ppci': '284-04-1021',
   'buyUrl': 'https://www.target.com/p/starbucks-iced-pumpkin-spice-latte-espresso-beverage-40-fl-oz/-/A-24007088',
   'image_url1': 'https://target.scene7.com/is/image/Target/GUEST_2e74c2ee-40dc-4b0e-82bc-50b0fe395937',
   'image_url2': 'https://target.scene7.com/is/image/Target/GUEST_7c5f43e0-d00e-40e0-a882-0501fa514356',
   'image_url3': 'https://target.scene7.com/is/image/Target/GUEST_3b81a926-e1dc-44da-a4f6-ec7f20493858',
   'image_url4': 'https://target.scene7.com/is/image/Target/GUEST_4eaf042c-3a29-43df-b2b1-b9fce13e0134',
   'image_url5': 'https://target.scene7.com/is/image/Target/GUEST_f8386a76-b0ff-450a-99c2-4e9e9fee668a',
   'description': 'Amount per serving',
   'servingSize': '12',
   'servingSizeUnitOfMeasurement': 'fl oz',
   'servingsPerContainer': 'ab