<h1>Loading Sample Dataset</h1>

In [1]:
import json
import re
import string

# Location of the sampled dataset file (one JSON document per line).
sampled_dataset_filename = "/home/hdoop/Documents/BD_A3/sample_1gb_dataset.json"

# Function to load the sampled dataset
def load_sampled_dataset(filename):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line of the file is parsed as one JSON document.

    Args:
        filename: Path to a UTF-8 encoded file holding one JSON value per line.

    Returns:
        list: The parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        # Blank or whitespace-only lines carry no record, and json.loads
        # would raise on them, so they are skipped instead of aborting the load.
        return [json.loads(line) for line in f if line.strip()]

# Read the whole sampled file into memory as a list of parsed records.
sampled_dataset = load_sampled_dataset(filename=sampled_dataset_filename)

<h2>Preprocessing</h2>

In [2]:
# function to handle html tags
# Matches any <...> tag, or a well-formed character reference such as
# &amp;, &#39; or &#x27;. The reference part is deliberately strict: a loose
# pattern like &[^;]+; would delete ordinary text lying between a literal '&'
# and the next ';' (e.g. "Tom & Jerry; friends").
_HTML_MARKUP_RE = re.compile(
    r'(<[^>]*>)|(&(?:#[0-9]+|#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);)'
)


def remove_html_tags(text):
    """Delete HTML tags and character references from ``text``.

    Tags and references are removed outright (not decoded), so ``'&amp;'``
    becomes the empty string rather than ``'&'``.

    Args:
        text: The string to clean. Falsy values (``None`` or ``''``) are
            returned unchanged.

    Returns:
        The input with every ``<...>`` tag and every named, decimal or
        hexadecimal character reference removed.
    """
    return _HTML_MARKUP_RE.sub('', text) if text else text

# function to handle punctuation
def remove_punctuation(text):
    """Strip every character listed in ``string.punctuation`` from ``text``.

    Falsy input (``None`` or ``''``) is handed back unchanged.
    """
    if not text:
        return text
    # Deletion-only table: each punctuation code point maps to None.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)

# function to convert to float if needed
def convert_to_float(value):
    """Convert a price-like value to ``float``, or return ``None``.

    Args:
        value: A number, or a string such as ``'$12.99'`` or ``'$1,299.00'``.
            Dollar signs, commas and surrounding whitespace are ignored;
            commas are assumed to be thousands separators.

    Returns:
        float or None: The parsed number. ``None`` is returned for ``None``,
        empty strings, booleans, unsupported types, and text that is not a
        single number (for example a range like ``'$5.00 - $10.00'``).
    """
    # bool is a subclass of int, but True/False is never a meaningful price.
    if value is None or isinstance(value, bool):
        return None
    try:
        if isinstance(value, (int, float)):
            # Already numeric: calling str methods on it would raise.
            return float(value)
        if not isinstance(value, str):
            return None
        cleaned = value.replace('$', '').replace(',', '').strip()
        return float(cleaned) if cleaned else None
    except (ValueError, OverflowError):
        return None
# function to clean and format the dataset
def preprocess_dataset(dataset):
    """Build cleaned product records from raw dataset entries.

    For each entry, plain-string text fields have HTML markup and punctuation
    removed, a string ``price`` is parsed to ``float``, and the entry is kept
    only if it has a non-empty ``asin`` and ``title`` after cleaning and none
    of its ``feature`` items contains an ``<a href=`` link.

    The input entries are not modified; each kept entry yields a new dict.

    Args:
        dataset: Iterable of dict records.

    Returns:
        list[dict]: Records with exactly the keys ``asin``, ``title``,
        ``feature``, ``description``, ``price``, ``imageURL`` and ``brand``.
    """
    def clean_text(value):
        # Only plain strings are cleaned; lists and other types pass through.
        # NOTE(review): list-valued fields (e.g. feature / description lists)
        # therefore keep their original text - confirm this is intended.
        if isinstance(value, str):
            return remove_punctuation(remove_html_tags(value))
        return value

    cleaned_dataset = []
    for entry in dataset:
        asin = clean_text(entry.get("asin", ""))
        title = clean_text(entry.get("title", ""))
        # Require both identifiers to be non-empty once cleaned.
        if not (asin and title):
            continue

        features = clean_text(entry.get("feature", []))
        # exclude entries with HTML tags containing links; tolerate a missing
        # (None) feature value and non-string items instead of raising.
        if any(isinstance(item, str) and '<a href=' in item
               for item in (features or [])):
            continue

        price = entry.get("price")
        if isinstance(price, str):
            # Parse the price from tag-stripped but otherwise raw text:
            # stripping punctuation first would delete the decimal point
            # and turn "$12.99" into 1299.0.
            price = convert_to_float(remove_html_tags(price))

        cleaned_dataset.append({
            "asin": asin,
            "title": title,
            "feature": features,
            "description": clean_text(entry.get("description", "")),
            "price": price,
            # URLs are passed through untouched: removing punctuation from a
            # string URL would strip ':', '/' and '.' and make it unusable.
            "imageURL": entry.get("imageURL", ""),
            "brand": clean_text(entry.get("brand", "")),
        })
    return cleaned_dataset


# Run the cleaning pipeline over every loaded record.
preprocessed_dataset = preprocess_dataset(dataset=sampled_dataset)

In [3]:
# Print the first five records of the preprocessed dataset, one per line.
# Slicing keeps this safe when fewer than five records exist.
for record in preprocessed_dataset[:5]:
    print(record)

{'asin': '9543894027', 'title': 'Blue Simulated Sapphire Zirconia Austrian Crystals Round Set Pendant Necklace 18 Earrings Bracelet 18 ct Gold Plated', 'feature': ['Comes presented in a beautiful gift box with certificate, perfect for gifting or personal keeping. Beautiful and modern design suitable for ladies of any age who like the luxury.', 'Size: see detail photo. Length of the chain of the necklace 40cm+5cm (17.70 "). Lenght of the bracelet: 6.6 " Width: 0.31 "', 'Color: blue', 'Made of ideal cut Zirconia, this jewelry shines with grace and beauty. The metal surface is of high polished finish.', 'The crystals of this jewelry are not precious or semi-precious gemstones. They are simulated gemstones - imitation. Metal Type: gold plated base metal, nickel free.'], 'description': ['Charming set, encrusted with shining zirconia. Perfect quality production, beautiful and modern design suitable for ladies of any age who like the luxury. This stunning 18 carat gold plated set with high qu

<h3>Saving Dataset</h3>

In [4]:
def save_preprocessed_dataset(dataset, filename):
    """Write ``dataset`` to ``filename`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes,
    and nested structures are indented by four spaces. An existing file at
    ``filename`` is overwritten.
    """
    json_options = {"ensure_ascii": False, "indent": 4}
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(dataset, out_file, **json_options)

# Destination path for the cleaned records.
output_filename = "/home/hdoop/Documents/BD_A3/preprocessed_dataset.json"
# Persist the preprocessed dataset to that file.
save_preprocessed_dataset(dataset=preprocessed_dataset, filename=output_filename)