Loading the dataset (1 GB sample)

In [1]:
import json
import re
# Location of the sampled dataset file that the loading code in this notebook reads.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
sampled_dataset_filename= '/home/emad/sample_1gb_dataset.json'
# Function to load the sampled dataset
def load_sampled_dataset(filename):
    """Load a line-delimited JSON file into a list of parsed records.

    Args:
        filename: Path to a UTF-8 text file holding one JSON document per line.

    Returns:
        list: The parsed records, in file order. Blank or whitespace-only
        lines are skipped rather than parsed.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    dataset = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (for example a trailing newline at the end of the
            # file) is not a record, and json.loads would raise on it.
            if not line.strip():
                continue
            # Parse each remaining line as JSON and append to the dataset list
            dataset.append(json.loads(line))
    return dataset

# Load every record of the sample into memory as a list.
sampled_dataset= load_sampled_dataset(sampled_dataset_filename)

# Sanity check: report how many records were parsed.
print("Number of records in the sampled dataset:", len(sampled_dataset))


Number of records in the sampled dataset: 102562


Preprocessing with batch processing

In [5]:
# function to handle html tags
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` tag deleted.

    Falsy input (``None`` or an empty string) is handed back unchanged.
    Only the tags themselves are removed; the text between them is kept.
    """
    # Nothing to strip: return the input as-is so None stays None.
    if not text:
        return text
    # A tag is '<', one or more non-'>' characters, then '>'.
    return re.sub(r'<[^>]+>', '', text)
# function to convert to float if needed
def convert_to_float(value):
    """Convert a price-like value to ``float``.

    Strings are cleaned before conversion: dollar signs and thousands
    separators (commas) are removed and surrounding whitespace is stripped,
    so ``"$1,299.00"`` becomes ``1299.0``. Commas are treated as thousands
    separators, never as decimal marks.

    Args:
        value: A price as a string (e.g. ``"$12.50"``), an ``int``/``float``,
            or ``None``.

    Returns:
        float | None: The numeric value, or ``None`` when ``value`` is
        ``None``, empty, of an unsupported type, or not parseable as a number.
    """
    # bool is a subclass of int, but True/False is never a meaningful price.
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, str):
        value = value.replace('$', '').replace(',', '').strip()
        if not value:
            return None
    elif not isinstance(value, (int, float)):
        # Lists, dicts, bytes, ... cannot be interpreted as a price.
        return None
    try:
        return float(value)
    except (ValueError, OverflowError):
        # Non-numeric text, or an integer too large for a float.
        return None
# function to clean and format the dataset
def preprocess_dataset(dataset):
    """Clean raw product entries and project them onto a fixed set of fields.

    For every entry that has a non-empty ``asin`` and ``title``:

    * a string ``price`` is converted with ``convert_to_float`` (so it
      becomes a ``float`` or ``None``); non-string prices pass through as-is;
    * HTML tags are stripped from ``description`` with ``remove_html_tags``
      when it is a string, or from each string element when it is a list;
    * the remaining fields are copied, with an empty default when missing.

    Entries lacking ``asin`` or ``title`` are dropped. The input entries are
    not modified.

    Args:
        dataset: Iterable of dict-like raw records.

    Returns:
        list[dict]: One cleaned record per retained entry, in input order.
    """
    cleaned_dataset = []
    for entry in dataset:
        # filter entries without a product ID or title
        if not (entry.get('asin') and entry.get('title')):
            continue

        # handle price conversion here; only string prices need parsing
        price = entry.get('price')
        if isinstance(price, str):
            price = convert_to_float(price)

        # handle html tags; the check is on the description itself, so a
        # missing or already-converted price cannot affect the cleaning.
        description = entry.get('description', "")
        if isinstance(description, str):
            description = remove_html_tags(description)
        elif isinstance(description, list):
            # NOTE(review): presumably some records store the description as
            # a list of text fragments — confirm against the dataset schema.
            description = [
                remove_html_tags(part) if isinstance(part, str) else part
                for part in description
            ]

        cleaned_dataset.append({
            "asin": entry.get("asin", ""),
            "title": entry.get("title", ""),
            "feature": entry.get("feature", []),
            "description": description,
            "price": price,
            "imageURL": entry.get("imageURL", ""),
            "highResolutionImageURL": entry.get("highResolutionImageURL", ""),
            "related": entry.get("related", {}),
            "salesRank": entry.get("salesRank", {}),
            "brand": entry.get("brand", ""),
            "categories": entry.get("categories", []),
            "tech1": entry.get("tech1", {}),
            "tech2": entry.get("tech2", {}),
            "similar": entry.get("similar", {}),
        })
    return cleaned_dataset

# Run the cleaning pass over the full in-memory sample.
preprocessed_data = preprocess_dataset(sampled_dataset)

# Destination file for the cleaned records.
output_filename = '/home/emad/Documents/assgn/preprocessed_dataset.json'

# Serialise the cleaned records as line-delimited JSON: one document per line.
with open(output_filename, 'w', encoding='utf-8') as out_file:
    for entry in preprocessed_data:
        out_file.write(json.dumps(entry) + '\n')
print("Preprocessed data has been written to:", output_filename)

Preprocessed data has been written to: /home/emad/Documents/assgn/preprocessed_dataset.json


In [10]:
# Define the number of records you want to view
number_of_records_to_view = 5

# Path to the JSON file containing preprocessed data
output_filename = '/home/emad/Documents/assgn/preprocessed_dataset.json'

# Read the JSON file and print the first few records
with open(output_filename, 'r', encoding='utf-8') as file:
    for i in range(number_of_records_to_view):
        line = file.readline()
        # readline() returns '' at end of file; stop there instead of handing
        # an empty string to json.loads when the file holds fewer records
        # than requested.
        if not line:
            break
        # Parse the line as JSON and print
        print(json.loads(line))

# This prints up to `number_of_records_to_view` records from the start of the file


{'asin': '6342509379', 'title': "QIBOE Men's Baggy Jeans Denim Sweatpants Loose Pants", 'feature': ['Denim', 'Zipper closure', 'Material: cotton', 'Style: hip pop', 'Two side slant pockets and two back pockets', 'Straight fit long pants', '<span class="a-text-bold">Shipping Information:\n                    </span>\n                    <span><a href=\'https://www.amazon.com/gp/help/seller/shipping.html/ref=dp_pd_shipping?_encoding=UTF8&amp;seller=A35KZH1XOKRBK&amp;asin=B00UHOT9IS\'>View shipping rates and policies</a></span>', '<span class="a-text-bold">ASIN:\n                    </span>\n                    <span>B00UHOT9IS</span>', '', '<span class="a-text-bold">Date first listed on Amazon:\n                    </span>\n                    <span>March 10, 2015</span>', '<span class="a-text-bold">\n                    Average Customer Review:\n                </span>\n                \n\n\n\n\n<style type="text/css">\n    /* \n    * Fix for UDP-1061. Average customer reviews has a sma

In [9]:
# Function to load a chunk of the sampled dataset
def load_chunk(filename, chunk_size, skip_records=0):
    """Read up to ``chunk_size`` records from a line-delimited JSON file.

    Args:
        filename: Path to a UTF-8 text file holding one JSON document per line.
        chunk_size: Maximum number of records to return. Zero or a negative
            value yields an empty list.
        skip_records: Number of leading records to pass over before
            collecting. Defaults to 0 (read from the start of the file).
            Passing ``chunk_size * batch_index`` lets a caller walk the file
            batch by batch; skipped lines are read but not parsed.

    Returns:
        list: The parsed records, in file order. The list is shorter than
        ``chunk_size`` (possibly empty) when the file runs out of records.
        Blank or whitespace-only lines are ignored and do not count as records.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a collected line is not valid JSON.
    """
    chunk = []
    records_seen = 0
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Stop as soon as the chunk is full (also covers chunk_size <= 0).
            if len(chunk) >= chunk_size:
                break
            payload = line.strip()
            if not payload:
                # Blank lines are not records; json.loads would raise on them.
                continue
            if records_seen >= skip_records:
                chunk.append(json.loads(payload))
            records_seen += 1
    return chunk

chunk_size = 1000
# Process the dataset in batches
while True:
    # Load a chunk of the sampled dataset.
    # NOTE: called with only a filename and a size, load_chunk reads from the
    # start of the file every time, so this loop sees the same first chunk on
    # each pass. The `break` at the bottom is what keeps it from running
    # forever; it must stay until the loop advances through the file.
    chunk = load_chunk(sampled_dataset_filename, chunk_size)
    if not chunk:
        break
    # Preprocess the chunk
    preprocessed_chunk = preprocess_dataset(chunk)
    # Print some sample preprocessed records
    print("Sample Preprocessed Records:")
    for record in preprocessed_chunk[:5]:  # Print the first 5 records
        print(json.dumps(record, indent=2))  # Pretty print the record
    # Collect the column names from the first cleaned record. The chunk can be
    # non-empty while every record in it was filtered out, so guard the [0]
    # lookup instead of raising IndexError.
    preprocessed_columns = set(preprocessed_chunk[0].keys()) if preprocessed_chunk else set()
    # Print the columns found in the preprocessed dataset
    print("Columns in the preprocessed dataset:")
    for column in sorted(preprocessed_columns):
        print(column)
    break  # Break after processing the first chunk for testing
print("Preprocessing completed.")


Sample Preprocessed Records:
{
  "asin": "6342509379",
  "title": "QIBOE Men's Baggy Jeans Denim Sweatpants Loose Pants",
  "feature": [
    "Denim",
    "Zipper closure",
    "Material: cotton",
    "Style: hip pop",
    "Two side slant pockets and two back pockets",
    "Straight fit long pants",
    "<span class=\"a-text-bold\">Shipping Information:\n                    </span>\n                    <span><a href='https://www.amazon.com/gp/help/seller/shipping.html/ref=dp_pd_shipping?_encoding=UTF8&amp;seller=A35KZH1XOKRBK&amp;asin=B00UHOT9IS'>View shipping rates and policies</a></span>",
    "<span class=\"a-text-bold\">ASIN:\n                    </span>\n                    <span>B00UHOT9IS</span>",
    "",
    "<span class=\"a-text-bold\">Date first listed on Amazon:\n                    </span>\n                    <span>March 10, 2015</span>",
    "<span class=\"a-text-bold\">\n                    Average Customer Review:\n                </span>\n                \n\n\n\n\n<styl