In [3]:
import os

In [4]:
%pwd

'c:\\Users\\deept\\ShopTalk\\research'

In [5]:
# Step up one directory so later relative paths resolve from the parent folder
# (the recorded %pwd outputs show research/ -> ShopTalk/).
# NOTE: not idempotent - every re-run of this cell climbs one more level.
os.chdir("../")

In [6]:
%pwd

'c:\\Users\\deept\\ShopTalk'

In [7]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage."""

    # Directory that ConfigurationManager creates for this stage.
    root_dir: Path
    # Directory that DataTransformation scans for the input ``*.json`` files.
    data_dir: Path

In [8]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    # NOTE(review): machine-specific absolute paths. Inside this class they take
    # precedence over any same-named constants that the star import from
    # mlProject.constants may provide; pass explicit paths to __init__ to
    # run on another machine.
    CONFIG_FILE_PATH = r'C:\Users\deept\ShopTalk\config\config.yaml'
    PARAMS_FILE_PATH = r"C:\Users\deept\ShopTalk\params.yaml"
    SCHEMA_FILE_PATH = r"C:\Users\deept\ShopTalk\schema.yaml"

    def __init__(
        self,
        config_filepath = Path(CONFIG_FILE_PATH),
        params_filepath = Path(PARAMS_FILE_PATH),
        schema_filepath = Path(SCHEMA_FILE_PATH)):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation config and create its root directory.

        Returns:
            A ``DataTransformationConfig`` whose ``root_dir`` and ``data_dir``
            are ``Path`` objects, as the dataclass declares.
        """
        stage = self.config.data_transformation
        create_directories([stage.root_dir])

        # The dataclass declares Path fields; coerce the raw YAML values so the
        # declared type and the actual type agree.
        return DataTransformationConfig(
            root_dir=Path(stage.root_dir),
            data_dir=Path(stage.data_dir),
        )

In [10]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [11]:
class DataTransformation:
    """Data-transformation stage: train/test splitting and image-metadata merging."""

    def __init__(self, config: DataTransformationConfig):
        """Keep the stage configuration.

        Args:
            config: Supplies ``data_dir``, the folder whose JSON files are split.
        """
        self.config = config

    # Note: further transformation steps (scalers, PCA, EDA-driven cleaning, ...)
    # can be added to this class before the data reaches the model. Only the
    # train/test split is implemented because the data is treated as already clean.

    def train_test_spliting(self):
        """Split each ``*.json`` file in ``config.data_dir`` into train and test parts.

        Every file is read as JSON Lines, split 75/25 with a fixed seed (42) and
        written as ``Train/<stem>_train.json`` and ``Test/<stem>_test.json``
        inside ``data_dir``. The resulting shapes are logged and printed.

        Raises:
            Exception: Any error is logged with its traceback and re-raised.
        """
        try:
            # Base directory containing the JSON files
            data_dir = Path(self.config.data_dir)

            # Output subfolders live next to the inputs
            train_folder = data_dir / "Train"
            test_folder = data_dir / "Test"
            train_folder.mkdir(exist_ok=True)
            test_folder.mkdir(exist_ok=True)

            # iterdir() yields entries in arbitrary order; sorting keeps the
            # processing order (and the log) reproducible. is_file() ignores
            # directories whose name merely ends in ".json".
            json_files = sorted(
                f for f in data_dir.iterdir() if f.is_file() and f.suffix == ".json"
            )

            for json_file in json_files:
                data = pd.read_json(json_file, lines=True)

                # 75% train / 25% test, fixed seed for reproducibility
                train, test = train_test_split(data, test_size=0.25, random_state=42)

                train_file = train_folder / f"{json_file.stem}_train.json"
                test_file = test_folder / f"{json_file.stem}_test.json"

                train.to_json(train_file, orient='records', lines=True)
                test.to_json(test_file, orient='records', lines=True)

                logger.info(f"Split {json_file.name} into train and test sets")
                logger.info(f"Training set shape: {train.shape}")
                logger.info(f"Test set shape: {test.shape}")

                print(f"Training set shape for {json_file.name}: {train.shape}")
                print(f"Test set shape for {json_file.name}: {test.shape}")

        except Exception:
            logger.error("Error during train/test split: ", exc_info=True)
            raise  # Re-raise the exception after logging

    @staticmethod
    def merge_image_metadata(base_folder="C:/Users/deept/ShopTalk/artifacts"):
        """Join the processed Train/Test CSVs with the image metadata table.

        Decompresses ``images.csv.gz`` next to itself as ``images.csv``, reads it
        together with ``data_validation/Test/processed_Test.csv`` and
        ``data_validation/Train/processed_Train.csv``, merges each on
        ``main_image_id == image_id`` (pandas' default inner join) and writes the
        merged frames beside the inputs.

        Args:
            base_folder: Artifacts root. NOTE(review): the default is a
                machine-specific absolute path; pass another root to override.

        Returns:
            ``(test_csv_path_merged, train_csv_path_merged)``: the files written.
        """
        # Imported locally because only this step needs them.
        import gzip
        import shutil

        metadata_path = os.path.join(
            base_folder, "data_ingestion", "abo-images-small", "images", "metadata", "images.csv.gz"
        )
        validation_dir = os.path.join(base_folder, "data_validation")
        test_csv_path = os.path.join(validation_dir, "Test", "processed_Test.csv")
        train_csv_path = os.path.join(validation_dir, "Train", "processed_Train.csv")
        test_csv_path_merged = os.path.join(validation_dir, "Test", "processed_merged_Test.csv")
        # NOTE(review): the double underscore looks like a typo, but the name is
        # kept because other code may already read this file.
        train_csv_path_merged = os.path.join(validation_dir, "Train", "processed__merged_Train.csv")

        # Drop only the trailing ".gz"; str.replace would also rewrite a ".gz"
        # occurring anywhere else in the path.
        uncompressed_csv_path = metadata_path[:-len(".gz")]

        # Stream the bytes instead of reading the whole file as locale-encoded
        # text: constant memory and no encoding or newline translation.
        with gzip.open(metadata_path, "rb") as gz_file, open(uncompressed_csv_path, "wb") as uncompressed_file:
            shutil.copyfileobj(gz_file, uncompressed_file)

        image_metadata_df = pd.read_csv(uncompressed_csv_path)
        test_df = pd.read_csv(test_csv_path)
        train_df = pd.read_csv(train_csv_path)

        image_dataset_test = test_df.merge(image_metadata_df, left_on="main_image_id", right_on="image_id")
        image_dataset_train = train_df.merge(image_metadata_df, left_on="main_image_id", right_on="image_id")

        # The DataFrame index is written as the first column (to_csv default),
        # exactly as before.
        image_dataset_test.to_csv(test_csv_path_merged)
        image_dataset_train.to_csv(train_csv_path_merged)

        return test_csv_path_merged, train_csv_path_merged


# These steps used to sit directly in the class body, so they ran while the
# class was being defined and leaked every local name (and the imported
# modules) as class attributes. They now live in a static method that is
# called once here, so running this cell still produces the merged CSV files.
# NOTE: requires the processed_Test.csv / processed_Train.csv files to exist.
DataTransformation.merge_image_metadata()

In [2]:
import json
import os

# Function to concatenate all values in nested JSON objects
def concatenate_nested_values(nested_obj):
    """Join the values of a nested JSON element into one space-separated string.

    Args:
        nested_obj: Usually a dict; lists and scalars are accepted as well.

    Returns:
        For a dict, its values converted with ``str`` and joined by single
        spaces (unchanged behavior). For a list, every element flattened by
        this same function and joined. Anything else is returned as
        ``str(nested_obj)``.
    """
    if isinstance(nested_obj, dict):
        return " ".join(str(v) for v in nested_obj.values())
    if isinstance(nested_obj, list):
        return " ".join(concatenate_nested_values(v) for v in nested_obj)
    # Scalars (e.g. a list of plain strings in the source data) used to raise
    # AttributeError because they have no .values().
    return str(nested_obj)

# Root of the extracted listings metadata (where the source JSON files live)
base_path = "C:/Users/deept/ShopTalk/artifacts/data_ingestion/data_tar_extracted/listings/metadata"

# Test and Train source folders, both derived from the common root
test_folder_path, train_folder_path = (
    os.path.join(base_path, split_name) for split_name in ("Test", "Train")
)

# Fail fast when a source folder is missing (Test is checked first)
if not os.path.exists(test_folder_path):
    raise FileNotFoundError(f"Test folder not found: {test_folder_path}")
elif not os.path.exists(train_folder_path):
    raise FileNotFoundError(f"Train folder not found: {train_folder_path}")

# Fields copied from every JSON object when present
keys_to_check = [
    "item_id",
    "product_type",
    "brand",
    "model_name",
    "item_name_in_en_us",
    "bullet_point",
    "color",
    "style",
    "main_image_id",
    "item_keywords",
]

# Where the processed files are written
destination_folder = "C:/Users/deept/ShopTalk/artifacts/data_validation"

# One output subfolder per split
for _split in ("Test", "Train"):
    os.makedirs(os.path.join(destination_folder, _split), exist_ok=True)

# Helper function to process JSON files from a given folder and output them to a destination folder
def process_json_files(source_folder, output_folder):
    """Flatten selected fields of every JSON Lines file in a folder into one JSON file.

    Each non-blank line of each ``*.json`` file in ``source_folder`` is parsed
    as a JSON object. For every key of the module-level ``keys_to_check`` that
    the object contains: a list has each element passed through
    ``concatenate_nested_values`` and the results joined with spaces, a dict is
    passed through ``concatenate_nested_values``, and any other value is copied
    unchanged. Lines that are not valid JSON are reported with ``print`` and
    skipped.

    Args:
        source_folder: Folder containing the ``*.json`` input files.
        output_folder: Folder for the result; created if missing.

    Returns:
        Path of the written ``processed_<source folder name>.json`` file.
    """
    processed_data = []

    # os.listdir order is arbitrary; sort so the output order is reproducible.
    json_files = sorted(f for f in os.listdir(source_folder) if f.endswith(".json"))

    for json_file in json_files:
        file_path = os.path.join(source_folder, json_file)

        # JSON text is UTF-8; be explicit so the platform default encoding is
        # never used. Iterating the handle streams the file line by line.
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # blank line: nothing to parse, not an error

                try:
                    json_obj = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Skipping invalid JSON line: {line}")
                    continue

                new_obj = {}
                for key in keys_to_check:
                    if key not in json_obj:
                        continue
                    value = json_obj[key]
                    if isinstance(value, list):
                        # List of nested objects: flatten each, then join
                        new_obj[key] = " ".join(concatenate_nested_values(v) for v in value)
                    elif isinstance(value, dict):
                        new_obj[key] = concatenate_nested_values(value)
                    else:
                        new_obj[key] = value
                processed_data.append(new_obj)

    os.makedirs(output_folder, exist_ok=True)

    # normpath drops a trailing separator, which would otherwise make basename
    # return "" and produce "processed_.json".
    folder_name = os.path.basename(os.path.normpath(source_folder))
    output_file_path = os.path.join(output_folder, f"processed_{folder_name}.json")

    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, indent=4)  # pretty-printed

    return output_file_path

# Flatten the Test and Train source folders; each result is written into the
# matching subfolder of destination_folder.
test_output_path = process_json_files(test_folder_path, os.path.join(destination_folder, "Test"))
train_output_path = process_json_files(train_folder_path, os.path.join(destination_folder, "Train"))

# Trailing expression: the notebook displays this tuple of output file paths.
test_output_path, train_output_path  # where the generated files are stored

('C:/Users/deept/ShopTalk/artifacts/data_validation\\Test\\processed_Test.json',
 'C:/Users/deept/ShopTalk/artifacts/data_validation\\Train\\processed_Train.json')

In [4]:
import os
import json
import pandas as pd

# Root of the extracted listings metadata (where the source JSON files live)
base_path = "C:/Users/deept/ShopTalk/artifacts/data_ingestion/data_tar_extracted/listings/metadata"

# Test and Train source folders, both derived from the common root
test_folder_path, train_folder_path = (
    os.path.join(base_path, split_name) for split_name in ("Test", "Train")
)

# Fail fast when a source folder is missing (Test is checked first)
if not os.path.exists(test_folder_path):
    raise FileNotFoundError(f"Test folder not found: {test_folder_path}")
elif not os.path.exists(train_folder_path):
    raise FileNotFoundError(f"Train folder not found: {train_folder_path}")

# Fields copied from every JSON object when present
keys_to_check = [
    "item_id",
    "product_type",
    "brand",
    "model_name",
    "item_name_in_en_us",
    "bullet_point",
    "color",
    "style",
    "main_image_id",
    "item_keywords",
]

# Where the processed files are written
destination_folder = "C:/Users/deept/ShopTalk/artifacts/data_validation"

# One output subfolder per split
for _split in ("Test", "Train"):
    os.makedirs(os.path.join(destination_folder, _split), exist_ok=True)

# Function to concatenate all values in nested objects (list or dict)
def concatenate_nested_values(nested_obj):
    """Recursively flatten a nested JSON element into one space-separated string.

    Args:
        nested_obj: A dict, a list, or a scalar value.

    Returns:
        For a dict, its values flattened and joined by single spaces. For a
        list, its elements flattened and joined likewise. Anything else is
        returned as ``str(nested_obj)``.
    """
    # Recursing into containers matters for lists of dicts: converting each
    # element with str() would put Python dict reprs ("{'language_tag': ...}")
    # into the output instead of the contained values.
    if isinstance(nested_obj, dict):
        return " ".join(concatenate_nested_values(v) for v in nested_obj.values())
    if isinstance(nested_obj, list):
        return " ".join(concatenate_nested_values(v) for v in nested_obj)
    return str(nested_obj)

# Helper function to process JSON files from a given folder and output them to a CSV file in the destination folder
def process_json_files_to_csv(source_folder, output_folder):
    """Flatten selected fields of every JSON Lines file in a folder into one CSV file.

    Each non-blank line of each ``*.json`` file in ``source_folder`` is parsed
    as a JSON object. For every key of the module-level ``keys_to_check`` that
    the object contains, list and dict values are passed through
    ``concatenate_nested_values`` and any other value is copied unchanged.
    Lines that are not valid JSON are reported with ``print`` and skipped.

    Args:
        source_folder: Folder containing the ``*.json`` input files.
        output_folder: Folder for the result; created if missing.

    Returns:
        Path of the written ``processed_<source folder name>.csv`` file.
    """
    data_list = []

    # os.listdir order is arbitrary; sort so the row order is reproducible.
    json_files = sorted(f for f in os.listdir(source_folder) if f.endswith(".json"))

    for json_file in json_files:
        file_path = os.path.join(source_folder, json_file)

        # JSON text is UTF-8; be explicit so the platform default encoding is
        # never used. Iterating the handle streams the file line by line.
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # blank line: nothing to parse, not an error

                try:
                    json_obj = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Skipping invalid JSON line: {line}")
                    continue

                processed_obj = {}
                for key in keys_to_check:
                    if key not in json_obj:
                        continue
                    value = json_obj[key]
                    if isinstance(value, (list, dict)):
                        # Both container kinds are flattened the same way
                        processed_obj[key] = concatenate_nested_values(value)
                    else:
                        processed_obj[key] = value
                data_list.append(processed_obj)

    # Build the frame once from the collected records
    df = pd.DataFrame(data_list)

    os.makedirs(output_folder, exist_ok=True)

    # normpath drops a trailing separator, which would otherwise make basename
    # return "" and produce "processed_.csv".
    folder_name = os.path.basename(os.path.normpath(source_folder))
    output_file_path = os.path.join(output_folder, f"processed_{folder_name}.csv")

    df.to_csv(output_file_path, index=False)  # Save without index

    return output_file_path

# Flatten the Test and Train source folders into CSV; each result is written
# into the matching subfolder of destination_folder.
test_output_path = process_json_files_to_csv(test_folder_path, os.path.join(destination_folder, "Test"))
train_output_path = process_json_files_to_csv(train_folder_path, os.path.join(destination_folder, "Train"))

# Trailing expression: the notebook displays this tuple of output file paths.
test_output_path, train_output_path  # where the generated files are stored


('C:/Users/deept/ShopTalk/artifacts/data_validation\\Test\\processed_Test.csv',
 'C:/Users/deept/ShopTalk/artifacts/data_validation\\Train\\processed_Train.csv')

In [12]:
# Run the stage end to end. Errors propagate unchanged: the former
# `except Exception as e: raise e` wrapper only re-raised, and
# train_test_spliting already logs failures with their traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.train_test_spliting()

[2024-04-23 22:32:09,039: INFO: common: yaml file: C:\Users\deept\ShopTalk\config\config.yaml loaded successfully]
[2024-04-23 22:32:09,041: INFO: common: yaml file: C:\Users\deept\ShopTalk\params.yaml loaded successfully]
[2024-04-23 22:32:09,053: INFO: common: yaml file: C:\Users\deept\ShopTalk\schema.yaml loaded successfully]
[2024-04-23 22:32:09,055: INFO: common: created directory at: artifacts]
[2024-04-23 22:32:09,057: INFO: common: created directory at: artifacts/data_validation]
[2024-04-23 22:32:11,110: INFO: 911997196: Split listings_0.json into train and test sets]
[2024-04-23 22:32:11,111: INFO: 911997196: Training set shape: (6924, 28)]
[2024-04-23 22:32:11,112: INFO: 911997196: Test set shape: (2308, 28)]
Training set shape for listings_0.json: (6924, 28)
Test set shape for listings_0.json: (2308, 28)
[2024-04-23 22:32:13,584: INFO: 911997196: Split listings_1.json into train and test sets]
[2024-04-23 22:32:13,585: INFO: 911997196: Training set shape: (6924, 28)]
[2024-