In [1]:
import os

In [2]:
%pwd

'c:\\Users\\deept\\ShopTalk\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\deept\\ShopTalk'

In [5]:
import pandas as pd

In [6]:
# Test reading the file with line-by-line parsing
try:
    data = pd.read_json("artifacts/data_ingestion/data_tar_extracted/listings/metadata/listings_0.json", lines=True)
    print(data.head())
except ValueError as e:
    print("Error reading JSON:", e)

                                               brand  \
0      [{'language_tag': 'nl_NL', 'value': 'find.'}]   
1  [{'language_tag': 'es_MX', 'value': 'AmazonBas...   
2  [{'language_tag': 'en_AE', 'value': 'AmazonBas...   
3  [{'language_tag': 'en_GB', 'value': 'Stone & B...   
4    [{'language_tag': 'en_AU', 'value': 'The Fix'}]   

                                        bullet_point  \
0  [{'language_tag': 'nl_NL', 'value': 'Schoen in...   
1  [{'language_tag': 'es_MX', 'value': 'White Pow...   
2  [{'language_tag': 'en_AE', 'value': '3D printe...   
3                                                NaN   
4  [{'language_tag': 'en_AU', 'value': 'Embroider...   

                                               color     item_id  \
0  [{'language_tag': 'nl_NL', 'value': 'Veelkleur...  B06X9STHNG   
1  [{'language_tag': 'es_MX', 'value': 'White Pow...  B07P8ML82R   
2  [{'language_tag': 'en_AE', 'value': 'Transluce...  B07H9GMYXS   
3  [{'language_tag': 'en_GB', 'value': 'Stone Bro...  

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9232 entries, 0 to 9231
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   brand                9229 non-null   object
 1   bullet_point         8208 non-null   object
 2   color                7309 non-null   object
 3   item_id              9232 non-null   object
 4   item_name            9232 non-null   object
 5   model_name           5134 non-null   object
 6   model_number         7774 non-null   object
 7   model_year           450 non-null    object
 8   product_type         9232 non-null   object
 9   style                2723 non-null   object
 10  main_image_id        9199 non-null   object
 11  other_image_id       8578 non-null   object
 12  item_keywords        7905 non-null   object
 13  country              9232 non-null   object
 14  marketplace          9232 non-null   object
 15  domain_name          9232 non-null   object
 16  node  

In [8]:
data.isnull().sum()

brand                     3
bullet_point           1024
color                  1923
item_id                   0
item_name                 0
model_name             4098
model_number           1458
model_year             8782
product_type              0
style                  6509
main_image_id            33
other_image_id          654
item_keywords          1327
country                   0
marketplace               0
domain_name               0
node                    413
item_dimensions        6520
item_weight            2609
material               5863
fabric_type            8675
color_code             8129
product_description    8982
spin_id                8701
3dmodel_id             8724
pattern                8962
finish_type            9126
item_shape             8904
dtype: int64

In [9]:
data.shape

(9232, 28)

In [10]:
data.columns

Index(['brand', 'bullet_point', 'color', 'item_id', 'item_name', 'model_name',
       'model_number', 'model_year', 'product_type', 'style', 'main_image_id',
       'other_image_id', 'item_keywords', 'country', 'marketplace',
       'domain_name', 'node', 'item_dimensions', 'item_weight', 'material',
       'fabric_type', 'color_code', 'product_description', 'spin_id',
       '3dmodel_id', 'pattern', 'finish_type', 'item_shape'],
      dtype='object')

In [11]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    root_dir: Path
    STATUS_FILE: str
    untar_data_dir: Path
    all_schema: dict

In [12]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [13]:
class ConfigurationManager:
    CONFIG_FILE_PATH = r'C:\Users\deept\ShopTalk\config\config.yaml'
    PARAMS_FILE_PATH = r"C:\Users\deept\ShopTalk\params.yaml"
    SCHEMA_FILE_PATH = r"C:\Users\deept\ShopTalk\schema.yaml"

    def __init__(
        self,
        config_filepath = Path(CONFIG_FILE_PATH),
        params_filepath = Path(PARAMS_FILE_PATH),
        schema_filepath = Path(SCHEMA_FILE_PATH)):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_validation_config(self) -> DataValidationConfig:
        config = self.config.data_validation
        schema = self.schema.COLUMNS

        create_directories([config.root_dir])

        data_validation_config = DataValidationConfig(
            root_dir=config.root_dir,
            STATUS_FILE=config.STATUS_FILE,
            untar_data_dir = config.untar_data_dir,
            all_schema=schema,
        )

        return data_validation_config

In [14]:
import os
from mlProject import logger

In [15]:
class DataValidation:
    def __init__(self, config: DataValidationConfig):
        self.config = config


    def validate_all_columns(self) -> bool:
        try:
            validation_status = True  # Start assuming all is valid

            # List all json files in the directory
            files = [f for f in Path(self.config.untar_data_dir).iterdir() if f.suffix == '.json']

            for file in files:
                data = pd.read_json(file, lines=True)  # Adjust based on your JSON structure
                all_cols = list(data.columns)
                all_schema = self.config.all_schema.keys()

                # Validate columns for each file
                file_status = all(col in all_schema for col in all_cols)
                validation_status = validation_status and file_status  # Update overall status

                # Log the validation status to a file (you could also log per file results)
                with open(self.config.STATUS_FILE, 'a') as f:
                    f.write(f"Validation status for {file.name}: {file_status}\n")

            return validation_status

        except Exception as e:
            print(f"An error occurred: {e}")
            raise  # Consider more specific error handling depending on your needs



In [16]:
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_all_columns()
except Exception as e:
    raise e

[2024-04-23 22:30:59,991: INFO: common: yaml file: C:\Users\deept\ShopTalk\config\config.yaml loaded successfully]
[2024-04-23 22:30:59,994: INFO: common: yaml file: C:\Users\deept\ShopTalk\params.yaml loaded successfully]
[2024-04-23 22:31:00,012: INFO: common: yaml file: C:\Users\deept\ShopTalk\schema.yaml loaded successfully]
[2024-04-23 22:31:00,015: INFO: common: created directory at: artifacts]
[2024-04-23 22:31:00,017: INFO: common: created directory at: artifacts/data_validation]
