# In [1]:
import os
import pandas as pd
import re
from pathlib import Path

# Assuming mlProject.logger is a module you have that sets up logging
from mlProject import logger

# Folder holding the ingestion output and folder holding the validation
# artifacts (the train/test splits that get enriched below).
data_dir = r'C:\Users\deept\ShopTalk\artifacts\data_ingestion'
destination_folder = r'C:\Users\deept\ShopTalk\artifacts\data_validation'

# Input files: the caption dataset plus the existing test and train splits.
csv_file_path = os.path.join(data_dir, "processed_dataset_target_data_with_captions_only.csv")
testdata_csv_path = os.path.join(destination_folder, "Merged_Testdata_validation.csv")
traindata_csv_path = os.path.join(destination_folder, "Merged_Traindata_validation.csv")

# Output files are named after the destination folder itself.
# NOTE(review): for a folder called 'data_validation' (as parsed on Windows)
# these names come out as 'Merged_Testdata_validation.csv' and
# 'Merged_Traindata_validation.csv', i.e. the very files read as input above,
# so saving to these paths overwrites the inputs -- confirm this is intended.
test_output_path = os.path.join(destination_folder, f"Merged_Test{Path(destination_folder).name}.csv")
train_output_path = os.path.join(destination_folder, f"Merged_Train{Path(destination_folder).name}.csv")

# Load everything into DataFrames: captions first, then the two splits.
df = pd.read_csv(csv_file_path)
test_csv_path_df = pd.read_csv(testdata_csv_path)
train_csv_path_df = pd.read_csv(traindata_csv_path)

# Join each split with the caption data on 'item_id'. Columns present on both
# sides keep their name for the split's copy and get a '_drop' suffix for the
# caption dataset's copy.
image_dataset_test = pd.merge(test_csv_path_df, df, on="item_id", suffixes=('', '_drop'))
image_dataset_train = pd.merge(train_csv_path_df, df, on="item_id", suffixes=('', '_drop'))

def remove_duplicate_prefix_columns(df):
    """Drop, in place, every column whose first word repeats an earlier one.

    The "first word" of a column is the part of its label before the first
    underscore (the whole label when it contains no underscore). Columns are
    scanned left to right: the first column seen for a given first word is
    kept, every later column starting with the same word is dropped.

    NOTE(review): this is broader than removing merge duplicates. Unrelated
    columns that merely share a first word are dropped as well (for example
    ``item_name`` when ``item_id`` comes earlier) -- confirm that this is the
    intended behaviour for the datasets this is applied to.

    Args:
        df: DataFrame to prune. It is modified in place.

    Returns:
        None. The change is visible through ``df`` itself.
    """
    seen_prefixes = set()
    columns_to_drop = []
    for col in df.columns:
        # Column labels are not guaranteed to be strings (e.g. integer
        # labels), so convert before splitting instead of crashing.
        prefix = str(col).split('_')[0]
        if prefix in seen_prefixes:
            columns_to_drop.append(col)
        else:
            seen_prefixes.add(prefix)
    df.drop(columns=columns_to_drop, inplace=True)

# Columns that must still be present after the merge and the prefix-based
# column pruning for the 'combined' text column to be built.
_REQUIRED_COLUMNS = ('path', 'captions')

# Everything except word characters, whitespace and the file path characters
# '/', '\', ':' and '.' is stripped from the combined text.
_DISALLOWED_CHARS = r'[^\w\s/\\:.]'


def _add_combined_column(dataset, dataset_label):
    """Add a cleaned 'combined' column and return the frame with it last.

    'combined' is the 'path' value followed by a space and the 'captions'
    value, with special characters (other than file path characters) removed.

    Args:
        dataset: DataFrame holding 'path' and 'captions' columns. The
            'combined' column is added to it in place.
        dataset_label: Short name used in the error message (e.g. "train").

    Returns:
        A DataFrame with the same columns as ``dataset``, with 'combined'
        moved to the last position.

    Raises:
        KeyError: If 'path' or 'captions' is missing. The message lists the
            columns that are actually available, so it is visible whether the
            column was never in the inputs or was removed earlier.
    """
    missing = [name for name in _REQUIRED_COLUMNS if name not in dataset.columns]
    if missing:
        raise KeyError(
            f"{dataset_label} dataset is missing required column(s) {missing}; "
            f"available columns: {list(dataset.columns)}"
        )

    combined = dataset['path'].astype(str) + ' ' + dataset['captions'].astype(str)
    dataset['combined'] = combined.str.replace(_DISALLOWED_CHARS, '', regex=True)

    # Re-select the columns so that 'combined' ends up last even if a column
    # of that name already existed somewhere in the middle.
    ordered = [name for name in dataset.columns if name != 'combined']
    ordered.append('combined')
    return dataset[ordered]


remove_duplicate_prefix_columns(image_dataset_test)
remove_duplicate_prefix_columns(image_dataset_train)

# Both frames are fully processed before anything is written, so a failure in
# either one leaves the files on disk untouched.
image_dataset_train = _add_combined_column(image_dataset_train, "train")
image_dataset_test = _add_combined_column(image_dataset_test, "test")

# Save the merged and updated DataFrames to CSV files
image_dataset_test.to_csv(test_output_path, index=False)
image_dataset_train.to_csv(train_output_path, index=False)


# Output when run: KeyError: 'path'