In [None]:
import os
import pandas as pd
import numpy as np
import shutil
import cv2

def np_CountUpContinuingOnes(b_arr):
    """Return, for every position, the length of the run of ones containing it.

    Args:
        b_arr: 1-D array-like of booleans or numbers. Entries ``> 0`` count
            as ones, everything else counts as zeros.

    Returns:
        Integer array with the same length as ``b_arr``. Positions inside a
        run of ones hold that run's length; positions holding a zero get -1.
    """
    b_arr = np.asarray(b_arr)
    size = len(b_arr)
    positions = np.arange(size)
    is_zero = ~(b_arr > 0)

    # Index of the nearest zero at or before each position. The sentinel -1
    # (one step before the array) keeps runs that start at index 0 from
    # being undercounted.
    left = np.maximum.accumulate(np.where(is_zero, positions, -1))

    # Index of the nearest zero at or after each position, found by scanning
    # from the right. The sentinel `size` (one step past the array) plays the
    # same role for runs that reach the last element.
    right = np.minimum.accumulate(np.where(is_zero, positions, size)[::-1])[::-1]

    # Between two bounding zeros at `left` and `right` there are
    # right - left - 1 ones; at a zero itself left == right, giving -1.
    return right - left - 1

def ExtractBreast(img):
    """Crop a 2-D grayscale image to its longest run of non-constant columns and rows.

    Pixels with value <= 40 are zeroed for the analysis only. Columns are
    judged on a horizontal band around the vertical centre (40% of the
    height on each side); a column counts as "active" when its standard
    deviation inside that band is non-zero. The columns belonging to the
    longest run of active columns are kept. The same procedure is then
    applied to the rows of the column-cropped image.

    Args:
        img: 2-D numpy array (unpacking ``img.shape`` into two values fails
            for any other rank).

    Returns:
        The selected rows and columns taken from an unmodified copy of
        ``img``, so the thresholding does not affect the returned pixels.
    """
    untouched = img.copy()
    masked = np.where(img <= 40, 0, img)

    # --- column selection, based on the central band of rows ---
    n_rows, _ = masked.shape
    row_mid = n_rows // 2
    row_margin = int(n_rows * 0.4)
    row_band = masked[row_mid - row_margin:row_mid + row_margin]
    column_is_active = row_band.std(axis=0) != 0
    column_runs = np_CountUpContinuingOnes(column_is_active)
    longest_column_run = column_runs.max()
    col_ind = np.where(column_runs == longest_column_run)[0]

    # --- row selection, based on the central band of the kept columns ---
    masked = masked[:, col_ind]
    _, n_cols = masked.shape
    col_mid = n_cols // 2
    col_margin = int(n_cols * 0.4)
    col_band = masked[:, col_mid - col_margin:col_mid + col_margin]
    row_is_active = col_band.std(axis=1) != 0
    row_runs = np_CountUpContinuingOnes(row_is_active)
    longest_row_run = row_runs.max()
    row_ind = np.where(row_runs == longest_row_run)[0]

    return untouched[row_ind][:, col_ind]

# Input locations: annotation workbook and the directory holding the source
# images, plus the output root and the CSV assigning each image to a split.
excel_path = '/Volumes/图图/CDD-CESM/Radiology-manual-annotations.xlsx'
image_dir = '/Volumes/图图/CDD-CESM/PKG - CDD-CESM/CDD-CESM/Low energy images of CDD-CESM'
output_base_dir = '../classification_data/CDD-CESM'
split_csv_path = '../classification_data/classification_split.csv'

# Keep only the split assignments that belong to this dataset.
split_df = pd.read_csv(split_csv_path)
split_df = split_df[split_df['dataset'] == 'CDD-CESM']

df = pd.read_excel(excel_path, sheet_name='all')

# Keep only rows whose image name contains 'DM' (presumably the low-energy
# images stored in image_dir -- confirm against the dataset description).
# na=False treats a missing name as "no match" instead of producing a NaN
# mask entry that boolean indexing rejects; .copy() gives process_data an
# independent frame to assign into, avoiding pandas' SettingWithCopyWarning.
df = df[df['Image_name'].str.contains('DM', na=False)].copy()

def process_birads(birads_value):
    """Convert a BIRADS annotation to a single integer.

    A string containing '$' is treated as several '$'-separated scores and
    the largest one is returned; any other value is passed to ``int``.
    """
    is_multi_score = isinstance(birads_value, str) and '$' in birads_value
    if not is_multi_score:
        return int(birads_value)

    scores = [int(part) for part in birads_value.split('$')]
    return max(scores)

def preprocess_image(image_path):
    """Load an image as grayscale, crop it with ExtractBreast and rescale it.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The cropped image min-max normalized to the range 0-255.

    Raises:
        OSError: If OpenCV cannot read or decode the file.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports failure by returning None instead of raising; fail
    # here with the offending path rather than deep inside ExtractBreast.
    if img is None:
        raise OSError(f"Could not read image: {image_path}")

    cropped = ExtractBreast(img)
    return cv2.normalize(cropped, None, 0, 255, cv2.NORM_MINMAX)

def _build_label_dict(acr, birads, pathology):
    """Build the label dictionary stored next to one image.

    Missing values (NaN/None) and values that are empty once spaces are
    removed are left out. A Bi-Rads of '6' and a Pathology of 'Normal' are
    also left out. Composition A-D becomes 'Level X' and Bi-Rads 1-5 becomes
    'Bi-Rads N'; any other value is stored as its space-free string.
    """
    raw_labels = (
        ('Composition', acr),
        ('Bi-Rads', birads),
        ('Pathology', pathology),
    )

    label_dict = {}
    for key, value in raw_labels:
        # str(nan) is the non-empty string 'nan', so missing annotations
        # must be dropped before stringifying or they end up as labels.
        if pd.isna(value):
            continue
        text = str(value).replace(' ', '')
        if text:
            label_dict[key] = text

    if label_dict.get('Bi-Rads') == '6':
        del label_dict['Bi-Rads']
    if label_dict.get('Pathology') == 'Normal':
        del label_dict['Pathology']

    if label_dict.get('Composition') in ('A', 'B', 'C', 'D'):
        label_dict['Composition'] = f"Level {label_dict['Composition']}"
    if label_dict.get('Bi-Rads') in ('1', '2', '3', '4', '5'):
        label_dict['Bi-Rads'] = f"Bi-Rads {label_dict['Bi-Rads']}"

    return label_dict


def process_data():
    """Preprocess every annotated image that has a split assignment.

    Normalizes the BIRADS column of the module-level ``df`` in place, joins
    it with ``split_df`` on the image name, and for each joined row whose
    '<Image_name>.jpg' exists in ``image_dir`` writes, under
    ``output_base_dir/<data_split>/<Image_name>/``:

    * ``img.jpg``       -- the output of ``preprocess_image``
    * ``info_dict.npy`` -- the label dictionary saved with ``np.save``
      (a dict is stored as a pickled object, so loading it back requires
      ``np.load(..., allow_pickle=True)``)

    Rows whose image file is missing are reported on stdout and skipped.
    """
    df['BIRADS'] = df['BIRADS'].apply(process_birads)

    merged_df = pd.merge(df, split_df[['data_name', 'data_split']],
                         left_on='Image_name', right_on='data_name', how='inner')
    merged_df = merged_df[['Image_name', 'ACR', 'BIRADS', 'Pathology', 'data_split']]

    for subset in ['Train', 'Eval', 'Test']:
        os.makedirs(os.path.join(output_base_dir, subset), exist_ok=True)

    for _, row in merged_df.iterrows():
        image_name = row['Image_name'].strip()
        image_path = os.path.join(image_dir, image_name + '.jpg')
        if not os.path.exists(image_path):
            print(f"Image not found: {image_path}")
            continue

        img = preprocess_image(image_path)

        output_data_dir = os.path.join(output_base_dir, row['data_split'], image_name)
        os.makedirs(output_data_dir, exist_ok=True)

        cv2.imwrite(os.path.join(output_data_dir, 'img.jpg'), img)

        label_dict = _build_label_dict(row['ACR'], row['BIRADS'], row['Pathology'])
        np.save(os.path.join(output_data_dir, 'info_dict.npy'), label_dict)

# Run the full preprocessing pass over the tables loaded above, then report
# completion on stdout.
process_data()
print("Processing complete.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['class_combination'] = df[['ACR', 'BIRADS', 'Pathology']].apply(lambda x: tuple(x), axis=1)
