#### Note: This file is taken from the biomedia-mira/mammo-net repository

### Merge clinical and meta data

In [None]:
import os
import pandas as pd
from pathlib import Path

# Load the reduced EMBED metadata and clinical tables.
df_meta = pd.read_csv('data/EMBED/tables/EMBED_OpenData_metadata_reduced.csv')
df_clinical = pd.read_csv('data/EMBED/tables/EMBED_OpenData_clinical_reduced.csv')

# Inner join on the two key columns the tables share.
df_merged = pd.merge(df_meta, df_clinical, on=['empi_anon', 'acc_anon'])

# Relative PNG path of each image: "<empi_anon>/<DICOM file stem>.png".
# The columns are read by label; positional access such as row[0] on a
# label-indexed row Series is deprecated in recent pandas releases.
df_merged['image_path'] = [
    os.path.join(str(patient_id), f"{Path(dicom_path).stem}.png")
    for patient_id, dicom_path in zip(df_merged['empi_anon'], df_merged['anon_dicom_path'])
]

### Split into left/right images, and remove rows where finding and image laterality mismatch

In [None]:
# Partition the merged rows by the laterality of the image itself.
# reset_index() without drop=True keeps the previous row labels in an
# 'index' column of each half.
df_left = df_merged[df_merged.ImageLateralityFinal == 'L'].reset_index()
df_right = df_merged[df_merged.ImageLateralityFinal == 'R'].reset_index()

# Remove rows whose 'side' value names the opposite breast. Rows with any
# other 'side' value (including missing) are kept.
df_left.drop(df_left[df_left.side == 'R'].index, inplace=True)
df_right.drop(df_right[df_right.side == 'L'].index, inplace=True)

# Stack both halves under a fresh 0..n-1 index and start with every row
# labelled negative; a later cell sets the positives.
df_images = pd.concat([df_left, df_right], ignore_index=True)
df_images['is_positive'] = 0

In [None]:
# Rows to discard: assessment code 'X', or assessment code 'A' with no
# pathology severity recorded.
unusable_rows = (df_images.asses == 'X') | (
    (df_images.asses == 'A') & df_images.path_severity.isna()
)
df_images.drop(index=df_images.loc[unusable_rows].index, inplace=True)

In [None]:
# True where 'bside' equals the image laterality, or where 'bside' is 'B'.
images_path_result = df_images.bside.eq('B') | df_images.ImageLateralityFinal.eq(df_images.bside)

## Choose one of the next four cells, comment out the others

### Pathology cancer

In [None]:
# df_images.loc[images_path_result & (df_images.path_severity.isin([0,1])), 'is_positive'] = 1
# out_filename = 'data/embed-cancer.csv'

### Pathology cancer & lesions

In [None]:
# df_images.loc[images_path_result & (df_images.path_severity.isin([0,1,2,3])), 'is_positive'] = 1
# out_filename = 'data/embed-pathology.csv'

### Suspicious

In [None]:
# df_images.loc[images_path_result & (df_images.path_severity.isin([0,1,2,3])), 'is_positive'] = 1
# df_images.loc[df_images.asses.isin(['S','M','K']), 'is_positive'] = 1
# out_filename = 'data/embed-suspicious.csv'

### Non-negatives

In [None]:
# Positive via pathology: laterality-matched row with path_severity 0-4.
positive_by_pathology = images_path_result & df_images.path_severity.isin([0, 1, 2, 3, 4])
# Positive via assessment: any of the listed assessment codes, regardless of
# the laterality match.
positive_by_assessment = df_images.asses.isin(['B', 'P', 'S', 'M', 'K'])

df_images.loc[positive_by_pathology | positive_by_assessment, 'is_positive'] = 1
out_filename = 'data/embed-non-negative.csv'

## Clean up data sample

### Remove duplicates with conflicting positive labels

In [None]:
# Order rows by label so that, within each image_path, a positive row (if
# any) ends up last; then discard every row that is not the last one for
# its image_path.
df_images.sort_values(by=['is_positive'], inplace=True)
superseded_rows = df_images.duplicated(subset=['image_path'], keep='last')
df_images.drop(index=df_images.loc[superseded_rows].index, inplace=True)

In [None]:
# Renumber rows 0..n-1; the previous row labels are kept as a new column.
df_images.reset_index(drop=False, inplace=True)

In [None]:
# Backwards compatibility: expose the label under its earlier column name too.
df_images['image_is_malignant'] = df_images.is_positive

### Fix ViewPosition

In [None]:
# Replace missing values with the literal string 'None' so the substring
# checks below only ever see strings.
for text_column in ('ViewPosition', 'SeriesDescription'):
    df_images.loc[df_images[text_column].isna(), text_column] = 'None'

# Any series description mentioning 'XCC' overrides the recorded view.
df_images.loc[df_images['SeriesDescription'].str.contains('XCC'), 'ViewPosition'] = 'XCC'
# df_images.ViewPosition.value_counts(dropna=False)

# Fill still-unknown views from the series description: 'CC' first ...
images_invalid_view_CC = df_images['SeriesDescription'].str.contains('CC') & (df_images['ViewPosition'] == 'None')
df_images.loc[images_invalid_view_CC, 'ViewPosition'] = 'CC'

# ... then 'MLO', evaluated after the 'CC' fill so only rows that are still
# unknown are touched.
images_invalid_view_MLO = df_images['SeriesDescription'].str.contains('MLO') & (df_images['ViewPosition'] == 'None')
df_images.loc[images_invalid_view_MLO, 'ViewPosition'] = 'MLO'

### Add density and race

In [None]:
# Translate the numeric 'tissueden' codes 1-4 into letters A-D; every other
# value (including missing) keeps the placeholder 'X'.
df_images['density'] = 'X'
for tissue_code, density_letter in ((1, 'A'), (2, 'B'), (3, 'C'), (4, 'D')):
    df_images.loc[df_images['tissueden'] == tissue_code, 'density'] = density_letter

In [None]:
# Start from the raw description, with a placeholder for missing values.
df_images['race'] = df_images.RACE_DESC.fillna('Other_or_unknown')

# Collapse any description containing one of these keywords to the keyword
# itself. The keywords are applied in this order, and each check sees the
# result of the previous one.
for race_keyword in ('Black', 'Asian', 'White'):
    df_images.loc[df_images['race'].str.contains(race_keyword), 'race'] = race_keyword

# Everything that did not collapse to one of the three groups is pooled.
is_named_group = df_images['race'].isin(['Black', 'White', 'Asian'])
df_images.loc[~is_named_group, 'race'] = 'Other/Unknown'

In [None]:
# Summary of the labelled table: distinct patients, row count, distinct
# images, then the label distribution as counts and as fractions.
label_column = df_images['is_positive']
print('Patients: ', df_images['empi_anon'].nunique(dropna=False))
print('Findings: ', df_images.shape[0])
print('Images:   ', df_images['anon_dicom_path'].nunique(dropna=False))
print(label_column.value_counts())
print(label_column.value_counts(normalize=True))

In [None]:
# Write the labelled table to the file selected by the labelling cell above.
df_images.to_csv(path_or_buf=out_filename)

### Check effect of filtering steps that are applied in the datamodule of the classification code

In [None]:
# Work on an independent copy so the filters below leave df_images untouched.
df_filtered = df_images.copy(deep=True)

In [None]:
# Apply the filters one after another, each on the result of the previous:
# 2D images, female patients, a recorded tissue density below 5, MLO/CC
# views only, and no value in 'spot_mag'.
df_filtered = (
    df_filtered
    .loc[lambda rows: rows['FinalImageType'] == '2D']
    .loc[lambda rows: rows['GENDER_DESC'] == 'Female']
    .loc[lambda rows: rows['tissueden'].notna()]
    .loc[lambda rows: rows['tissueden'] < 5]
    .loc[lambda rows: rows['ViewPosition'].isin(['MLO', 'CC'])]
    .loc[lambda rows: rows['spot_mag'].isna()]
)

# Same summary as for the unfiltered table.
filtered_labels = df_filtered['is_positive']
print('Patients: ', df_filtered['empi_anon'].nunique(dropna=False))
print('Findings: ', df_filtered.shape[0])
print('Images:   ', df_filtered['anon_dicom_path'].nunique(dropna=False))
print(filtered_labels.value_counts())
print(filtered_labels.value_counts(normalize=True))

**Possible findings (values of the `asses` column)**

BIRADS 0: A – Additional evaluation

BIRADS 1: N – Negative

BIRADS 2: B – Benign

BIRADS 3: P – Probably benign

BIRADS 4: S – Suspicious

BIRADS 5: M – Highly suggestive of malignancy

BIRADS 6: K – Known biopsy proven

**Possible pathology results (values of the `path_severity` column)**

0: invasive cancer

1: non-invasive cancer

2: high-risk lesion

3: borderline lesion

4: benign findings

5: negative (normal breast tissue)

6: non-breast cancer