# IFN-646 Project: Images Classification of Breast Cancer

## Description

This file shows how we run the classifiers that label each case as **BENIGN** or **MALIGNANT**.

### Data Cleaning and Wrangling

In this section, we clean and wrangle the data, including the stratified sampling.

In [1]:
#Import Library needed
import pandas as pd
import os
import re

In [2]:
#Function to set the subject ID for calc and mass dataset
def generate_full_mask(name, file):
    """Build the two Subject ID tables of one case description CSV.

    The CSV is parsed a single time, so ``file`` may be a path or any
    file-like object accepted by ``pandas.read_csv`` (a stream can only be
    consumed once).

    Args:
        name: Prefix prepended to every generated Subject ID.
        file: Path or buffer of the CSV. It must provide the columns
            'patient_id', 'left or right breast', 'image view',
            'abnormality id' and 'assessment'.

    Returns:
        A ``(df_full, df_mask)`` tuple. Both frames hold the CSV columns plus
        a trailing 'Subject ID' column and keep only the first row of each
        Subject ID.
        In ``df_full`` the ID is
        ``name + patient_id + '_' + left or right breast + '_' + image view``;
        in ``df_mask`` the ID additionally ends with ``'_' + abnormality id``.
    """
    cases = pd.read_csv(file)
    #Cast to str so that these columns can be concatenated into the IDs
    cases[['assessment', 'abnormality id']] = cases[['assessment', 'abnormality id']].astype(str)
    #Part of the ID shared by both tables
    base_id = name+cases['patient_id']+'_'+cases['left or right breast']+'_'+cases['image view']
    #assign() returns a new frame with 'Subject ID' appended as the last column
    df_full = cases.assign(**{'Subject ID': base_id})
    df_mask = cases.assign(**{'Subject ID': base_id+'_'+cases['abnormality id']})
    df_full = df_full.drop_duplicates(subset=['Subject ID'], keep='first')
    df_mask = df_mask.drop_duplicates(subset=['Subject ID'], keep='first')
    return df_full, df_mask

#Function to get the pathfile
def get_name(directory):
    """Return the path of every '.png' file found under ``directory``.

    The tree is walked recursively with ``os.walk``. Each returned path is the
    walked folder joined with the file name. Only the exact lower-case
    extension '.png' is accepted. A directory that cannot be walked yields an
    empty list.
    """
    png_paths = []

    for folder, _subfolders, entries in os.walk(directory):
        png_paths.extend(
            os.path.join(folder, entry)
            for entry in entries
            if os.path.splitext(entry)[1] == '.png'
        )

    return png_paths

def merge_path(data1, data2, paths=None):
    """Join two Subject ID tables with the pathfile table, one row per Subject ID.

    Args:
        data1: First table (the notebook passes the ``*_full`` frame).
        data2: Second table (the notebook passes the ``*_mask`` frame).
        paths: Table with the columns 'Subject ID', 'Match' and 'Pathfile'.
            When omitted, the notebook-level ``result`` dataframe is used, so
            existing two-argument calls behave as before.

    Returns:
        A dataframe with the columns 'Subject ID', 'assessment', 'pathology',
        'Match' and 'Pathfile', holding the last matching row of every
        Subject ID. 'BENIGN_WITHOUT_CALLBACK' is relabelled as 'BENIGN'.
    """
    if paths is None:
        #Fall back to the table built in the notebook; it must exist before this call
        paths = result
    #Select necessary column to match with metadata from calc and mass dataset
    #NOTE(review): positions 8, 9 and 14 are expected to be 'assessment',
    #'pathology' and 'Subject ID' - confirm against the CSV layout
    positions = [8, 9, 14]
    trimmed = [frame.iloc[:, positions] for frame in (data1, data2)]
    joined = [pd.merge(part, paths, on='Subject ID', how='inner') for part in trimmed]
    merged = pd.concat(joined)
    #Only select the mask file, which start with 1-2.dcm (Thus keep the last)
    #NOTE(review): this relies on the row order of the pathfile table - verify
    merged = merged.drop_duplicates(subset=['Subject ID'], keep='last')
    #Re-order the columns of the dataframe
    merged = merged.reindex(columns=['Subject ID','assessment','pathology','Match','Pathfile'])
    #Change the Benign Without Callback to Benign
    merged['pathology'] = merged['pathology'].replace(['BENIGN_WITHOUT_CALLBACK'],'BENIGN')
    return merged

In [3]:
#Initiate dataset
#Build the (full, mask) Subject ID tables of each case description CSV;
#the first argument is the prefix prepended to every generated Subject ID
calc_train_full, calc_train_mask = generate_full_mask('Calc-Training_','calc_case_description_train_set.csv')
calc_test_full, calc_test_mask = generate_full_mask('Calc-Test_', 'calc_case_description_test_set.csv')
mass_train_full, mass_train_mask = generate_full_mask('Mass-Training_','mass_case_description_train_set.csv')
mass_test_full, mass_test_mask = generate_full_mask('Mass-Test_','mass_case_description_test_set.csv')
#Load the metadata table, which is joined with the PNG pathfile in a later cell
metadata = pd.read_csv('metadata.csv')

In [4]:
#Collect the path of every PNG image under the CBIS-DDSM folder into a one-column dataframe
names = pd.DataFrame(get_name('CBIS-DDSM'), columns=['Pathfile'])
#Join key of the pathfile: the path without its last 12 characters
#(presumably the trailing file name - TODO confirm against the real file names)
names['Match'] = names['Pathfile'].str[:-12]
#Join key of the metadata: File Location without its first 2 characters
#(presumably a leading relative-path marker - TODO confirm)
metadata['Match'] = metadata['File Location'].str[2:]
#Keep only the columns at position 4 and 17 of metadata. This has to run after
#Match is added: position 17 is expected to be Match and position 4 the
#Subject ID used by later merges - verify against metadata.csv
metadata = metadata.iloc[:, [4, 17]]
#Inner join on Match, so only metadata rows with a PNG pathfile are kept
result = metadata.merge(names, on='Match', how='inner')

In [9]:
#Pathfile for each dataset
#Each call joins the full and mask tables of one dataset with the pathfile
#table and keeps one row per Subject ID
calc_train = merge_path(calc_train_full,calc_train_mask)
calc_test = merge_path(calc_test_full,calc_test_mask)
mass_train = merge_path(mass_train_full,mass_train_mask)
mass_test = merge_path(mass_test_full,mass_test_mask)

## Data Split