# Malignancy level dataframe creation

The aim of this notebook is to create and save a dataframe with the SeriesInstanceUid of each nodule and the associated malignancy level. This value isn't available in the LUNA16 annotations file, so this information is taken from the annotations of the LIDC-IDRI dataset (in particular the 'LIDC-XML-only' folder). The dataset and annotation files are available here: https://wiki.cancerimagingarchive.net/pages/viewpage.action?pageId=1966254

In [1]:
import os

import pandas as pd

from bs4 import BeautifulSoup

In [9]:
# Extraction of the series UID and malignancy level from each XML file.
#
# Expected layout: <folder_path>/<sub-folder>/<annotation file>.xml
# Every successfully parsed file contributes exactly one entry to each of the
# two parallel lists below. Only the FIRST <malignancy> tag found in a file is
# read; a value of 0 means the tag is absent or its text is not an integer.
folder_path = 'LIDC-XML-only'
folders = os.listdir(folder_path)

id_list = []
malignancy_list = []

for f in folders:
    sub_folder_path = os.path.join(folder_path, f)

    # Stray files next to the sub-folders (e.g. OS metadata) would make
    # os.listdir() raise, so only directories are visited.
    if not os.path.isdir(sub_folder_path):
        continue

    print(f'Processing folder "{f}"')

    for xml_file in os.listdir(sub_folder_path):
        # Anything that is not an XML annotation file is ignored.
        if not xml_file.lower().endswith('.xml'):
            continue

        xml_file_path = os.path.join(sub_folder_path, xml_file)

        # Read the content first so the file handle is released before parsing.
        with open(xml_file_path, 'r') as xml_f:
            data = xml_f.read()

        # html.parser lower-cases tag names, hence the lower-case lookups.
        soup = BeautifulSoup(data, 'html.parser')

        uid_tag = soup.find('seriesinstanceuid')
        if uid_tag is None:
            # Without a series UID the entry cannot be tied to a scan, so the
            # file is reported and skipped instead of aborting the whole run.
            print(f'Skipping "{xml_file_path}": no SeriesInstanceUid found')
            continue

        try:
            malignancy = int(soup.find('malignancy').get_text())
        except (AttributeError, ValueError):
            # AttributeError: no <malignancy> tag in the file.
            # ValueError: the tag text is not an integer.
            malignancy = 0  # if malignancy = 0, missing value

        id_list.append(uid_tag.get_text())
        malignancy_list.append(malignancy)
            

Processing folder "157"
Processing folder "185"
Processing folder "186"
Processing folder "187"
Processing folder "188"
Processing folder "189"


In [31]:
# Build the result table from the two parallel lists filled by the extraction
# cell: one column with the series UIDs, one with the malignancy values.
data_dict = {'seriesuid': id_list, 'malignancy_Level': malignancy_list}
df = pd.DataFrame(data=data_dict)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,seriesuid,malignancy_Level
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.303494235102...,3
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.131939324905...,5
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.213233719488...,3
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.340202188094...,4
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.311102747717...,2
...,...,...
1313,1.3.6.1.4.1.14519.5.2.1.6279.6001.175773779529...,3
1314,1.3.6.1.4.1.14519.5.2.1.6279.6001.293593766328...,4
1315,1.3.6.1.4.1.14519.5.2.1.6279.6001.133132722052...,2
1316,1.3.6.1.4.1.14519.5.2.1.6279.6001.229343399861...,2


In [30]:
# Persist the table as a semicolon-separated CSV in the working directory.
# The DataFrame index is written too (pandas default), as the first column.
output_csv = 'malignancy_annotations.csv'
df.to_csv(output_csv, sep=';')