In [1]:
import sys 
sys.path.append("../")
import pandas as pd 
import numpy as np 
import os 
import misc

In [2]:
def gender_convert(value):
    """Encode a raw gender cell as a float code.

    Parameters
    ----------
    value : object
        Raw cell content handed over by the CSV reader.

    Returns
    -------
    float
        0.0 if ``value`` is a string starting with "male", 1.0 if it starts
        with "female", NaN for any other string or any non-string input.
    """
    if not isinstance(value, str):
        return float('Nan')

    # Prefix match is case-sensitive; the first matching prefix wins.
    for prefix, code in (("male", 0.), ("female", 1.)):
        if value.startswith(prefix):
            return code
    return float('Nan')

def Histology_convert(value):
    """Encode a raw histology cell as a float class code.

    Parameters
    ----------
    value : object
        Raw cell content handed over by the CSV reader.

    Returns
    -------
    float
        Code of the first prefix that ``value`` starts with:
        "adenocarcinoma" -> 0.0, "squamous cell carcinoma" -> 1.0,
        "large cell" -> 2.0, "nos" -> 3.0, "NA" -> 4.0.
        NaN for any other string or any non-string input.
    """
    # Checked in order; matching is case-sensitive ("NA" is a real class here,
    # not a missing-value marker).
    prefix_codes = (
        ("adenocarcinoma", 0.),
        ("squamous cell carcinoma", 1.),
        ("large cell", 2.),
        ("nos", 3.),
        ("NA", 4.),
    )
    if isinstance(value, str):
        for prefix, code in prefix_codes:
            if value.startswith(prefix):
                return code
    return float('Nan')

def OverallStage_convert(value):
    """Encode a raw overall-stage cell as a float code.

    Parameters
    ----------
    value : object
        Raw cell content handed over by the CSV reader.

    Returns
    -------
    float
        "I" -> 0.0, "II" -> 1.0, "IIIa" -> 2.0, "IIIb" -> 3.0 (exact,
        case-sensitive match). NaN for any other string or any non-string
        input.
    """
    stage_codes = {"I": 0., "II": 1., "IIIa": 2., "IIIb": 3.}
    # The isinstance guard also keeps unhashable inputs away from the lookup.
    if isinstance(value, str) and value in stage_codes:
        return stage_codes[value]
    return float('Nan')

# Column name -> function that turns the raw cell text into a float code.
# Passed as the `converters` argument when the clinical CSV is read.
CONVERTERS = {
    'gender': gender_convert,
    'Histology': Histology_convert,
    'Overall.Stage': OverallStage_convert,
}


## Load dataset

In [3]:
# Data source (NSCLC-Radiomics, The Cancer Imaging Archive):
# https://wiki.cancerimagingarchive.net/display/Public/NSCLC-Radiomics#1605685425ba360de46d4509a8324498b9c01868
#
# Note: an earlier version of this cell read
# "./dataset/LC_NSCLC_20200512_(n=317).xlsx" via pd.read_excel with misc.CONVERTERS.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
clinical_csv_path = "D:\\LungCancer\\new dataset\\NSCLC Radiomics Lung1.clinical-version3-Oct 2019.csv"

# The converters turn the text columns listed in CONVERTERS into float codes
# while the file is being parsed.
frame = pd.read_csv(clinical_csv_path, converters=CONVERTERS)
frame.head(5)

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event
0,LUNG1-001,78.7515,2.0,3,0,3.0,2.0,0.0,2165,1
1,LUNG1-002,83.8001,2.0,0,0,0.0,1.0,0.0,155,1
2,LUNG1-003,68.1807,2.0,3,0,3.0,2.0,0.0,256,1
3,LUNG1-004,70.8802,2.0,1,0,1.0,1.0,0.0,141,1
4,LUNG1-005,80.4819,4.0,2,0,3.0,1.0,0.0,353,1


## One-hot encode the categorical columns (histology, gender)

In [4]:
# Expand the coded columns into indicator arrays: 5 histology classes and
# 2 gender classes, matching the codes produced by the converters.
# NOTE(review): presumably misc.to_categorical returns a 2-D
# (n_rows, n_classes) one-hot array; confirm against misc.
gender = misc.to_categorical(frame["gender"], 2)
histology = misc.to_categorical(frame["Histology"], 5)

# Histology columns first, gender columns last (side by side, per row).
data = np.hstack((histology, gender))

## Concatenate two frames

In [5]:
# Names for the indicator columns held in `data`: five histology classes
# followed by two gender classes.
# NOTE(review): the order presumably mirrors the converter codes
# (0..4 histology, then 0..1 gender); confirm against misc.to_categorical.
columns = ['adenocarcinoma', 'squamous', 'largecell', 'nos', 'NA', 'male', 'female']
onehot_frame = pd.DataFrame(data, columns=columns)

# Append the indicator columns to the right of the clinical columns.
frame_new = pd.concat([frame, onehot_frame], axis=1)
print(len(frame_new))
frame_new.head(5)

422


Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,adenocarcinoma,squamous,largecell,nos,NA,male,female
0,LUNG1-001,78.7515,2.0,3,0,3.0,2.0,0.0,2165,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,LUNG1-002,83.8001,2.0,0,0,0.0,1.0,0.0,155,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,LUNG1-003,68.1807,2.0,3,0,3.0,2.0,0.0,256,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,LUNG1-004,70.8802,2.0,1,0,1.0,1.0,0.0,141,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,LUNG1-005,80.4819,4.0,2,0,3.0,1.0,0.0,353,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## Remove the numerically coded categorical columns (replaced by the one-hot columns)

In [6]:
# The indicator columns added earlier carry the same information, so the
# coded 'Histology' and 'gender' columns are dropped.
# NOTE(review): this rebinds `frame_new`; running the cell twice raises
# KeyError because the columns are already gone.
frame_new = frame_new.drop(columns=['Histology', 'gender'])
frame_new.head(5)

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Survival.time,deadstatus.event,adenocarcinoma,squamous,largecell,nos,NA,male,female
0,LUNG1-001,78.7515,2.0,3,0,3.0,2165,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,LUNG1-002,83.8001,2.0,0,0,0.0,155,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,LUNG1-003,68.1807,2.0,3,0,3.0,256,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,LUNG1-004,70.8802,2.0,1,0,1.0,141,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,LUNG1-005,80.4819,4.0,2,0,3.0,353,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## Remove patients with missing data

In [7]:

# Compute the null mask once and reuse it for both the report and the filter.
null_mask = frame_new.isnull()

# Report every column that contains at least one missing value
# (printed in the frame's column order).
for column in frame_new.columns[null_mask.any(axis=0).to_numpy()]:
    print(column)

# Boolean mask over rows: True where any column is missing.
rows_with_missing = null_mask.any(axis=1)

# Index *labels* of the incomplete rows. The previous version took positional
# row numbers from np.where(...) and passed them to DataFrame.drop, which
# interprets them as labels; that is only correct while the index is a
# default 0..n-1 RangeIndex.
list_idx_missingdata = frame_new.index[rows_with_missing]

# Keep only complete rows; selecting by mask is independent of the index labels.
frame_cleaned = frame_new.loc[~rows_with_missing]
len(frame_cleaned)

age
clinical.T.Stage
Overall.Stage


398

In [8]:
# Persist the cleaned table; index=False keeps the row index out of the file.
# NOTE(review): the target directory must already exist.
output_csv_path = "../dataset/Lung1dataset/Lung1dataset.csv"
frame_cleaned.to_csv(output_csv_path, index=False)