In [1]:
# Mount Google Drive into the Colab filesystem so that files under
# /content/drive can be read by the cells below.
from google.colab import drive
# force_remount=True asks Colab to mount again even when a mount already exists.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import json
import os
import pandas as pd

In [3]:
# Load the raw studies export (presumably a ClinicalTrials.gov download -- confirm)
# from the mounted Drive folder.
root = '/content/drive/My Drive/Project/Data Pipeline'
# Context manager guarantees the handle is closed even if json.load raises.
with open(os.path.join(root, 'ctg-studies.json')) as f:
    data = json.load(f)

# Re-serialise the parsed data to a local file, then let pandas parse that
# file into a DataFrame.
with open('subject.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)
json_df = pd.read_json('subject.json')

In [4]:
# Extract nct id and conditions
# Take an explicit copy of the single-column selection so the column
# assignments below write to an independent frame and do not trigger
# pandas' SettingWithCopyWarning.
json_df = json_df[['protocolSection']].copy()
# Each protocolSection value is indexed like a nested dict; a study lacking
# either module raises KeyError here rather than being silently skipped.
json_df['nctID'] = json_df['protocolSection'].apply(lambda x: x['identificationModule']['nctId'])
json_df['conditions'] = json_df['protocolSection'].apply(lambda x: x['conditionsModule']['conditions'])

In [5]:
# Make condition list into string to be able to search for substrings
def flatten_list(conditions):
  """Join the condition names into one space-separated string.

  Args:
    conditions: sequence of condition strings.

  Returns:
    The items joined by single spaces, with no trailing space; an empty
    sequence yields ''.
  """
  # str.join replaces the manual index loop and repeated concatenation.
  return ' '.join(conditions)

In [6]:
# Preview the frame; the notebook renders a truncated view of the rows.
json_df

Unnamed: 0,protocolSection,nctID,conditions
0,{'identificationModule': {'nctId': 'NCT0108039...,NCT01080391,[Relapsed Multiple Myeloma]
1,{'identificationModule': {'nctId': 'NCT0216199...,NCT02161991,"[Carcinoma, Non-small Cell Lung]"
2,{'identificationModule': {'nctId': 'NCT0091139...,NCT00911391,"[Colorectal Cancer, Colectomy, Surgery, Periop..."
3,{'identificationModule': {'nctId': 'NCT0024256...,NCT00242567,[Prostate Cancer]
4,{'identificationModule': {'nctId': 'NCT0461466...,NCT04614662,"[Pediatric Cancer, Quality of Life]"
...,...,...,...
2866,{'identificationModule': {'nctId': 'NCT0286815...,NCT02868151,[Oral Mucositis]
2867,{'identificationModule': {'nctId': 'NCT0286425...,NCT02864251,[Non-Small-Cell Lung Carcinoma]
2868,{'identificationModule': {'nctId': 'NCT0264915...,NCT02649153,[Liver Cancer]
2869,{'identificationModule': {'nctId': 'NCT0153505...,NCT01535053,"[Choriocarcinoma, FIGO Stage I Gestational Tro..."


In [8]:
# Prep df for category assignment
# This cell overwrites json_df['conditions'] in place. Values that are already
# strings are left untouched so that re-running the cell does not feed the
# joined text back into flatten_list (which would space out every character).
json_df['conditions'] = json_df['conditions'].apply(
    lambda x: x if isinstance(x, str) else flatten_list(x))
conditions_df = json_df.drop(columns=['protocolSection'])
# Lower-case the text so later substring checks are case-insensitive.
conditions_df['conditions'] = conditions_df['conditions'].str.lower()

In [9]:
# Preview json_df again; in the rendered output the conditions column holds
# space-joined strings rather than lists.
json_df

Unnamed: 0,protocolSection,nctID,conditions
0,{'identificationModule': {'nctId': 'NCT0108039...,NCT01080391,Relapsed Multiple Myeloma
1,{'identificationModule': {'nctId': 'NCT0216199...,NCT02161991,"Carcinoma, Non-small Cell Lung"
2,{'identificationModule': {'nctId': 'NCT0091139...,NCT00911391,Colorectal Cancer Colectomy Surgery Perioperat...
3,{'identificationModule': {'nctId': 'NCT0024256...,NCT00242567,Prostate Cancer
4,{'identificationModule': {'nctId': 'NCT0461466...,NCT04614662,Pediatric Cancer Quality of Life
...,...,...,...
2866,{'identificationModule': {'nctId': 'NCT0286815...,NCT02868151,Oral Mucositis
2867,{'identificationModule': {'nctId': 'NCT0286425...,NCT02864251,Non-Small-Cell Lung Carcinoma
2868,{'identificationModule': {'nctId': 'NCT0264915...,NCT02649153,Liver Cancer
2869,{'identificationModule': {'nctId': 'NCT0153505...,NCT01535053,Choriocarcinoma FIGO Stage I Gestational Troph...


In [10]:
# Inspect the lower-cased condition strings in conditions_df.
conditions_df['conditions']

0                               relapsed multiple myeloma
1                          carcinoma, non-small cell lung
2       colorectal cancer colectomy surgery perioperat...
3                                         prostate cancer
4                        pediatric cancer quality of life
                              ...                        
2866                                       oral mucositis
2867                        non-small-cell lung carcinoma
2868                                         liver cancer
2869    choriocarcinoma figo stage i gestational troph...
2870                                   malignant neoplasm
Name: conditions, Length: 2871, dtype: object

In [9]:
# Ordered (keyword, category) rules. The first keyword found in the condition
# text decides the category, so the order of this table is significant.
_CONDITION_RULES = (
  ('cell lung', 'squamous cell'),
  ('head and neck', 'squamous cell'),
  ('squamous cell', 'squamous cell'),
  ('small cell', 'squamous cell'),
  ('lung', 'squamous cell'),
  ('keratosis', 'squamous cell'),
  ('myeloma', 'myeloma'),
  ('sarcoma', 'sarcoma'),
  ('lymphoma', 'lymphoma'),
  ('brain cancer', 'brain'),
  ('melanoma', 'melanoma'),
  ('adenocarcinoma', 'adeno'),
  ('prostate cancer', 'adeno'),
  ('breast', 'ductal'),
  ('leukemia', 'leukemia'),
  ('colorectal', 'adeno'),
  ('glioblastoma', 'brain'),
  ('kidney', 'adeno'),
  ('renal', 'adeno'),
  ('hematopoietic', 'leukemia'),
  ('lymphoid', 'lymphoma'),
  ('cervix', 'adeno'),
  ('cervical', 'adeno'),
  ('liver', 'adeno'),
  ('hepatic', 'adeno'),
  ('hepatocellular', 'adeno'),
  ('nsclc', 'squamous cell'),
  ('thyroid', 'adeno'),
  ('pain', 'pain'),
  ('carcinoma', 'carcinoma'),
)


def conditions_map(condition):
  """Return the category label for a condition string.

  The keywords are tested in table order with the `in` operator and the
  category of the first match is returned. Matching is case-sensitive and
  every keyword is lower case.

  Args:
    condition: text to search for the keywords.

  Returns:
    The category of the first matching keyword, or 'other' when none match.
  """
  for keyword, category in _CONDITION_RULES:
    if keyword in condition:
      return category
  return 'other'

In [10]:
# Label each study by running conditions_map over its condition text
conditions_df['conditions_category'] = conditions_df['conditions'].apply(conditions_map)

In [11]:
# Integer code for each category label; the code is the label's position
# in the ordered tuple below.
category_map = {
    label: code
    for code, label in enumerate((
        'myeloma',
        'squamous cell',
        'adeno',
        'carcinoma',
        'leukemia',
        'ductal',
        'sarcoma',
        'lymphoma',
        'melanoma',
        'brain',
        'pain',
        'other',
    ))
}

In [12]:
# Translate each category label to its integer code in a new column
category_labels = conditions_df['conditions_category']
conditions_df['conditions_category_num'] = category_labels.map(category_map)

In [13]:
# 5-year survival value per category label.
# NOTE(review): figures are hard-coded and no source is cited in this
# notebook -- confirm provenance before relying on them.
conditions_5yr_survival_map = {
    'myeloma': 0.598,
    'squamous cell': 0.99,
    'adeno': 0.175,
    'carcinoma': 0.99,
    'leukemia': 0.65,
    'ductal': 0.99,
    'sarcoma': 0.65,
    'lymphoma': 0.83,
    'melanoma': 0.94,
    'brain': 0.326,
    'pain': 0.68,
    'other': 0.68,
}

In [14]:
# Look up the 5-year survival value for each study's category
category_labels = conditions_df['conditions_category']
conditions_df['survival_5yr_relative'] = category_labels.map(conditions_5yr_survival_map)

In [15]:
# Min and max treatment duration dicts, both derived from one table of
# (min, max) pairs per category label.
# NOTE(review): units are not stated in this notebook (presumably days) -- confirm.
_treatment_duration_bounds = {
    'myeloma': (90, 180),
    'squamous cell': (14, 49),
    'adeno': (360, 1080),
    'carcinoma': (360, 1440),
    'leukemia': (730, 1095),
    'ductal': (365, 1825),
    'sarcoma': (240, 1825),
    'lymphoma': (180, 730),
    'melanoma': (150, 730),
    'brain': (1080, 4320),
    'pain': (14, 4320),
    'other': (14, 4320),
}
conditions_max_treatment_duration_map = {
    label: upper for label, (_, upper) in _treatment_duration_bounds.items()
}
conditions_min_treatment_duration_map = {
    label: lower for label, (lower, _) in _treatment_duration_bounds.items()
}

In [16]:
# Create treatment duration columns by looking up each study's category
for column_name, duration_lookup in (
    ('max_treatment_duration', conditions_max_treatment_duration_map),
    ('min_treatment_duration', conditions_min_treatment_duration_map),
):
    conditions_df[column_name] = conditions_df['conditions_category'].map(duration_lookup)

In [17]:
# Drop original 'conditions' column and export
# conditions_df is rebound here, so on a second run of this cell the column
# is already gone; errors='ignore' makes the drop a no-op instead of a KeyError.
conditions_df = conditions_df.drop(columns=['conditions'], errors='ignore')
# Written to the current working directory, without the index column.
conditions_df.to_csv('conditions_features.csv', index=False)