In [1]:
import os
import pickle
import requests
import numpy as np
import pandas as pd

In [2]:
data = np.genfromtxt('fields_by_age.txt.gz', dtype=np.uint32, delimiter='\t')

In [3]:
data.shape

(5295794, 3)

In [4]:
data[:10]

array([[    1, 19868,   198],
       [    1, 21131,   263],
       [    1, 20510,   293],
       [    1, 19169,   299],
       [    1, 20510,   301],
       [    1, 21131,   370],
       [    1, 20758,   382],
       [    1, 19169,   806],
       [    1, 21131,   911],
       [    2, 27528,   293]], dtype=uint32)

In [5]:
n_disease = data[:,2].max()
n_disease

1129

In [6]:
sex = pd.read_csv('sex.txt', sep='\t').iloc[:-1,:]
sex

Unnamed: 0,patient,f.31.0.0
0,1,0.0
1,2,0.0
2,3,1.0
3,4,1.0
4,5,1.0
...,...,...
502485,502486,0.0
502486,502487,1.0
502487,502488,0.0
502488,502489,1.0


In [8]:
data = np.vstack([data + np.array([0,0,2]), np.array([sex.patient.to_numpy(), np.repeat(0, sex.shape[0]), 1+sex['f.31.0.0'].to_numpy().astype(np.uint32)]).T])

In [9]:
n_disease = n_disease+2

In [10]:
data

array([[     1,  19868,    200],
       [     1,  21131,    265],
       [     1,  20510,    295],
       ...,
       [502488,      0,      1],
       [502489,      0,      2],
       [502490,      0,      1]])

In [11]:
cancers = pd.read_csv('cancers.txt', sep='\t')
cancers['cancer'] = pd.Categorical(cancers['f.40006.0.0'].str.slice(0,3))

In [12]:
n_cancer = len(cancers.cancer.cat.categories)

In [13]:
cancers.iloc[15:18,:]

Unnamed: 0,patient,f.40006.0.0,aod,cancer
15,69,,13995,
16,70,C504,23717,C50
17,71,,14519,


In [14]:
data = np.vstack([data, np.array([cancers.patient.to_numpy(), cancers.aod.to_numpy(), cancers.cancer.cat.codes.to_numpy()+2+n_disease]).T])

In [15]:
deaths = pd.read_csv('death.txt', sep='\t')
deaths

Unnamed: 0,patient,death,aod
0,47,1,25673
1,61,1,25206
2,69,1,24969
3,88,1,28852
4,105,1,23134
...,...,...,...
30198,502431,1,27769
30199,502435,1,29866
30200,502443,1,22688
30201,502457,1,28549


In [16]:
data = np.vstack([data, np.array([deaths.patient.to_numpy(), deaths.aod.to_numpy(), deaths.death+1+n_disease+n_cancer]).T])

In [17]:
data

array([[     1,  19868,    200],
       [     1,  21131,    265],
       [     1,  20510,    295],
       ...,
       [502443,  22688,   1259],
       [502457,  28549,   1259],
       [502489,  24934,   1259]])

In [18]:
data.max()

4294954573

In [19]:
data= data[np.argsort(data[:,0] + data[:,1].astype('float')/100000 + (data[:,2]==data[:,2].max()))]

In [20]:
data

array([[         1,          0,          1],
       [         1,      17900,       1172],
       [         1,      19169,        808],
       ...,
       [    501663, 4294950190,        715],
       [    502242, 4294953814,        896],
       [    502334, 4294947970,        879]])

In [21]:
np.max(data[:,1])

4294954573

In [22]:
np.sum(data[:,1] >  365*150)

1482

In [23]:
data.max(0)

array([    502490, 4294954573,       1259])

In [24]:
data = data[data[:,1] < 365*120,:].astype('uint32')

In [25]:
data.max(0)

array([502490,  33790,   1259], dtype=uint32)

In [26]:
data.min(0)

array([1, 0, 1], dtype=uint32)

In [27]:
# export to bin files
train_val_split = data[:,0] <= 400000
data[train_val_split].tofile( 'train.bin')
data[~train_val_split].tofile('val.bin')

# save the meta information as well, to help us encode/decode later
vocab_size = data.max(0)[2] + 2
meta = {
    'vocab_size': vocab_size,
    'itos': {i:i for i in range(vocab_size)},
    'stoi': {i:i for i in range(vocab_size)},
}
with open('meta.pkl', 'wb') as f:
    pickle.dump(meta, f)

In [28]:
data[~train_val_split].shape

(1208102, 3)

In [29]:
data[:10,][-1]

array([    1, 21131,   265], dtype=uint32)

In [30]:
data[10]

array([    1, 21131,   913], dtype=uint32)

In [31]:
train_val_split.sum()

4708659

In [32]:
data[:,0].max()

502490

In [33]:
vocab_size

1261

In [34]:
import re

In [35]:
foo = re.sub("([0-9]+|-)\n","\t",
"""C00-C14 Malignant neoplasms of lip, oral cavity and pharynx-
C00 Malignant neoplasm of lip65
C01 Malignant neoplasm of base of tongue231
C02 Malignant neoplasm of other and unspecified parts of tongue339
C03 Malignant neoplasm of gum109
C04 Malignant neoplasm of floor of mouth83
C05 Malignant neoplasm of palate102
C06 Malignant neoplasm of other and unspecified parts of mouth122
C07 Malignant neoplasm of parotid gland152
C08 Malignant neoplasm of other and unspecified major salivary glands42
C09 Malignant neoplasm of tonsil401
C10 Malignant neoplasm of oropharynx68
C11 Malignant neoplasm of nasopharynx56
C12 Malignant neoplasm of pyriform sinus51
C13 Malignant neoplasm of hypopharynx41
C14 Malignant neoplasm of other and ill-defined sites in the lip, oral cavity and pharynx28
C15-C26 Malignant neoplasms of digestive organs-
C15 Malignant neoplasm of oesophagus1395
C16 Malignant neoplasm of stomach1035
C17 Malignant neoplasm of small intestine398
C18 Malignant neoplasm of colon6246
C19 Malignant neoplasm of rectosigmoid junction642
C20 Malignant neoplasm of rectum2703
C21 Malignant neoplasm of anus and anal canal336
C22 Malignant neoplasm of liver and intrahepatic bile ducts810
C23 Malignant neoplasm of gallbladder164
C24 Malignant neoplasm of other and unspecified parts of biliary tract266
C25 Malignant neoplasm of pancreas1648
C26 Malignant neoplasm of other and ill-defined digestive organs45
C30-C39 Malignant neoplasms of respiratory and intrathoracic organs-
C30 Malignant neoplasm of nasal cavity and middle ear68
C31 Malignant neoplasm of accessory sinuses30
C32 Malignant neoplasm of larynx419
C33 Malignant neoplasm of trachea9
C34 Malignant neoplasm of bronchus and lung5596
C37 Malignant neoplasm of thymus62
C38 Malignant neoplasm of heart, mediastinum and pleura25
C39 Malignant neoplasm of other and ill-defined sites in the respiratory system and intrathoracic organs4
C40-C41 Malignant neoplasms of bone and articular cartilage-
C40 Malignant neoplasm of bone and articular cartilage of limbs69
C41 Malignant neoplasm of bone and articular cartilage of other and unspecified sites56
C42 hematopoietic and reticuloendothelial systems (ICD-O-3 specific)46
C43-C44 Melanoma and other malignant neoplasms of skin-
C43 Malignant melanoma of skin5575
C44 Other malignant neoplasms of skin40662
C45-C49 Malignant neoplasms of mesothelial and soft tissue-
C45 Mesothelioma509
C46 Kaposi's sarcoma24
C47 Malignant neoplasm of peripheral nerves and autonomic nervous system23
C48 Malignant neoplasm of retroperitoneum and peritoneum227
C49 Malignant neoplasm of other connective and soft tissue487
C50-C50 Malignant neoplasm of breast-
C50 Malignant neoplasm of breast20813
C51-C58 Malignant neoplasms of female genital organs-
C51 Malignant neoplasm of vulva241
C52 Malignant neoplasm of vagina48
C53 Malignant neoplasm of cervix uteri613
C54 Malignant neoplasm of corpus uteri2754
C55 Malignant neoplasm of uterus, part unspecified66
C56 Malignant neoplasm of ovary1860
C57 Malignant neoplasm of other and unspecified female genital organs190
C58 Malignant neoplasm of placenta13
C60-C63 Malignant neoplasms of male genital organs-
C60 Malignant neoplasm of penis114
C61 Malignant neoplasm of prostate16001
C62 Malignant neoplasm of testis625
C63 Malignant neoplasm of other and unspecified male genital organs25
C64-C68 Malignant neoplasms of urinary tract-
C64 Malignant neoplasm of kidney, except renal pelvis2302
C65 Malignant neoplasm of renal pelvis147
C66 Malignant neoplasm of ureter152
C67 Malignant neoplasm of bladder2044
C68 Malignant neoplasm of other and unspecified urinary organs46
C69-C72 Malignant neoplasms of eye, brain and other parts of central nervous system-
C69 Malignant neoplasm of eye and adnexa229
C70 Malignant neoplasm of meninges24
C71 Malignant neoplasm of brain1060
C72 Malignant neoplasm of spinal cord, cranial nerves and other parts of central nervous system35
C73-C75 Malignant neoplasms of thyroid and other endocrine glands-
C73 Malignant neoplasm of thyroid gland928
C74 Malignant neoplasm of adrenal gland46
C75 Malignant neoplasm of other endocrine glands and related structures27
C76-C80 Malignant neoplasms of ill-defined, secondary and unspecified sites-
C76 Malignant neoplasm of other and ill-defined sites61
C77 Secondary and unspecified malignant neoplasm of lymph nodes203
C78 Secondary malignant neoplasm of respiratory and digestive organs248
C79 Secondary malignant neoplasm of other sites114
C80 Malignant neoplasm without specification of site427
C81-C96 Malignant neoplasms, stated or presumed to be primary, of lymphoid, haematopoietic and related tissue-
C81 Hodgkin's disease449
C82 Follicular [nodular] non-Hodgkin's lymphoma868
C83 Diffuse non-Hodgkin's lymphoma1777
C84 Peripheral and cutaneous T-cell lymphomas294
C85 Other and unspecified types of non-Hodgkin's lymphoma846
C86 Other specified types of T/NK-cell lymphoma7
C88 Malignant immunoproliferative diseases92
C90 Multiple myeloma and malignant plasma cell neoplasms1320
C91 Lymphoid leukaemia1232
C92 Myeloid leukaemia814
C93 Monocytic leukaemia20
C94 Other leukaemias of specified cell type15
C95 Leukaemia of unspecified cell type28
C96 Other and unspecified malignant neoplasms of lymphoid, haematopoietic and related tissue54
C97-C97 Malignant neoplasms of independent (primary) multiple sites-
C97 Malignant neoplasms of independent (primary) multiple sites1
D00-D09 In situ neoplasms-
D00 Carcinoma in situ of oral cavity, oesophagus and stomach165
D01 Carcinoma in situ of other and unspecified digestive organs880
D02 Carcinoma in situ of middle ear and respiratory system106
D03 Melanoma in situ2399
D04 Carcinoma in situ of skin1816
D05 Carcinoma in situ of breast3757
D06 Carcinoma in situ of cervix uteri4229
D07 Carcinoma in situ of other and unspecified genital organs958
D09 Carcinoma in situ of other and unspecified sites1842
D10-D36 Benign neoplasms-
D10 Benign neoplasm of mouth and pharynx1
D11 Benign neoplasm of major salivary glands49
D12 Benign neoplasm of colon, rectum, anus and anal canal5
D13 Benign neoplasm of other and ill-defined parts of digestive system1
D15 Benign neoplasm of other and unspecified intrathoracic organs3
D16 Benign neoplasm of bone and articular cartilage3
D18 Haemangioma and lymphangioma, any site6
D21 Other benign neoplasms of connective and other soft tissue5
D27 Benign neoplasm of ovary3
D29 Benign neoplasm of male genital organs1
D30 Benign neoplasm of urinary organs4
D32 Benign neoplasm of meninges831
D33 Benign neoplasm of brain and other parts of central nervous system388
D34 Benign neoplasm of thyroid gland5
D35 Benign neoplasm of other and unspecified endocrine glands370
D36 Benign neoplasm of other and unspecified sites4
D37-D48 Neoplasms of uncertain or unknown behaviour-
D37 Neoplasm of uncertain or unknown behaviour of oral cavity and digestive organs585
D38 Neoplasm of uncertain or unknown behaviour of middle ear and respiratory and intrathoracic organs38
D39 Neoplasm of uncertain or unknown behaviour of female genital organs197
D40 Neoplasm of uncertain or unknown behaviour of male genital organs41
D41 Neoplasm of uncertain or unknown behaviour of urinary organs1052
D42 Neoplasm of uncertain or unknown behaviour of meninges72
D43 Neoplasm of uncertain or unknown behaviour of brain and central nervous system117
D44 Neoplasm of uncertain or unknown behaviour of endocrine glands88
D45 Polycythaemia vera261
D46 Myelodysplastic syndromes408
D47 Other neoplasms of uncertain or unknown behaviour of lymphoid, haematopoietic and related tissue820
D48 Neoplasm of uncertain or unknown behaviour of other and unspecified sites420
O00-O08 Pregnancy with abortive outcome-
O01 Hydatidiform mole38""")
bar = {x[:3]: x for x in foo.split("\t") if not x[3]=="-"}

In [36]:
labels = pd.read_csv("fields.txt", header=None).merge(pd.read_csv("icd10_codes_mod.tsv", sep='\t',header=None, index_col=0), left_on=0, right_index=True)
labels[1] = labels[1].str.replace("Source of report of ","")
all_labels = ['Padding'] + ['Healthy'] + ['Female'] + ['Male'] + labels[1].to_list() + ["CXX Unknown Cancer"] + [bar[x] for x in cancers.cancer.cat.categories.to_list()] + ['Death']
len(all_labels)

1261

In [37]:
vocab_size

1261

In [38]:
np.savetxt('labels.csv', np.array(all_labels),fmt='%s')

In [39]:
{all_labels[i]: (data[:,2]==i).sum() for i in range(1130,len(all_labels))}

{'Q97 (other sex chromosome abnormalities, female phenotype, not elsewhere classified)': 68,
 'Q98 (other sex chromosome abnormalities, male phenotype, not elsewhere classified)': 62,
 'Q99 (other chromosome abnormalities, not elsewhere classified)': 6276,
 'CXX Unknown Cancer': 36,
 'C00 Malignant neoplasm of lip': 112,
 'C01 Malignant neoplasm of base of tongue': 186,
 'C02 Malignant neoplasm of other and unspecified parts of tongue': 43,
 'C03 Malignant neoplasm of gum': 50,
 'C04 Malignant neoplasm of floor of mouth': 55,
 'C05 Malignant neoplasm of palate': 58,
 'C06 Malignant neoplasm of other and unspecified parts of mouth': 87,
 'C07 Malignant neoplasm of parotid gland': 21,
 'C08 Malignant neoplasm of other and unspecified major salivary glands': 229,
 'C09 Malignant neoplasm of tonsil': 32,
 'C10 Malignant neoplasm of oropharynx': 32,
 'C11 Malignant neoplasm of nasopharynx': 19,
 'C12 Malignant neoplasm of pyriform sinus': 13,
 'C13 Malignant neoplasm of hypopharynx': 18,
 '

In [40]:
data

array([[     1,      0,      1],
       [     1,  17900,   1172],
       [     1,  19169,    808],
       ...,
       [502490,  26107,    637],
       [502490,  26107,    640],
       [502490,  27106,    670]], dtype=uint32)

In [41]:
data

array([[     1,      0,      1],
       [     1,  17900,   1172],
       [     1,  19169,    808],
       ...,
       [502490,  26107,    637],
       [502490,  26107,    640],
       [502490,  27106,    670]], dtype=uint32)

In [42]:
vocab_size

1261

In [43]:
np.isinf(data).sum()

0

In [44]:
data.max(0)

array([502490,  33790,   1259], dtype=uint32)

In [45]:
deaths

Unnamed: 0,patient,death,aod
0,47,1,25673
1,61,1,25206
2,69,1,24969
3,88,1,28852
4,105,1,23134
...,...,...,...
30198,502431,1,27769
30199,502435,1,29866
30200,502443,1,22688
30201,502457,1,28549


In [46]:
np.where(data[:,0]==47)

(array([477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489,
        490, 491, 492, 493, 494, 495, 496, 497, 530]),)

In [47]:
import re
foo = re.sub("[0-9]+\n","\t",
"""C00-C14 Malignant neoplasms of lip, oral cavity and pharynx1890
C15-C26 Malignant neoplasms of digestive organs15688
C30-C39 Malignant neoplasms of respiratory and intrathoracic organs6213
C40-C41 Malignant neoplasms of bone and articular cartilage171
C43-C44 Melanoma and other malignant neoplasms of skin46237
C45-C49 Malignant neoplasms of mesothelial and soft tissue1270
C50-C50 Malignant neoplasm of breast20813
C51-C58 Malignant neoplasms of female genital organs5785
C60-C63 Malignant neoplasms of male genital organs16765
C64-C68 Malignant neoplasms of urinary tract4691
C69-C72 Malignant neoplasms of eye, brain and other parts of central nervous system1348
C73-C75 Malignant neoplasms of thyroid and other endocrine glands1001
C76-C80 Malignant neoplasms of ill-defined, secondary and unspecified sites1053
C81-C96 Malignant neoplasms, stated or presumed to be primary, of lymphoid, haematopoietic and related tissue7816
C97-C97 Malignant neoplasms of independent (primary) multiple sites1
D00-D09 In situ neoplasms16152
D10-D36 Benign neoplasms1679
D37-D48 Neoplasms of uncertain or unknown behaviour4099
O00-O08 Pregnancy with abortive outcome""")

In [48]:
cancer_class = {k[0] + f"{i:02}": k for k in foo.split("\t") for i in range(int(k[1:3]), 1+int(k[5:7]))}
cancer_class

{'C00': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C01': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C02': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C03': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C04': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C05': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C06': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C07': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C08': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C09': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C10': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C11': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C12': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C13': 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx',
 'C14'

In [49]:
sum(train_val_split)/20

235432.95