In [1]:
import numpy as np
import pandas as pd

from baseline.torch.mydataset import CustomDataset
from baseline.torch.model import CNN_Text
from baseline.torch.train import train, predict_prob, run

import torch
from torchtext import data
import time
from tqdm import tqdm

# Load Data

In [2]:
# Directory that holds the input CSV files. The trailing slash is required:
# later cells build file paths by plain string concatenation.
data_root = "./data/"

In [3]:
# Drug names used throughout the notebook. Order matters: later cells slice
# this list by position (drugs[:8] and drugs[8:]).
drugs = [
    'trametinib', 'fulvestrant', 'lovastatin', 'abiraterone',
    'thalidomide', 'sirolimus', 'simvastatin', 'methotrexate',
    'bortezomib', 'gemcitabine', 'tamoxifen', 'dexamethasone',
    'doxorubicin',
]
len(drugs)

13

In [4]:
# Label table: per the preview below, one row per (drug, file) pair with a
# `lab` value and a free-text `comment`.
lab_csv_path = data_root + 'lab_finfin.csv'
alllab_df = pd.read_csv(lab_csv_path)
alllab_df.head()

Unnamed: 0,drug,file,lab,comment
0,tamoxifen,PMC003XXXXXX.xml\PMC0030XXXXX\PMC3000794.xml,0,none
1,tamoxifen,PMC003XXXXXX.xml\PMC0030XXXXX\PMC3005955.xml,0,none
2,tamoxifen,PMC003XXXXXX.xml\PMC0030XXXXX\PMC3010527.xml,0,none
3,tamoxifen,PMC003XXXXXX.xml\PMC0030XXXXX\PMC3011858.xml,0,none
4,tamoxifen,PMC003XXXXXX.xml\PMC0030XXXXX\PMC3014261.xml,0,none


In [5]:
# Feature table: per the preview below, one row per file with its title and
# abstract text.
fea_csv_path = data_root + 'fea_finfin.csv'
allfea_df = pd.read_csv(fea_csv_path)
allfea_df.head()

Unnamed: 0,file,title,abstract
0,PMC003XXXXXX.xml\PMC0030XXXXX\PMC3000794.xml,erk1 2 dependent vascular endothelial growth f...,background amp aims severe polycystic liver di...
1,PMC003XXXXXX.xml\PMC0030XXXXX\PMC3001231.xml,peg functionalized magnetic nanoparticles for ...,purpose polyethylene glycol ( peg ) functional...
2,PMC003XXXXXX.xml\PMC0030XXXXX\PMC3003872.xml,combination testing \( stage 2 \) of rapamycin...,purpose rapamycin demonstrated broad spectrum ...
3,PMC003XXXXXX.xml\PMC0030XXXXX\PMC3004744.xml,durable responses with the metronomic regimen ...,background targeting the tumor microenvironmen...
4,PMC003XXXXXX.xml\PMC0030XXXXX\PMC3005850.xml,ph sensitive ionomeric particles obtained via ...,silk fibroin based biomaterials have been wide...


In [6]:
# Split the label table by drug: rows for the first eight drugs form the
# training labels, rows for the remaining drugs form the test labels.
# Both frames get a fresh 0..n-1 index.
train_drug_names, test_drug_names = drugs[:8], drugs[8:]
drug_column = alllab_df['drug']
train_lab_df = alllab_df[drug_column.isin(train_drug_names)].reset_index(drop=True)
test_lab_df = alllab_df[drug_column.isin(test_drug_names)].reset_index(drop=True)

In [7]:
# Count the training rows carrying each label value.
lab_column = train_lab_df['lab']
len_0 = int((lab_column == 0).sum())
len_1 = int((lab_column == 1).sum())
# Surplus of lab == 0 rows relative to lab == 1 rows; the next cell uses
# int(ratio) as the number of extra copies of the lab == 1 rows to append.
ratio = (len_0 - len_1)/len_1

In [8]:
# Oversample the lab == 1 rows: append int(ratio) extra copies of them to the
# training labels (no copies are added when int(ratio) <= 0).
# A single pd.concat over a list replaces the original concat-inside-a-loop,
# which re-copied the whole growing frame on every iteration; row order and
# the resulting 0..n-1 index are the same.
# NOTE(review): the duplicates are created before this frame is handed to
# CustomDataset.splits in a later cell, so copies of the same paper may end up
# in both the train and dev splits -- confirm this is intended.
lab1_rows = train_lab_df[train_lab_df['lab'] == 1]
train_lab_df_tmp = pd.concat(
    [train_lab_df] + [lab1_rows] * int(ratio),
    ignore_index=True,
)
train_lab_df = train_lab_df_tmp

In [9]:
# Row counts for lab == 0 and lab == 1 after oversampling, shown as a tuple.
tuple(len(train_lab_df[train_lab_df['lab'] == lab_value]) for lab_value in (0, 1))

(847, 828)

# Build Vocabulary

In [10]:
# Build the torchtext fields and the train/dev/test datasets, timing the whole step.
start_t = time.time()

text_field = data.Field(lower=True) # text input; lower=True requests lowercasing
label_field = data.Field(sequential=False) # label; sequential=False marks it as a single value, not a token sequence

# CustomDataset is project code: splits() returns a (train, dev) pair built from
# the oversampled training labels and the feature table; shuffle=True is passed
# through. NOTE(review): how the dev portion is chosen is not visible here.
train_data, dev_data = CustomDataset.splits(text_field, label_field, train_lab_df, allfea_df, shuffle=True)
# Test dataset built from the held-out drugs' labels.
test_data = CustomDataset(text_field, label_field, test_lab_df, allfea_df)

# Elapsed wall-clock time in seconds, reported in minutes.
end_t = time.time()-start_t
print("Time elapse (min): ", end_t/60)

  3%|▎         | 53/1675 [00:00<00:03, 528.09it/s]

preparing examples...


100%|██████████| 1675/1675 [00:02<00:00, 637.45it/s]
  4%|▎         | 99/2681 [00:00<00:05, 484.99it/s]

dev_index:  -167
preparing examples...


100%|██████████| 2681/2681 [00:04<00:00, 617.91it/s]

Time elapse (min):  0.1164683977762858





In [11]:
# Training mini-batch size.
batch_size = 32
# Both vocabularies are built from train, dev AND test data.
# NOTE(review): including test_data means test-set tokens enter the vocabulary --
# confirm this is intended.
text_field.build_vocab(train_data, dev_data, test_data)
label_field.build_vocab(train_data, dev_data, test_data)
# Train batches use batch_size; the dev iterator's batch size is len(dev_data),
# i.e. the entire dev set in one batch.
# NOTE(review): train_iter / dev_iter are not referenced by the later cells
# visible in this notebook.
train_iter, dev_iter = data.Iterator.splits((train_data, dev_data), 
                                            batch_sizes=(batch_size, len(dev_data)))

# Run the Baseline Model

In [12]:
# (name, field) pairs handed to run() in a later cell.
fields = [
    ('text', text_field),
    ('label', label_field),
]

In [13]:
# Directory for model files. The trailing slash is required: the model path is
# built by plain string concatenation in a later cell.
model_dir_root = "./trained_models/"

In [14]:
# Run the baseline on the held-out drugs (drugs[8:]).
# Per the output below, run() reports a per-drug paper count and first paper,
# and returns a pair of dicts keyed by drug name.
# NOTE(review): whether run() loads or writes the model file is not visible here.
model_path = model_dir_root + 'model_baseline_fin.pkl'
run(
    CNN_Text,
    model_path,
    drugs[8:],
    alllab_df,
    allfea_df,
    fields,
)

  5%|▌         | 19/371 [00:00<00:01, 189.94it/s]

########################
#### drug bortezomib


100%|██████████| 371/371 [00:01<00:00, 233.22it/s]
  6%|▌         | 25/415 [00:00<00:01, 244.98it/s]

The first paper is PMC004XXXXXX.xml\PMC0042XXXXX\PMC4266584.xml
Number of papers be read of drug [bortezomib]: 119
########################
#### drug gemcitabine


100%|██████████| 415/415 [00:01<00:00, 233.58it/s]
  5%|▍         | 25/526 [00:00<00:02, 247.71it/s]

The first paper is PMC004XXXXXX.xml\PMC0048XXXXX\PMC4873426.xml
Number of papers be read of drug [gemcitabine]: 5
########################
#### drug tamoxifen


100%|██████████| 526/526 [00:02<00:00, 251.51it/s]
  5%|▍         | 28/565 [00:00<00:01, 273.84it/s]

The first paper is PMC003XXXXXX.xml\PMC0037XXXXX\PMC3711713.xml
Number of papers be read of drug [tamoxifen]: 46
########################
#### drug dexamethasone


100%|██████████| 565/565 [00:02<00:00, 268.36it/s]
  0%|          | 0/804 [00:00<?, ?it/s]

The first paper is PMC004XXXXXX.xml\PMC0044XXXXX\PMC4422178.xml
Number of papers be read of drug [dexamethasone]: 141
########################
#### drug doxorubicin


100%|██████████| 804/804 [00:03<00:00, 267.08it/s]


The first paper is PMC003XXXXXX.xml\PMC0032XXXXX\PMC3298037.xml
Number of papers be read of drug [doxorubicin]: 384


({'bortezomib': 119,
  'gemcitabine': 5,
  'tamoxifen': 46,
  'dexamethasone': 141,
  'doxorubicin': 384},
 {'bortezomib': 'PMC004XXXXXX.xml\\PMC0042XXXXX\\PMC4266584.xml',
  'gemcitabine': 'PMC004XXXXXX.xml\\PMC0048XXXXX\\PMC4873426.xml',
  'tamoxifen': 'PMC003XXXXXX.xml\\PMC0037XXXXX\\PMC3711713.xml',
  'dexamethasone': 'PMC004XXXXXX.xml\\PMC0044XXXXX\\PMC4422178.xml',
  'doxorubicin': 'PMC003XXXXXX.xml\\PMC0032XXXXX\\PMC3298037.xml'})