In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

In [2]:
data = pd.read_csv('../../data/ml/inclusion_dataset_cleaned.csv')
data = data.drop(['Unnamed: 10'], axis=1)
print(data.shape)
print(data.columns)

train_df = data[data.is_train == 1]
test_df = data[data.is_train == 0]

train_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

print(train_df.shape, test_df.shape)

(305, 10)
Index(['sf', 'cui', 'lf', 'lf_base', 'source', 'semtypes', 'semgroups',
       'is_train', 'is_accurate', 'is_clinical_relevant'],
      dtype='object')
(202, 10) (103, 10)


In [3]:
task1 = "is_accurate"
task2 = "is_clinical_relevant"

In [4]:
neg_examples = data[(data[task1] == 0) | (data[task2] == 0)].shape[0]
pos_examples = data[(data[task1] == 1) & (data[task2] == 1)].shape[0]

print('Negative Examples={}, Positive Examples={}'.format(neg_examples, pos_examples))

Negative Examples=192, Positive Examples=113


In [5]:
# Generate features
X_str = []
y_tr = []
for _, row in train_df.iterrows():
    row = row.to_dict()
    x = {}
    sem_groups = row['semgroups'].split('|')
    for sem_group in sem_groups:
        x[sem_group] = 1.0
    lfs = row['lf'].split('|')
    x['support'] = len(lfs)
    y_tr.append(1 if row[task1] == 1 and row[task2] == 1 else 0)
    
    sem_types = row['semtypes'].split('|')
    for sem_type in sem_types:
        x[sem_type] = 1.0
    X_str.append(x)

X_ste = []
y_te = []
for _, row in test_df.iterrows():
    row = row.to_dict()
    x = {}
    sem_groups = row['semgroups'].split('|')
    for sem_group in sem_groups:
        x[sem_group] = 1.0
    lfs = row['lf'].split('|')
    x['support'] = len(lfs)
    y_te.append(1 if row[task1] == 1 and row[task2] == 1 else 0)
    
    sem_types = row['semtypes'].split('|')
    for sem_type in sem_types:
        x[sem_type] = 1.0
    X_ste.append(x)

In [6]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
X_tr = dv.fit_transform(X_str, y_tr)
X_te = dv.transform(X_ste)

In [7]:
# Any other features
import editdistance as ed

num_features = 5

print(X_tr.shape, X_te.shape)

new_f = np.zeros((X_tr.shape[0],num_features))
X_tr = np.hstack((X_tr, new_f))
new_f = np.zeros((X_te.shape[0],num_features))
X_te = np.hstack((X_te, new_f))
for X, df in zip([X_tr, X_te], [train_df, test_df]):
    for index, row in df.iterrows():
        sf = row['sf']
        lf = row['lf']
        lf_base = row['lf_base']
        sflf = ''.join([x[0] for x in lf.split(' ') if x])
        sflf_base = ''.join([x[0] for x in lf_base.split(' ') if x])
        X[index, 100] = ed.eval(sf.lower(), sflf.lower())
        X[index, 101] = ed.eval(sf.lower(), sflf_base.lower())
        X[index, 102] = len(lf_base)
        X[index, 103] = len([x for x in lf.split(' ') if x in STOPWORDS])
        X[index, 104] = 1 if row['source'] == "pubmed" else 0

print(X_tr.shape, X_te.shape)

(202, 100) (103, 100)
(202, 105) (103, 105)


In [8]:
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import StratifiedShuffleSplit, KFold

from sklearn.metrics import classification_report, roc_auc_score


USE_RF = True
if USE_RF:
    estimator = RandomForestClassifier(n_estimators=200)
else:
    estimator = LogisticRegression(solver='liblinear')
    

sss = StratifiedShuffleSplit(n_splits=99, test_size=0.2)

y_tests = []
for train_index, test_index in sss.split(X_tr, y_tr):
    X_train, X_test = X_tr[train_index], X_tr[test_index]
    y_train, y_test = np.array(y_tr)[train_index], np.array(y_tr)[test_index]
    auc = []
    estimator.fit(X_train, y_train)
    y_proba = estimator.predict_proba(X_test)[:, 1]
    y_tests.append(estimator.predict_proba(X_te)[:, 1])
    score = roc_auc_score(y_test, y_proba)
    auc.append(score)
print(np.mean(auc))

0.7817460317460319


In [9]:
y_proba = np.mean(np.vstack(y_tests), axis=0)
y_pred = (y_proba >= 0.5).astype(int)

In [10]:
roc_auc = roc_auc_score(y_te, y_proba)
print('ROC AUC={}'.format(roc_auc))

cr = classification_report(y_te, y_pred)
print(cr)

ROC AUC=0.7682170542635658
              precision    recall  f1-score   support

           0       0.72      0.88      0.79        60
           1       0.76      0.51      0.61        43

    accuracy                           0.73       103
   macro avg       0.74      0.70      0.70       103
weighted avg       0.73      0.73      0.72       103



In [11]:
# Train on all data
estimator.fit(X_tr, y_tr)

y_proba = estimator.predict_proba(X_te)[:, 1]
y_pred = (y_proba >= 0.5).astype(int)

roc_auc = roc_auc_score(y_te, y_proba)
print('ROC AUC={}'.format(roc_auc))

cr = classification_report(y_te, y_pred)
print(cr)

ROC AUC=0.7672480620155039
              precision    recall  f1-score   support

           0       0.72      0.87      0.79        60
           1       0.74      0.53      0.62        43

    accuracy                           0.73       103
   macro avg       0.73      0.70      0.70       103
weighted avg       0.73      0.73      0.72       103



In [12]:
# Save the model and the transformer
from joblib import dump, load

PATH = '../../data/'

dump(estimator, PATH + 'inclusion_estimator.joblib')
dump(dv, PATH + 'inclusion_transformer.joblib')

['../../data/inclusion_transformer.joblib']

In [13]:
mistakes = [i for i in range(len(y_te)) if y_pred[i] != y_te[i]]

In [14]:
test_df.loc[mistakes, :]

Unnamed: 0,sf,cui,lf,lf_base,source,semtypes,semgroups,is_train,is_accurate,is_clinical_relevant
1,RAISE,C0220908|C0220909|C0456962|C1698960|C1710031|C...,rapid indexbased screening engine,rapid indexbased screening engine,pubmed,Diagnostic Procedure|Functional Concept|Health...,Concepts & Ideas|Procedures,0,1,0
4,HDCT,C0007673|C0444956|C1627358|C2349975|C2985765|C...,high dose enhancement ct,high dose enhancement ct,pubmed,"Activity|Amino Acid, Peptide, or Protein|Gene ...",Activities & Behaviors|Chemicals & Drugs|Conce...,0,1,1
5,PCE,C0419121|C1511572,passive cycle exercise,passive cycle exercise,pubmed,Temporal Concept|Therapeutic or Preventive Pro...,Concepts & Ideas|Procedures,0,1,0
7,TCT,C0087111|C1292734|C1522326,treated with 60coteletherapy,treat 60coteletherapy,pubmed,Functional Concept|Therapeutic or Preventive P...,Concepts & Ideas|Procedures,0,1,1
9,VDA,C0597667,vitamin d analogues,vitamin analogue,pubmed,Organic Chemical|Vitamin,Chemicals & Drugs,0,1,1
21,LSH,C0920283,lymphocytespecific helicase,lymphocytespecific helicase,pubmed,"Amino Acid, Peptide, or Protein|Enzyme",Chemicals & Drugs,0,1,0
22,PH,C0018787|C0039005|C3665571|C4554819,porcine heart,porcine heart,pubmed,"Body Part, Organ, or Organ Component|Finding|M...",Anatomy|Disorders|Living Beings,0,1,0
23,PMD,C0011570|C0011581|C0344315|C0460137|C4049644|C...,premovement emg depression,premovement emg depression,pubmed,Functional Concept|Intellectual Product|Mental...,Concepts & Ideas|Disorders,0,1,1
26,EFD,C0001948|C0205435|C0684271|C1279901|C1279919,early first drinking,early first drinking,pubmed,Individual Behavior|Organism Function|Qualitat...,Activities & Behaviors|Concepts & Ideas|Physio...,0,1,1
29,CBT,C3880387,captive bubble tensiometer,captive bubble tensiometer,pubmed,Medical Device,Devices,0,1,0
