In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

In [2]:
data = pd.read_csv('../../data/ml/inclusion_dataset_cleaned.csv')
data = data.drop(['Unnamed: 10'], axis=1)
print(data.shape)
print(data.columns)

train_df = data[data.is_train == 1]
test_df = data[data.is_train == 0]

train_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

print(train_df.shape, test_df.shape)

(300, 10)
Index(['sf', 'cui', 'lf', 'lf_base', 'source', 'semtypes', 'semgroups',
       'is_train', 'is_accurate', 'is_clinical_relevant'],
      dtype='object')
(200, 10) (100, 10)


In [3]:
task = "is_accurate"
# task = "is_clinical_relevant"

In [4]:
neg_examples = data[data[task] == 0].shape[0]
pos_examples = data[data[task] == 1].shape[0]

print('Negative Examples={}, Positive Examples={}'.format(neg_examples, pos_examples))

Negative Examples=78, Positive Examples=222


In [5]:
# Generate features
X_str = []
y_tr = []
for _, row in train_df.iterrows():
    row = row.to_dict()
    x = {}
    sem_groups = row['semgroups'].split('|')
    for sem_group in sem_groups:
        x[sem_group] = 1.0
    lfs = row['lf'].split('|')
    x['support'] = len(lfs)
    y_tr.append(row[task])
    
    sem_types = row['semtypes'].split('|')
    for sem_type in sem_types:
        x[sem_type] = 1.0
    X_str.append(x)

X_ste = []
y_te = []
for _, row in test_df.iterrows():
    row = row.to_dict()
    x = {}
    sem_groups = row['semgroups'].split('|')
    for sem_group in sem_groups:
        x[sem_group] = 1.0
    lfs = row['lf'].split('|')
    x['support'] = len(lfs)
    y_te.append(row[task])
    
    sem_types = row['semtypes'].split('|')
    for sem_type in sem_types:
        x[sem_type] = 1.0
    X_ste.append(x)

In [6]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
X_tr = dv.fit_transform(X_str, y_tr)
X_te = dv.transform(X_ste)

In [7]:
if task == "is_accurate": # Not using these features gave more score for clinically relevant task
    # Any other features
    import editdistance as ed

    num_features = 5

    print(X_tr.shape, X_te.shape)

    new_f = np.zeros((X_tr.shape[0],num_features))
    X_tr = np.hstack((X_tr, new_f))
    new_f = np.zeros((X_te.shape[0],num_features))
    X_te = np.hstack((X_te, new_f))
    for X, df in zip([X_tr, X_te], [train_df, test_df]):
        for index, row in df.iterrows():
            sf = row['sf']
            lf = row['lf']
            lf_base = row['lf_base']
            sflf = ''.join([x[0] for x in lf.split(' ') if x])
            sflf_base = ''.join([x[0] for x in lf_base.split(' ') if x])
            X[index, 100] = ed.eval(sf.lower(), sflf.lower())
            X[index, 101] = ed.eval(sf.lower(), sflf.lower())
            X[index, 102] = len(lf_base)
            X[index, 103] = len([x for x in lf.split(' ') if x in STOPWORDS])
            X[index, 104] = 1 if row['source'] == "pubmed" else 0

    print(X_tr.shape, X_te.shape)

(200, 100) (100, 100)
(200, 105) (100, 105)


In [8]:
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import StratifiedShuffleSplit, KFold

from sklearn.metrics import classification_report, roc_auc_score


USE_RF = True
if USE_RF:
    estimator = RandomForestClassifier(n_estimators=200)
else:
    estimator = LogisticRegression(solver='liblinear')
    

sss = StratifiedShuffleSplit(n_splits=99, test_size=0.2)

y_tests = []
for train_index, test_index in sss.split(X_tr, y_tr):
    X_train, X_test = X_tr[train_index], X_tr[test_index]
    y_train, y_test = np.array(y_tr)[train_index], np.array(y_tr)[test_index]
    auc = []
    estimator.fit(X_train, y_train)
    y_proba = estimator.predict_proba(X_test)[:, 1]
    y_tests.append(estimator.predict_proba(X_te)[:, 1])
    score = roc_auc_score(y_test, y_proba)
    auc.append(score)
print(np.mean(auc))

0.8338557993730408


In [9]:
y_proba = np.mean(np.vstack(y_tests), axis=0)
y_pred = (y_proba >= 0.5).astype(int)

In [10]:
roc_auc = roc_auc_score(y_te, y_proba)
print('ROC AUC={}'.format(roc_auc))

cr = classification_report(y_te, y_pred)
print(cr)

ROC AUC=0.8068887634105026
              precision    recall  f1-score   support

           0       1.00      0.61      0.76        23
           1       0.90      1.00      0.94        77

    accuracy                           0.91       100
   macro avg       0.95      0.80      0.85       100
weighted avg       0.92      0.91      0.90       100



In [11]:
mistakes = [i for i in range(100) if y_pred[i] != y_te[i]]

In [12]:
test_df.loc[mistakes, :]

Unnamed: 0,sf,cui,lf,lf_base,source,semtypes,semgroups,is_train,is_accurate,is_clinical_relevant
19,TERC,C1148756,telomerase activity,telomerase activity,pubmed,Genetic Function,Physiology,0,0,0
25,PRR,C0871208|C1514918|C1521828|C4554488,pup retrieval rate,pup retrieval rate,pubmed,Activity|Health Care Activity|Quantitative Con...,Activities & Behaviors|Concepts & Ideas|Proced...,0,0,0
45,TT,C0003320,t antigens,antigen,pubmed,Immunologic Factor,Chemicals & Drugs,0,0,0
48,SLY,C0221205|C0457802|C0982164|C1138844|C3151529,strawberry lethal yellows,strawberry lethal yellow,pubmed,Finding|Food|Plant|Qualitative Concept,Concepts & Ideas|Disorders|Living Beings|Objects,0,0,0
69,SNL,C0023690|C0037925,spinal cord ligation,spinal cord ligation,pubmed,"Body Part, Organ, or Organ Component|Therapeut...",Anatomy|Procedures,0,0,0
70,PEC,C0014122|C0022067|C0439453|C0450254|C1881173|C...,pathogenic ie colii,pathogenic ie colii,pubmed,Disease or Syndrome|Finding|Geographic Area|In...,Concepts & Ideas|Disorders|Geographic Areas|Li...,0,0,1
75,EE,C0085584|C1963101|C4554030,encephalopathy|encephalopathies,encephalopathy,pubmed,Disease or Syndrome|Finding,Disorders,0,0,1
93,HOLD,C1553387|C1720083|C1948035|C3853841,hold dosing instruction fragment,hold dosing instruction fragment,umls,Activity|Functional Concept|Intellectual Product,Activities & Behaviors|Concepts & Ideas,0,0,0
97,PA,C1418892,prh1 gene,prh1 gene,umls,Gene or Genome,Genes & Molecular Sequences,0,0,0
