In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd

import warnings
warnings.simplefilter("ignore")

Данные как они есть в csv:

In [2]:
df_raw = "./data/labeledEligibilitySample1000000.csv"
df = pd.read_table(df_raw, header=None)
df.head()

Unnamed: 0,0,1
0,__label__0,study interventions are recombinant CD40-ligan...
1,__label__0,study interventions are Liposomal doxorubicin ...
2,__label__0,study interventions are BI 836909 . multiple m...
3,__label__0,study interventions are Immunoglobulins . recu...
4,__label__0,study interventions are Paclitaxel . stage ova...


Переименуем колонки для понятности:

In [3]:
df.rename({0: "label", 1: "description"}, axis="columns", inplace=True)
df.head()

Unnamed: 0,label,description
0,__label__0,study interventions are recombinant CD40-ligan...
1,__label__0,study interventions are Liposomal doxorubicin ...
2,__label__0,study interventions are BI 836909 . multiple m...
3,__label__0,study interventions are Immunoglobulins . recu...
4,__label__0,study interventions are Paclitaxel . stage ova...


Основные данные:

- у колонки с label уберём лишнее и пометим её как категорию,

- колонку с описанием разделим на interventions и conditions,

- interventions пометим как категории.

In [4]:
def clean_label(label):
    return label[-1]

def extract_interventions(full_description):
    interventions = full_description.split(" . ")[0]
    interventions = interventions.replace("study interventions are ", "")
    return interventions

def extract_conditions(full_description):
    return full_description.split(" . ")[1]

In [5]:
df["label"] = df["label"].apply(clean_label).astype("int")
df["interventions"] = df["description"].apply(extract_interventions).astype("category")
df["conditions"] = df["description"].apply(extract_conditions)
df.head()

Unnamed: 0,label,description,interventions,conditions
0,0,study interventions are recombinant CD40-ligan...,recombinant CD40-ligand,melanoma skin diagnosis and no active cns meta...
1,0,study interventions are Liposomal doxorubicin ...,Liposomal doxorubicin,colorectal cancer diagnosis and cardiovascular
2,0,study interventions are BI 836909 . multiple m...,BI 836909,multiple myeloma diagnosis and indwelling cent...
3,0,study interventions are Immunoglobulins . recu...,Immunoglobulins,recurrent fallopian tube carcinoma diagnosis a...
4,0,study interventions are Paclitaxel . stage ova...,Paclitaxel,stage ovarian cancer diagnosis and patients mu...


Все диагнозы положим в отдельную колонку:

In [6]:
def extract_diagnosis(condition):
    if "diagnosis" in condition:
        return condition[0:condition.find("diagnosis")-1]

In [7]:
df["diagnosis"] = df["conditions"].apply(extract_diagnosis)
df.head()

Unnamed: 0,label,description,interventions,conditions,diagnosis
0,0,study interventions are recombinant CD40-ligan...,recombinant CD40-ligand,melanoma skin diagnosis and no active cns meta...,melanoma skin
1,0,study interventions are Liposomal doxorubicin ...,Liposomal doxorubicin,colorectal cancer diagnosis and cardiovascular,colorectal cancer
2,0,study interventions are BI 836909 . multiple m...,BI 836909,multiple myeloma diagnosis and indwelling cent...,multiple myeloma
3,0,study interventions are Immunoglobulins . recu...,Immunoglobulins,recurrent fallopian tube carcinoma diagnosis a...,recurrent fallopian tube carcinoma
4,0,study interventions are Paclitaxel . stage ova...,Paclitaxel,stage ovarian cancer diagnosis and patients mu...,stage ovarian cancer


Обработаем `conditions`: оставим только значащие слова и обработаем их стеммером.

In [8]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

stops = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [9]:
def parse_conditions(conditions_raw):
    conditions_norm = []
    non_stops = [token for token in word_tokenize(conditions_raw)
                if token not in stops]
    for token in non_stops:
        conditions_norm.append(stemmer.stem(token))
    return " ".join(conditions_norm)

In [10]:
%%time
df["conditions parsed"] = df["conditions"].apply(parse_conditions)

CPU times: user 10min 12s, sys: 4.47 s, total: 10min 16s
Wall time: 10min 30s


In [11]:
df.head()

Unnamed: 0,label,description,interventions,conditions,diagnosis,conditions parsed
0,0,study interventions are recombinant CD40-ligan...,recombinant CD40-ligand,melanoma skin diagnosis and no active cns meta...,melanoma skin,melanoma skin diagnosi activ cns metastas ct s...
1,0,study interventions are Liposomal doxorubicin ...,Liposomal doxorubicin,colorectal cancer diagnosis and cardiovascular,colorectal cancer,colorect cancer diagnosi cardiovascular
2,0,study interventions are BI 836909 . multiple m...,BI 836909,multiple myeloma diagnosis and indwelling cent...,multiple myeloma,multipl myeloma diagnosi indwel central venous...
3,0,study interventions are Immunoglobulins . recu...,Immunoglobulins,recurrent fallopian tube carcinoma diagnosis a...,recurrent fallopian tube carcinoma,recurr fallopian tube carcinoma diagnosi patie...
4,0,study interventions are Paclitaxel . stage ova...,Paclitaxel,stage ovarian cancer diagnosis and patients mu...,stage ovarian cancer,stage ovarian cancer diagnosi patient must rec...


## Случайные решения

In [19]:
import random
random.seed(1968)

from sklearn.metrics import roc_auc_score

In [13]:
df["random label"] = [random.choice([0, 1]) for i in range(len(df))]

In [14]:
roc_auc_score(df["random label"], df["label"])

0.4998069999982946

## Label encoding

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
interventions_encoder = LabelEncoder()
df["interventions"] = interventions_encoder.fit_transform(df["interventions"])

In [17]:
diagnosis_encoder = LabelEncoder()
df["diagnosis"].fillna("no diagnosis", inplace=True)
df["diagnosis"] = diagnosis_encoder.fit_transform(df["diagnosis"])

In [18]:
df.head()

Unnamed: 0,label,description,interventions,conditions,diagnosis,conditions parsed,random label
0,0,study interventions are recombinant CD40-ligan...,14650,melanoma skin diagnosis and no active cns meta...,4672,melanoma skin diagnosi activ cns metastas ct s...,0
1,0,study interventions are Liposomal doxorubicin ...,6744,colorectal cancer diagnosis and cardiovascular,1743,colorect cancer diagnosi cardiovascular,0
2,0,study interventions are BI 836909 . multiple m...,1433,multiple myeloma diagnosis and indwelling cent...,5033,multipl myeloma diagnosi indwel central venous...,1
3,0,study interventions are Immunoglobulins . recu...,5912,recurrent fallopian tube carcinoma diagnosis a...,6949,recurr fallopian tube carcinoma diagnosi patie...,0
4,0,study interventions are Paclitaxel . stage ova...,8694,stage ovarian cancer diagnosis and patients mu...,8630,stage ovarian cancer diagnosi patient must rec...,0


In [19]:
# df.to_csv("./data/labeledEligibilitySample_parsed.csv", sep=";", index=False)

In [2]:
df = pd.read_csv("./data/labeledEligibilitySample_parsed.csv", sep=";")

In [3]:
df.head()

Unnamed: 0,label,description,interventions,conditions,diagnosis,conditions parsed,random label
0,0,study interventions are recombinant CD40-ligan...,14650,melanoma skin diagnosis and no active cns meta...,4672,melanoma skin diagnosi activ cns metastas ct s...,0
1,0,study interventions are Liposomal doxorubicin ...,6744,colorectal cancer diagnosis and cardiovascular,1743,colorect cancer diagnosi cardiovascular,0
2,0,study interventions are BI 836909 . multiple m...,1433,multiple myeloma diagnosis and indwelling cent...,5033,multipl myeloma diagnosi indwel central venous...,1
3,0,study interventions are Immunoglobulins . recu...,5912,recurrent fallopian tube carcinoma diagnosis a...,6949,recurr fallopian tube carcinoma diagnosi patie...,0
4,0,study interventions are Paclitaxel . stage ova...,8694,stage ovarian cancer diagnosis and patients mu...,8630,stage ovarian cancer diagnosi patient must rec...,0


## TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
vectorizer = TfidfVectorizer()
vectorizer = vectorizer.fit(df["conditions parsed"])

In [15]:
X = vectorizer.transform(df["conditions parsed"])
# X = np.hstack((X, df["interventions"].as_matrix(), df["diagnosis"].as_matrix()))
y = df["label"].as_matrix()

## Log reg

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1968)

In [None]:
# logreg_params = {
#     "C": np.logspace(0.1, 100, 100)
# }

# model = GridSearchCV(
#         estimator=LogisticRegression(),
#         param_grid=logreg_params,
#         scoring="roc_auc",
#         cv=KFold(n_splits=7),
#         verbose=1,
#         n_jobs=-1
#     )
# model.fit(X_train, y_train)
# print("Score = {}\nParams = {}".format(model.best_score_, model.best_params_))

In [20]:
for c_option in np.linspace(0.01, 100, 11):
    model = LogisticRegression(C=c_option)
    model.fit(X_train, y_train)
    roc_score = roc_auc_score(model.predict(X_test), y_test)
    print("C={}, roc-auc={}".format(c_option, roc_score))

C=0.01, roc-auc=0.8127548362894019
C=10.008999999999999, roc-auc=0.8484176888832282
C=20.008, roc-auc=0.8484108224488114
C=30.006999999999998, roc-auc=0.8485220280564185
C=40.00599999999999, roc-auc=0.8485791726315419
C=50.00499999999999, roc-auc=0.8485831806972234
C=60.00399999999999, roc-auc=0.8485934678313858
C=70.003, roc-auc=0.8486629257800645
C=80.002, roc-auc=0.8486580656463057
C=90.00099999999999, roc-auc=0.8486340501375841
C=100.0, roc-auc=0.848614612128752
