In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd

import warnings
warnings.simplefilter("ignore")

Данные как они есть в csv:

In [None]:
df_raw = "./data/labeledEligibilitySample1000000.csv"
df = pd.read_table(df_raw, header=None)
df.head()

Переименуем колонки для понятности:

In [None]:
df.rename({0: "label", 1: "description"}, axis="columns", inplace=True)
df.head()

Основные данные:

- у колонки с label уберём лишнее и пометим её как категорию,

- колонку с описанием разделим на interventions и conditions,

- interventions пометим как категории.

In [None]:
def clean_label(label):
    return label[-1]

def extract_interventions(full_description):
    interventions = full_description.split(" . ")[0]
    interventions = interventions.replace("study interventions are ", "")
    return interventions

def extract_conditions(full_description):
    return full_description.split(" . ")[1]

In [None]:
df["label"] = df["label"].apply(clean_label).astype("int")
df["interventions"] = df["description"].apply(extract_interventions).astype("category")
df["conditions"] = df["description"].apply(extract_conditions)
df.head()

Все диагнозы положим в отдельную колонку:

In [None]:
def extract_diagnosis(condition):
    if "diagnosis" in condition:
        return condition[0:condition.find("diagnosis")-1]

In [None]:
df["diagnosis"] = df["conditions"].apply(extract_diagnosis)
df.head()

Обработаем `conditions`: оставим только значащие слова и обработаем их стеммером.

In [None]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

stops = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [None]:
def parse_conditions(conditions_raw):
    conditions_norm = []
    non_stops = [token for token in word_tokenize(conditions_raw)
                if token not in stops]
    for token in non_stops:
        conditions_norm.append(stemmer.stem(token))
    return " ".join(conditions_norm)

In [None]:
%%time
df["conditions parsed"] = df["conditions"].apply(parse_conditions)

In [None]:
df.head()

## Случайные решения

In [2]:
import random
random.seed(1968)

from sklearn.metrics import roc_auc_score

In [5]:
df["random label"] = [random.choice([0, 1]) for i in range(len(df))]

In [6]:
roc_auc_score(df["random label"], df["label"])

0.4998069999982946

## Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
interventions_encoder = LabelEncoder()
df["interventions"] = interventions_encoder.fit_transform(df["interventions"])

In [None]:
diagnosis_encoder = LabelEncoder()
df["diagnosis"].fillna("no diagnosis", inplace=True)
df["diagnosis"] = diagnosis_encoder.fit_transform(df["diagnosis"])
df.head()

In [4]:
# df.to_csv("./data/labeledEligibilitySample_parsed.csv", sep=";", index=False)

df = pd.read_csv("./data/labeledEligibilitySample_parsed.csv", sep=";")
df.head()

Unnamed: 0,label,description,interventions,conditions,diagnosis,conditions parsed,random label
0,0,study interventions are recombinant CD40-ligan...,14650,melanoma skin diagnosis and no active cns meta...,4672,melanoma skin diagnosi activ cns metastas ct s...,0
1,0,study interventions are Liposomal doxorubicin ...,6744,colorectal cancer diagnosis and cardiovascular,1743,colorect cancer diagnosi cardiovascular,0
2,0,study interventions are BI 836909 . multiple m...,1433,multiple myeloma diagnosis and indwelling cent...,5033,multipl myeloma diagnosi indwel central venous...,1
3,0,study interventions are Immunoglobulins . recu...,5912,recurrent fallopian tube carcinoma diagnosis a...,6949,recurr fallopian tube carcinoma diagnosi patie...,0
4,0,study interventions are Paclitaxel . stage ova...,8694,stage ovarian cancer diagnosis and patients mu...,8630,stage ovarian cancer diagnosi patient must rec...,0


## TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
vectorizer = TfidfVectorizer()
vectorizer = vectorizer.fit(df["conditions parsed"])

In [9]:
X = vectorizer.transform(df["conditions parsed"])
# X = np.hstack((X, df["interventions"].as_matrix(), df["diagnosis"].as_matrix()))
y = df["label"].as_matrix()

## Log reg

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

In [11]:
X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=0.2, random_state=1968)
X_train, X_valid, y_train, y_valid = train_test_split(X_tr, y_tr, test_size=0.33, random_state=1968)

In [None]:
# logreg_params = {
#     "C": np.logspace(0.1, 100, 10)
# }

# model = GridSearchCV(
#         estimator=LogisticRegression(),
#         param_grid=logreg_params,
#         scoring="roc_auc",
#         cv=KFold(n_splits=7),
#         verbose=1,
#         n_jobs=-1
#     )
# model.fit(X_train, y_train)
# print("Score = {}\nParams = {}".format(model.best_score_, model.best_params_))

In [16]:
c_options = np.logspace(0.01, 10, 11)
models = []
for c in c_options:
    model = LogisticRegression(C=c)
    model.fit(X_train, y_train)
    roc_score = roc_auc_score(model.predict(X_valid), y_valid)
    print("C={}, validation roc-auc={}".format(c, roc_score))
    models.append(model)

C=1.023292992280754, validation roc-auc=0.842055303776486
C=10.209394837076797, validation roc-auc=0.8463415308096158
C=101.85913880541169, validation roc-auc=0.8469089739315917
C=1016.2486928706949, validation roc-auc=0.846470026057543
C=10139.1138573668, validation roc-auc=0.8462386396061242
C=101157.94542598983, validation roc-auc=0.8462736434247633
C=1009252.8860766834, validation roc-auc=0.8462769325384691
C=10069316.688518044, validation roc-auc=0.8462992691349545
C=100461579.02783968, validation roc-auc=0.8461389225012287
C=1002305238.0778984, validation roc-auc=0.8462804222177985
C=10000000000.0, validation roc-auc=0.8462495092322541


Лучшее значение на валидации — `C=101.85913880541169, validation roc-auc=0.8469089739315917` (хотя они все примерно одинаковые).

In [18]:
best_model = models[2]
y_pred = best_model.predict(X_test)
roc_auc_score(y_pred, y_test)

0.8468988349512006