**Import required packages**

In [4]:
import numpy as np 
import pandas as pd
import random

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import TfidfVectorizer

**Loading dataset**

In [6]:
data = pd.read_table('../input/labeledEligibilitySample1000000.csv', header=None)

In [7]:
data.head()

Unnamed: 0,0,1
0,__label__0,study interventions are recombinant CD40-ligan...
1,__label__0,study interventions are Liposomal doxorubicin ...
2,__label__0,study interventions are BI 836909 . multiple m...
3,__label__0,study interventions are Immunoglobulins . recu...
4,__label__0,study interventions are Paclitaxel . stage ova...


In [8]:
df = pd.DataFrame(np.array(data).reshape(1000000,2), columns=['label', 'describe'])

In [9]:
df = df.replace('__label__0', 0)
df = df.replace('__label__1', 1)

In [10]:
df['describe'] = df['describe'].str.replace('study interventions are ', '', regex=True)

In [11]:
df.sample(5)

Unnamed: 0,label,describe
300221,0,Decitabine . recurrent neuroblastoma diagnosis...
973001,1,Dexamethasone 21-phosphate . sarcoma diagnosis...
538774,1,Pegaspargase . nasal and nasal type nk cell ly...
125530,0,Aldesleukin . malignant melanoma diagnosis and...
465768,0,Doxorubicin . stage iia hodgkin lymphoma diagn...


In [12]:
y = df['label']
X = df[['describe']]
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42) #test for final validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

Transforming describe feature to tf-idf matrix:

In [13]:
vectorizer = TfidfVectorizer(max_features=5000, min_df=20, max_df=0.7)

X = vectorizer.fit_transform(X['describe']) #train+valid
X_train = vectorizer.transform(X_train['describe'])
X_valid = vectorizer.transform(X_valid['describe'])
X_test = vectorizer.transform(X_test['describe'])

**Random**

In [14]:
y_random = [random.randint(0,1) for i in range(len(y_test))]

In [15]:
roc_auc_score(y_test, y_random)

0.5015148050256002

**Logistic regression**

In [16]:
C_param_range = [0.1,1,10,100,1000]

In [17]:
for i in C_param_range:
    
    logreg = LogisticRegression(solver='liblinear', penalty='l2', C=i, random_state=42)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_valid)
    
    print(i, roc_auc_score(y_valid, y_pred))

0.1 0.8581984462578593
1 0.8678419816753116
10 0.8691044564907783
100 0.8693734429460414
1000 0.8693296284722072


**Decision Tree**

In [18]:
dt = DecisionTreeClassifier(max_depth=40)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_valid)

In [19]:
roc_auc_score(y_valid, y_pred)

0.8297466238255526

**Random Forest**

In [20]:
rf = RandomForestClassifier(n_estimators=10, random_state=42, oob_score=True)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


In [21]:
y_pred = rf.predict(X_valid)
roc_auc_score(y_valid, y_pred)

0.9307557508650229

In [22]:
logreg = LogisticRegression(solver='liblinear', penalty='l2', C=100, random_state=42)
logreg.fit(X_train, y_train)

y_pred1 = logreg.predict(X_test)
y_pred2 = dt.predict(X_test)
y_pred3 = rf.predict(X_test)

Final results (ROC AUC):

In [25]:
print('Random:' + str(roc_auc_score(y_test, y_random)))
print('Logistic regression:' + str(roc_auc_score(y_test, y_pred1)))
print('Decision Tree:' + str(roc_auc_score(y_test, y_pred2)))
print('Random Forest:' + str(roc_auc_score(y_test, y_pred3)))

Random:0.5015148050256002
Logistic regression:0.8690749867373673
Decision Tree:0.8282814023479557
Random Forest:0.9303412273227667
