In [1]:
import numpy as np
import os

DATA_PATH = 'drugsCom_raw'

In [11]:
import pandas as pd

def load_csv(path = DATA_PATH):
    csv_path = os.path.join(path, 'drugsComTrain_raw.tsv')
    return pd.read_csv(csv_path,delimiter='\t',encoding='utf-8')

In [12]:
df = load_csv()

In [13]:
df.loc[:,'drugName'].value_counts()

Levonorgestrel                                   3657
Etonogestrel                                     3336
Ethinyl estradiol / norethindrone                2850
Nexplanon                                        2156
Ethinyl estradiol / norgestimate                 2117
Ethinyl estradiol / levonorgestrel               1888
Phentermine                                      1543
Sertraline                                       1360
Escitalopram                                     1292
Mirena                                           1242
Implanon                                         1102
Gabapentin                                       1047
Bupropion                                        1022
Venlafaxine                                      1016
Miconazole                                       1000
Citalopram                                        995
Medroxyprogesterone                               995
Lexapro                                           952
Bupropion / naltrexone      

In [53]:
# Isolating data and species
X, Y = df.loc[:,'condition':'usefulCount'], df.loc[:, 'drugName']

In [58]:
from imblearn.under_sampling import RandomUnderSampler

# Perform undersampling on majority class

dict = {
    'Levonorgestrel': 1121
}

rus = RandomUnderSampler(ratio=dict)
X_resampled, Y_resampled = rus.fit_sample(X, Y)

ValueError: could not convert string to float: 'Left Ventricular Dysfunction'

In [32]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

def createOneHotEncoded(arg):
    
    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(arg)
    
    # binary encode
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    
    return onehot_encoded

In [34]:
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from ._conv import register_converters as _register_converters

class Metrics(Callback):
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = np.argmax(self.model.predict(self.validation_data[0]), axis=1)
        val_targ = np.argmax(self.validation_data[1], axis=1)
        
        _val_f1 = f1_score(val_targ, val_predict, average="weighted")
        _val_recall = recall_score(val_targ, val_predict, average="weighted")
        _val_precision = precision_score(val_targ, val_predict, average="weighted")
        
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        
        return

ModuleNotFoundError: No module named '__main__._conv'; '__main__' is not a package

In [35]:
from keras.layers import Dense, Dropout
from keras.models import Sequential

# Create neural network model
def createModel():
    model = Sequential()

    model.add(Dense(units=22, activation='relu', input_dim=22))
    model.add(Dense(units=32, activation='relu'))
    
    model.add(Dense(units=10, activation='softmax')) #10 species
    
    model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
    
    return model

In [36]:
from sklearn.metrics import classification_report

N_EPOCHS = 100
BATCH_SIZE = 32

metrics = []

def trainAndTestModel(model, x_train, y_train, x_val, y_val):
    
    # Where the current iteration's extra metrics will be stored
    metrics.append(Metrics())
    
    history = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=BATCH_SIZE, epochs=N_EPOCHS, verbose=1, callbacks=[metrics[-1]])
    
    pred = model.predict(x_val, batch_size=32, verbose=0)
    y_pred = np.argmax(pred, axis=1)
    
    report = classification_report(argmax(y_val, axis=1), y_pred)
    
    mFeature = metricsByFeature(argmax(y_val, axis=1), y_pred)
    featuresMetrics.append(mFeature)
    
    print(report)
    
    return history 

In [37]:
from sklearn.metrics import classification_report
from  sklearn.metrics import precision_recall_fscore_support

def metricsByFeature(y_true, y_pred):
        metricsSummary = precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred)
        
        finalMetrics = {
            'precision': metricsSummary[0],
            'recall': metricsSummary[1],
            'f1-score': metricsSummary[2],
            'support': metricsSummary[3]
        }
        
        return finalMetrics

In [38]:
from sklearn.model_selection import StratifiedKFold

# Instantiate the cross validator
skf = StratifiedKFold(n_splits=5, shuffle=True)

historyList = []
featuresMetrics = []
metrics = []

# Actually train as test the model
for index, (train_indices, val_indices) in enumerate(skf.split(X_resampled, Y_resampled)):
    
    # Generate batches from indices
    xtrain, xval = X_resampled[train_indices], X_resampled[val_indices]
    ytrain, yval = createOneHotEncoded(Y_resampled[train_indices]), createOneHotEncoded(Y_resampled[val_indices])
    
    # Clear model, and create it
    model = None
    model = createModel()
    
    history = trainAndTestModel(model, xtrain, ytrain, xval, yval)
    
    historyList.append(history)

NameError: name 'X_resampled' is not defined