In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold

from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.problem_transform import ClassifierChain
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from scipy import sparse

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import zero_one_loss
from sklearn.metrics import log_loss
from sklearn.metrics import multilabel_confusion_matrix

In [None]:
# Read the three competition CSV files into DataFrames, in the order they are unpacked.
train_features, train_targets_scored, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
        '../input/lish-moa/test_features.csv',
    )
)

In [None]:
# Replace "-" with "_" in the attribute (column) names of the feature table.
# DataFrame.replace("-", "_") only substitutes cells whose entire value equals "-",
# so it never changes column labels; rename() is what rewrites the labels.
train_fea_processed = train_features.rename(
    columns=lambda column_name: str(column_name).replace("-", "_")
)
print(train_fea_processed.shape)
train_fea_processed.head()

In [None]:
# Map the two categorical columns onto 0/1 codes; values not listed in a mapping
# are left unchanged by replace().
train_fea_processed = train_fea_processed.assign(
    cp_type=train_fea_processed['cp_type'].replace({'trt_cp':1, 'ctl_vehicle':0}),
    cp_dose=train_fea_processed['cp_dose'].replace({'D1':1, 'D2':0}),
)
train_fea_processed

In [None]:
# One-hot encode cp_time: get_dummies swaps the column for one indicator column
# per distinct value and appends those indicator columns at the end of the frame.
train_fea_processed = train_fea_processed.pipe(pd.get_dummies, columns=["cp_time"])
train_fea_processed

In [None]:
# Move the one-hot cp_time_* columns so they sit right after the first two columns.
# Columns are selected by name rather than by hard-coded positions, so no column is
# silently dropped if the frame's width changes, and re-running the cell yields the
# same ordering instead of reshuffling the columns again.
all_columns = list(train_fea_processed.columns)
cp_time_columns = [col for col in all_columns if str(col).startswith("cp_time_")]
other_columns = [col for col in all_columns if col not in cp_time_columns]
train_fea_processed = train_fea_processed[
    other_columns[:2] + cp_time_columns + other_columns[2:]
]
train_fea_processed

In [None]:
# Replace "-" with "_" in the column names of the label dataset.
# DataFrame.replace("-", "_") only substitutes cells whose entire value equals "-",
# so it never changes column labels; rename() is what rewrites the labels.
train_targets_scored_processed = train_targets_scored.rename(
    columns=lambda column_name: str(column_name).replace("-", "_")
)
print(train_targets_scored_processed.shape)
train_targets_scored_processed.head()

In [None]:
# Add a per-sample total over the label columns; it is used afterwards to filter out
# samples whose label profile is all zeros.
# - numeric_only=True keeps the string sig_id column out of the sum (pandas >= 2.0
#   defaults numeric_only to False, which can raise on mixed str/int rows).
# - Any existing RowTotal column is excluded first, so re-running the cell does not
#   add the previous total into the new one.
train_targets_scored_processed["RowTotal"] = (
    train_targets_scored_processed
    .drop(columns=["RowTotal"], errors="ignore")
    .sum(axis=1, numeric_only=True)
)
train_targets_scored_processed

In [None]:
# Keep only the rows whose RowTotal is strictly greater than zero.
has_nonzero_total = train_targets_scored_processed['RowTotal'] > 0
train_targets_scored_processed_short = train_targets_scored_processed.loc[has_nonzero_total]
train_targets_scored_processed_short

In [None]:
# Join features with the filtered labels on sig_id (merge defaults to an inner join,
# so feature rows without a matching label row are not kept).
data = pd.merge(
    train_fea_processed,
    train_targets_scored_processed_short,
    on='sig_id',
)
data

In [None]:
# Get the Data
# Columns are picked by name instead of fixed positions (1:878 / 878:1084), so the
# feature/label split stays correct if the number of columns in either table changes.
feature_columns = [col for col in train_fea_processed.columns if col != 'sig_id']
label_columns = [
    col for col in train_targets_scored_processed_short.columns
    if col not in ('sig_id', 'RowTotal')
]

# Cast features explicitly to float: a frame mixing bool/int/float columns would
# otherwise be converted to an object-dtype array.
X = data[feature_columns].to_numpy(dtype=float)
y = data[label_columns].to_numpy()

# evaluate a model using repeated k-fold cross-validation

# Per-fold metric values, one entry appended per train/test split.
Accuracy_results = list()
Hamming_Loss_results = list()
Log_Loss_results = list()
Zero_One_Loss_results = list()

# define evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# enumerate folds
for train_ix, test_ix in cv.split(X):
    # prepare data
    X_train, X_test = X[train_ix], X[test_ix]
    y_train, y_test = y[train_ix], y[test_ix]

    # initialize ClassifierChain multi-label classifier with a RandomForest.
    # The forest is seeded so that repeated runs give the same scores, matching the
    # seeded cross-validation splitter above.
    classifier = ClassifierChain(
        classifier=RandomForestClassifier(n_estimators=100, random_state=1),
        require_dense=[False, True]
    )

    # train
    classifier.fit(X_train, y_train)

    # predict; the sparse result is converted to a dense array once and reused by
    # every metric below
    prediction = classifier.predict(X_test)
    prediction_dense = prediction.toarray()

    # predict probabilities
    pred_prob = classifier.predict_proba(X_test)

    # Accuracy
    acc = accuracy_score(y_test, prediction_dense)

    # Hamming Loss
    hamloss = hamming_loss(y_test, prediction_dense)

    # Log Loss
    # NOTE(review): log_loss receives a 2-D multilabel indicator matrix here; confirm
    # that sklearn's handling of this input matches the intended per-label metric.
    logloss = log_loss(y_test, pred_prob.toarray())

    # Zero_One Loss
    zer_one = zero_one_loss(y_test, prediction_dense)

    print ('=====================================')
    print("Accuracy :", '>%.3f' % acc)
    Accuracy_results.append(acc)
    print("Hamming_Loss:",'>%.3f' % hamloss)
    Hamming_Loss_results.append(hamloss)
    print("Log_Loss:",'>%.3f' % logloss)
    Log_Loss_results.append(logloss)
    print("Zero_One_Loss:",'>%.3f' % zer_one)
    Zero_One_Loss_results.append(zer_one)


In [None]:
# summarize performance: mean and standard deviation over ALL cross-validation folds.
# The per-fold lists are used here; acc / hamloss / logloss / zer_one only hold the
# last fold's scalar value, which would report a single fold with a std of 0.
print('Accuracy: %.3f (%.3f)' % (mean(Accuracy_results), std(Accuracy_results)))
print('Hamming_Loss: %.3f (%.3f)' % (mean(Hamming_Loss_results), std(Hamming_Loss_results)))
print('Log_Loss: %.3f (%.3f)' % (mean(Log_Loss_results), std(Log_Loss_results)))
print('Zero_One_Loss: %.3f (%.3f)' % (mean(Zero_One_Loss_results), std(Zero_One_Loss_results)))
