In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import keras
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from IPython.display import display
import os
import time
# Root folder of the competition dataset; the CSV files below are read relative to it.
PATH = "../input/a1e1f6c6-8-dataset (1)"
# Show which files the dataset folder contains.
print(os.listdir(f"{PATH}/"))
# Any results you write to the current directory are saved as output.

The “train.csv” file contains historical patient information from Jan 2011 to Dec 2013.
The “test.csv” file contains a list of Patient IDs for which we aim to predict the next 10 events in the year 2014. Event codes should be treated as categorical, not continuous.

In [None]:
train_data = pd.read_csv(f'{PATH}/train.csv')
train_data.head()

In [None]:
train_data.isnull().sum()

In [None]:
train_data.info()

In [None]:
train_data.describe(include='all')

In [None]:
df = train_data.Date.astype('str').str.extract(r'(?P<Year>^\d{4})(?P<Month>\d{2}$)')

In [None]:
train_data['Year'] = df['Year']
train_data['Month'] = df['Month']
train_data.head()

In [None]:
del df
# Drop the date column
train_data = train_data.drop(['Date'], axis=1)

In [None]:
#Number of events each year
train_data.groupby(['Year']).count()['Event_Code'].plot(kind='bar')

In [None]:
# Total number of events associated with each user (UID), largest first
train_data.groupby('UID').count()['Event_Code'].sort_values(ascending=False)

In [None]:
# Event codes ranked by how often they occur, and the cumulative share of all
# rows covered by the N most frequent codes.
labels = train_data.groupby('Event_Code').count()['UID'].sort_values(ascending=False)
label = labels / len(train_data)  # per-code share of the dataset
x = np.arange(len(label))
y = labels.cumsum()
# Normalise by the grand total. `.iloc[-1]` is explicitly positional; a plain
# `y[-1]` is resolved against the Event_Code index (a label lookup when the
# codes are integers), so it is not a reliable way to get the last element.
y = y / y.iloc[-1]
fig, ax = plt.subplots(figsize=(8, 4))
ax.set(xlabel='Number of most frequent event codes',
       ylabel='Cumulative share of events')
ax.plot(x, y)

There are a total of 6472 unique labels in the dataset. <br>
The top 227 codes make up 50% of the total labels in the set.<br>
770 labels have only one example in the dataset.<br>
1740 labels have fewer than 5 examples in the dataset.<br>

In [None]:
# Number of events that occurred in each calendar month, aggregated over all 3 years
train_data.groupby('Month').count()['Event_Code'].plot(kind='bar')

## New dataset

We will use a second, filtered dataset that keeps only the <b>top 250 ICD-9 labels</b>.<br>
The number of patients (unique UIDs) remains the same after removing all labels outside the top 250 ICD-9 codes. <br>
The dataset shrinks from 766787 to 397076 rows, a reduction of almost 48%. This is expected, because the top 250 labels account for roughly 50% of the dataset.<br>

In [None]:
df = labels.reset_index()
valid_labels = df.Event_Code[:250]
print (valid_labels[:5])

In [None]:
# Boolean mask: True for rows whose event code is one of the retained labels.
keep_row = train_data['Event_Code'].isin(valid_labels)
new_data = train_data[keep_row]
# Report the row count before and after the filter, one value per line.
print('Before deleting', len(train_data), 'After deleting', len(new_data), sep='\n')
new_data.head()

In [None]:
# New number of unique patient UID with their count of events
df = new_data.groupby('UID').count()['Event_Code'].sort_values(ascending=False)
df

In [None]:
#Number of events each year
new_data.groupby(['Year']).count()['Event_Code'].plot(kind='bar')

In [None]:
# Same ranking as before, recomputed on the filtered dataset: event codes by
# frequency and the cumulative share covered by the N most frequent codes.
labels = new_data.groupby('Event_Code').count()['UID'].sort_values(ascending=False)
label = labels / len(new_data)  # per-code share of the filtered dataset
x = np.arange(len(label))
y = labels.cumsum()
# Normalise by the grand total. `.iloc[-1]` is explicitly positional; a plain
# `y[-1]` is resolved against the Event_Code index (a label lookup when the
# codes are integers), so it is not a reliable way to get the last element.
y = y / y.iloc[-1]
fig, ax = plt.subplots(figsize=(8, 4))
ax.set(xlabel='Number of most frequent event codes',
       ylabel='Cumulative share of events')
ax.plot(x, y)
# Shares of the ten most frequent codes (positional, independent of index type).
print (label.head(10))

The top 46 ICD-9 codes make up 50% of the new dataset.

In [None]:
# `df` arrives as the per-patient event counts (indexed by UID); after
# reset_index it has a `UID` column and an `Event_Code` column holding the count.
df = df.reset_index()

# Patients with 10 or fewer events cannot give up 10 validation events and
# still keep training rows, so they are set aside.
too_few = df['Event_Code'] <= 10
not_valid_df = df.loc[too_few]
not_valid_uid = not_valid_df['UID']

df = df.loc[df['Event_Code'] > 10]
valid_uid = df['UID']
print(len(valid_uid), len(not_valid_uid))

# Keep only the events that belong to the retained patients.
new_uid = new_data['UID'].isin(valid_uid)
rows_before = len(new_data)
new_data = new_data.loc[new_uid]
print('Before deleting', rows_before, 'After deleting', len(new_data), sep='\n')
new_data.head()

# Preprocessing

1. Convert UIDs into unique integer encodings.
2. Convert categorical labels into unique integer encodings.
3. Convert labels into one-hot encodings of 250 classes.

### Train Validation Split
 
 We will use the last 10 events of every patient as the validation set.<br>
 We will lose 19 patients because they have too few events (the code keeps only patients with more than 10 events).<br>
 The dataset shrinks from 397076 to 396937 rows, i.e. by 139 entries. <br>
 The validation set comes to around 30000 entries, which is almost 10% of the total dataset.<br>

In [None]:
# Create train validation set based by selecting last 10 events of each patient as validation set.
display(new_data.head())
sorted_data = new_data.sort_values(by=['UID', 'Year', 'Month'])
display(sorted_data.head())

In [None]:
df = df.sort_values(['UID'])
df.head()

In [None]:
start = time.time()

# Hold out the last 10 events of every patient for validation.
# `sorted_data` is ordered by UID, Year, Month, so counting each patient's rows
# from the end (0 = most recent) marks the 10 latest events directly. This
# replaces a per-patient pd.concat loop, which re-copied the growing frames on
# every iteration and relied on the counts in `df` lining up positionally
# with `sorted_data`.
events_from_end = sorted_data.groupby('UID').cumcount(ascending=False)
is_val = (events_from_end < 10).to_numpy()

# Positional boolean masks keep the original row order inside both subsets;
# the index is reset so both frames are numbered 0..n-1.
train_set = sorted_data[~is_val].reset_index(drop=True)
val_set = sorted_data[is_val].reset_index(drop=True)

# Every row must land in exactly one of the two subsets.
assert len(train_set) + len(val_set) == len(sorted_data)

print ('Splitting done in', (time.time()-start), 'sec')

In [None]:
display(train_set.head())
display(val_set.head())
print(train_set.shape, val_set.shape)

In [None]:
np.save('train_250.npy', train_set)
np.save('valid_250.npy', val_set)

In [None]:
combine_data = pd.concat([train_set, val_set], ignore_index=True)
combine_data = combine_data.drop(['Gender', 'Age', 'Month', 'Year'], axis=1)
display(combine_data.head())

In [None]:
# Label Encoding
# `lb` encodes patient UIDs as integers; it is reused later to transform the
# test-set UIDs, so it must stay fitted on the combined train+validation data.
lb = LabelEncoder()
# Raw (un-encoded) UIDs; only referenced by a commented-out sanity check later on.
x = combine_data.UID.unique()
combine_data.UID = lb.fit_transform(combine_data.UID)
# `le` encodes event codes as integers; it is used later with
# `inverse_transform` to turn predicted class ids back into event codes.
le = LabelEncoder()
combine_data.Event_Code = le.fit_transform(combine_data.Event_Code)
display(combine_data.head())
# Row/column count, then the number of distinct patients and distinct event codes.
print (combine_data.shape)
print (len(combine_data.UID.unique()))
print (len(combine_data.Event_Code.unique()))

In [None]:
combine_data.groupby('Event_Code').count()['UID'].sort_values(ascending=False)

In [None]:
# One-hot encode the integer event-code labels.
# The width comes from the fitted encoder rather than a hard-coded 1000, so the
# matrix has exactly one column per event code present in the data (the
# filtered dataset keeps at most the top 250 codes) and is not padded with
# hundreds of all-zero columns.
c = combine_data['Event_Code']
d = to_categorical(c, num_classes=len(le.classes_))
print (d.shape)

In [None]:
# Train/validation split.
# `combine_data` was built as concat([train_set, val_set]), so the first
# len(train_set) rows are the training rows. Deriving the boundary from the
# frame avoids a hard-coded row count that silently goes stale when the
# filtering upstream changes.
n_train = len(train_set)
train = combine_data[:n_train]
val = combine_data[n_train:]
labels = d

In [None]:
import gc
del  d, train_data, train_set, val_set, df, sorted_data
# del combine_data
gc.collect()

In [None]:
# Feature/target arrays for the models.
# The feature is the encoded UID; `*_score_y` are integer class ids and `*_y`
# are the matching one-hot rows. The one-hot matrix is row-aligned with
# `combine_data`, so it is cut at the same boundary as `train` / `val`.
n_train = len(train)
train_x = train['UID'].values
train_score_y = train['Event_Code'].values
train_y = labels[:n_train]
val_x = val['UID'].values
val_score_y = val['Event_Code'].values
val_y = labels[n_train:]
print ('Training Shape:', train_x.shape, train_y.shape)
print ('Validation Shape:', val_x.shape, val_y.shape)

In [None]:
del train, val, labels, not_valid_df
gc.collect()

# Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
# from sklearn.metrics import ndcg_score 
from sklearn.metrics import accuracy_score, make_scorer

In [None]:
"""Metrics to compute the model performance."""

def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank ``k`` for one sample.

    Parameters
    ----------
    y_true : array-like, 1-D
        Relevance of every candidate (for example a one-hot row).
    y_score : array-like, 1-D
        Predicted score of every candidate, same length as ``y_true``.
    k : int
        Number of top-scored candidates that contribute to the sum.

    Returns
    -------
    score : float
        Sum of ``(2 ** relevance - 1) / log2(position + 2)`` over the ``k``
        highest-scored candidates, with positions starting at 0.
    """
    # Candidate indices from highest to lowest score, cut to the top k.
    ranking = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, ranking)

    positions = np.arange(len(relevance))
    return np.sum((2 ** relevance - 1) / np.log2(positions + 2))


def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank ``k``.

    NDCG measures how well the true class is ranked among the predicted
    scores. It ranges from 0.0 to 1.0, with 1.0 meaning the true class is
    ranked first for every sample.

    Each true label is expanded to a one-hot vector over the classes, i.e.
    over the columns of ``predictions``. The one-hot width is taken from the
    number of classes, not from the number of samples, so the function works
    for any sample count and does not build a samples-by-samples matrix.

    Parameters
    ----------
    ground_truth : array-like, shape = [n_samples]
        True labels as integer column indices into ``predictions``.
    predictions : array-like, shape = [n_samples, n_classes]
        Predicted scores or probabilities, one column per class.
    k : int
        Rank cut-off.

    Returns
    -------
    score : float
        Mean per-sample NDCG. A sample whose label is not a valid column
        index contributes 0.

    Raises
    ------
    ValueError
        If ``predictions`` is not two-dimensional.

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    predictions = np.asarray(predictions)
    if predictions.ndim != 2:
        raise ValueError(
            "predictions must be 2-D (n_samples, n_classes); got shape %r"
            % (predictions.shape,))
    n_classes = predictions.shape[1]

    scores = []

    # Score every sample against its own one-hot relevance vector.
    for label, y_score in zip(ground_truth, predictions):
        y_true = np.zeros(n_classes, dtype=int)
        label = int(label)
        if 0 <= label < n_classes:
            y_true[label] = 1

        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        # best is 0 only when the label has no column; score that sample as 0
        # instead of dividing by zero.
        scores.append(float(actual) / float(best) if best > 0 else 0.0)

    return np.mean(scores)


# NDCG Scorer function
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=10)

In [None]:
classifiers = [
#     LogisticRegression(C=0.000000001, max_iter=400),
    KNeighborsClassifier(10),
    GaussianNB(),
#     SVC(kernel="rbf", C=0.025, probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=400),
    AdaBoostClassifier()]

In [None]:
# Accuracy=[]
# Model=[]
# for clf in classifiers:
#     start = time.time()
#     clf.fit(train_x.reshape(-1, 1), train_score_y)
#     pred = clf.predict_proba(val_x.reshape(-1, 1))
#     ncdg_sc = ndcg_score(val_score_y, pred, k=10)
# #     acc = accuracy_score(val_y.reshape(-1, 1), pred)
#     Accuracy.append(ncdg_sc)
# #     Accuracy.append(acc)
#     Model.append(clf.__class__.__name__)
#     print('NCDG score of '+ clf.__class__.__name__+' is '+ str(ncdg_sc))
#     print ('Time taken for training', time.time()-start, 'sec')
# #     print('Accuracy score of '+ clf.__class__.__name__+' is '+ str(acc))  

In [None]:
# Index = [1,2,3,4,5,6]
# plt.bar(Index,Accuracy)
# plt.xticks(Index, Model, rotation=45)
# plt.ylabel('Accuracy')
# plt.xlabel('Model')
# plt.title('NCDG scores of Models')

# Neural Network Approach

In [None]:
import keras
from keras.models import Sequential, Model
from keras.layers import Dense
from keras.callbacks import EarlyStopping

In [None]:
# train_X = np.array(train_x.reshape(-1, 1))
# train_Y = np.array(train_y)
# val_X = np.array(val_x.reshape(-1, 1))
# val_Y = np.array(val_y)
# print ('Training Shape:', train_X.shape, train_Y.shape)
# print ('Validation Shape:', val_X.shape, val_Y.shape)

In [None]:
# # Fully Connected Dense Network

# model = Sequential()
# model.add(Dense(1, activation='relu', input_dim=train_x.reshape(-1, 1).shape[1]))
# model.add(Dense(512, activation='relu'))
# model.add(Dense(250, activation='sigmoid'))

# model.compile(loss='categorical_crossentropy', optimizer='SGD')
# es = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

In [None]:
# history = model.fit(train_x.reshape(-1, 1), train_y,
#                    epochs=10,
#                    batch_size=256,
#                    validation_data=(val_x.reshape(-1, 1), val_y),
#                    callbacks=[es])
# pred = model.predict_proba(val_x.reshape(-1, 1))
# ncdg_sc = ndcg_score(val_score_y, pred, k=10)
# print ('Neural Network NCDG Score:', ncdg_sc)

In [None]:
# test_data = np.array(test_data).reshape(-1, 1)
# test_pred = model.predict(test_data)
# print (test_pred.shape)

In [None]:
test_data = pd.read_csv(f'{PATH}/test.csv')
display(test_data.head())
print (test_data.shape)

In [None]:
# Drop the test patients that were excluded from training (too few events).
# A single `isin` mask does this in one pass instead of re-filtering the whole
# frame once per excluded UID.
test_data = test_data[~test_data.UID.isin(not_valid_uid)]
print (test_data.UID.shape)

In [None]:
sub = pd.DataFrame()
sub['UID'] = test_data['UID']
display(sub.head())
print (sub.shape)

In [None]:
# y = test_data.UID.unique()
# print (len(test_data.UID.unique()), len(combine_data.UID.unique()))
# count = 0
# for i in range(len(test_data.UID.unique())):
#     if (x[i] == y[i]):
#         count += 1
#     else:
#         print (x[i], y[i])
#         print (i)
# print (count)

In [None]:
# lb.classes_

In [None]:
for c in test_data.columns:
    print (c)
    test_data[c] = lb.transform(test_data[c])
print (test_data.UID.shape)

In [None]:
clf = DecisionTreeClassifier()
clf.fit(train_x.reshape(-1, 1), train_score_y)
pred = clf.predict_proba(val_x.reshape(-1, 1))
ncdg_sc = ndcg_score(val_score_y, pred, k=10)
print (ncdg_sc)

In [None]:
test_data = np.array(test_data).reshape(-1, 1)
test_pred = clf.predict_proba(test_data)
print (test_pred.shape)

In [None]:
test_pred_df = pd.DataFrame(test_pred)
display(test_pred_df.head())

In [None]:
test_pred_df = pd.DataFrame(test_pred)
display(test_pred_df.head())

In [None]:
# For every test patient, take the 10 classes with the highest predicted
# probability and translate them back into the original event codes.
TOP_K = 10
test_event_codes = []
for _, probs in test_pred_df.iterrows():
    # Column positions of the TOP_K largest probabilities for this patient.
    top_cols = probs.sort_values(ascending=False).index[:TOP_K].to_numpy()
    # predict_proba columns follow clf.classes_, so a column position has to be
    # mapped to the encoded label it stands for before decoding. The two only
    # coincide when every encoded label occurred in the training rows.
    encoded = clf.classes_[top_cols]
    test_event_codes.append(le.inverse_transform(encoded))

In [None]:
sub1 = pd.DataFrame(np.array(test_event_codes), columns=['Event1', 'Event2', 'Event3', 'Event4', 'Event5', 'Event6', 'Event7', 'Event8', 'Event9', 'Event10'])
display(sub1.head())
print (sub1.shape)

In [None]:
event_codes = []
uids = []
for u in not_valid_uid:
    uids.append(u)
    event_codes.append(np.array([0] * 10))
uid = pd.DataFrame(np.array(uids), columns=['UID'])
display(uid.head())
event_code = pd.DataFrame(np.array(event_codes), columns=['Event1', 'Event2', 'Event3', 'Event4', 'Event5', 'Event6', 'Event7', 'Event8', 'Event9', 'Event10'])
display(event_code.head())
print (uid.shape, event_code.shape)

In [None]:
# Stack the predicted patients and the excluded patients (all-zero events).
# Both frames are renumbered 0..n-1 with ignore_index=True: `sub` still carried
# the filtered test index (with gaps), and appending `uid` added duplicate
# labels on top of that, so the later index-based join with `sub1` would pair
# UIDs with the wrong prediction rows. pd.concat also replaces
# DataFrame.append, which no longer exists in pandas 2.x.
# NOTE: this cell is not idempotent; re-running it stacks the rows again.
sub = pd.concat([sub, uid], ignore_index=True)
display(sub.tail())

sub1 = pd.concat([sub1, event_code], ignore_index=True)
display(sub1.tail())

print (sub.shape, sub1.shape)

In [None]:
submit = sub.join(sub1)
display(submit.head())
print (submit.shape)

In [None]:
submit = submit.sort_values(by='UID')
display(submit.head())
print (submit.shape)

In [None]:
submit.to_csv('submit_dt_top250_events.csv', index=False)