<a href="https://colab.research.google.com/github/gokulanv/Data_Shapley_NLP/blob/master/AFLite_mimic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [129]:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


### One time config

In [130]:
data, labels = load_breast_cancer(return_X_y = True)
train_size = 200
predictability_score = [0.0] * data.shape[0]
m = 10

classifiers = [
               LogisticRegression(),
               LinearSVC(),
               KNeighborsClassifier(),
               DecisionTreeClassifier(),
               RandomForestClassifier(),
               GradientBoostingClassifier()
               ]

pipe = Pipeline([
                 ('normalizer', StandardScaler()),
                 ('clf', None)
])

#Check if all data points are unique
unique_rows = np.unique(data, axis=0)
assert unique_rows.shape == data.shape

# TODO: Remove duplicate points, if any


# check if indexed properly
test_indices = np.random.randint(0,data.shape[0], 10)
for idx in test_indices:
    assert data2idx[tuple(data[idx])] == idx

data2idx = dict({tuple(v):k for k,v in enumerate(data)})
idx2count_sampled_points = dict({k: 0 for k in range(data.shape[0] + 1)})
idx2count_classified_correctly = dict({k: 0 for k in range(data.shape[0] + 1)})

assert sum(idx2count_sampled_points[k] for k in idx2count_sampled_points.keys()) == 0
assert sum(idx2count_classified_correctly[k] for k in idx2count_sampled_points.keys()) == 0
assert len(idx2count_classified_correctly) == len(idx2count_sampled_points)


### Utils

In [131]:
def get_random_train_test_split(train_size: int) -> tuple:
    return train_test_split(data, labels, train_size = train_size)

def update_test_datapoints_count(Xtest: np.ndarray, idx2count: dict, indices: list = None) -> None:
    if indices:
        # update only given indices
        Xtest = Xtest[indices]
    [idx2count.__setitem__(data2idx[tuple(dp)], idx2count[data2idx[tuple(dp)]] + 1)  for dp in Xtest]

update_test_datapoints_count(Xtest, idx2count_sampled_points)

### Models pipeline

In [132]:

# AFLite random sampling
for i in range(m):
    Xtrain, Xtest, Ytrain, Ytest = get_random_train_test_split(train_size)

    for clf in classifiers:
        pipe.set_params(clf = clf)
        # scores = cross_validate(pipe, data, labels)
        grid_search = GridSearchCV(pipe, {}, n_jobs=4)
        grid_search.fit(Xtrain, Ytrain)

        correct_pred_indices = [i for i in range(len(Xtest)) if grid_search.predict(np.expand_dims(Xtest[i], axis=0)) == Ytest[i]]
        update_test_datapoints_count(Xtest, idx2count_classified_correctly, indices = correct_pred_indices)

    for i in range(len(predictability_score)):
        predictability_score[i] += idx2count_classified_correctly[i]/(idx2count_sampled_points[i] + 1e-9)


In [137]:
print('#data_points < 100 test score threshold = ', str(np.sum(np.asarray(predictability_score) < 100)) )

#data_points < 100 test score threshold =  19


### Logistic Regression

In [16]:
clf = LogisticRegression(random_state=0, n_jobs=4, max_iter=100)
clf.fit(Xtrain, Ytrain)
clf.score(Xtrain, Ytrain), clf.score(Xtest, Ytest)

(0.97, 0.8780487804878049)