In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from torch.utils.tensorboard import SummaryWriter
from scipy.stats import entropy

# Configure seaborn in a single call. `sns.set(rc=...)` re-applies the default
# "darkgrid" style, so a `set_style("white")` issued before it is silently discarded.
sns.set_theme(style="white", rc={'figure.figsize': (10, 5)})
import random


KeyboardInterrupt



### `scatter_it` Function

In [None]:
def scatter_it(dataframe,x_axis="",y_axis="", hue_metric ='', title='', model='', show_hist=False):
    """Draw a scatter plot of two dataframe columns, optionally with histograms underneath.

    Args:
        dataframe: pandas DataFrame holding the columns to plot; at most 25000
            randomly sampled rows are drawn.
        x_axis: column name plotted on the horizontal axis.
        y_axis: column name plotted on the vertical axis.
        hue_metric: column name used to colour the points.
        title: text inserted into the figure title (only used when show_hist is True).
        model: model name inserted into the figure title (only used when show_hist is True).
        show_hist: when True, a second row with histograms of y_axis, x_axis and
            hue_metric is added below the scatter plot.
    """
    # Subsample so the plot does not get too busy.
    sampled = dataframe.sample(n=min(25000, len(dataframe)))

    if show_hist:
        # Tall top row for the scatter plot, short bottom row for three histograms.
        fig = plt.figure(figsize=(16, 10), )
        gs = fig.add_gridspec(2, 3, height_ratios=[5, 1])
        scatter_ax = fig.add_subplot(gs[0, :])
    else:
        fig, scatter_ax = plt.subplots(1, 1, figsize=(8, 4))

    plot = sns.scatterplot(data=sampled,
                           x=x_axis,
                           y=y_axis,
                           hue=hue_metric,
                           palette="RdYlGn",
                           s=30,
                           ax=scatter_ax)
    plot.set_xlabel(x_axis)
    plot.set_ylabel(y_axis)

    if not show_hist:
        # Stand-alone scatter: park the legend outside the axes, to the right.
        plot.legend(ncol=1, bbox_to_anchor=(1.01, 0.5), loc='center left', fancybox=True, shadow=True)
        return

    plot.legend(fancybox=True, shadow=True,  ncol=1)
    plot.set_title(f"{model}-{title} Data Map", fontsize=17)

    # Bottom row: one histogram per plotted quantity.
    y_hist_ax = fig.add_subplot(gs[1, 0])
    x_hist_ax = fig.add_subplot(gs[1, 1])
    hue_hist_ax = fig.add_subplot(gs[1, 2])

    y_hist = sampled.hist(column=[y_axis], ax=y_hist_ax, color='#622a87')[0]
    y_hist.set_title('')
    y_hist.set_xlabel(y_axis)
    y_hist.set_ylabel('density')

    x_hist = sampled.hist(column=[x_axis], ax=x_hist_ax, color='teal')[0]
    x_hist.set_title('')
    x_hist.set_xlabel(x_axis)

    hue_hist = sns.histplot(x=hue_metric, data=sampled, color='#86bf91', ax=hue_hist_ax, bins=10)
    hue_hist_ax.xaxis.grid(True)  # show the vertical gridlines
    hue_hist.set_title('')
    hue_hist.set_xlabel(hue_metric)
    hue_hist.set_ylabel('')

### Preparing Data

In [None]:
# Both FashionMNIST splits share the same storage folder, download flag and
# tensor conversion; only the `train` switch differs.
_fashion_mnist_kwargs = dict(root="data", download=True, transform=ToTensor())

# Training split.
training_data = datasets.FashionMNIST(train=True, **_fashion_mnist_kwargs)

# Test split.
test_data = datasets.FashionMNIST(train=False, **_fashion_mnist_kwargs)

In [None]:
batch_size = 64

# Wrap both splits in loaders. No shuffling is requested, so batches follow dataset order.
train_dataloader = DataLoader(dataset=training_data, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size)

# Look at the first test batch only, to confirm tensor shapes and the label dtype.
for X, y in test_dataloader:
    x_summary = f"Shape of X [N, C, H, W]: {X.shape}"
    y_summary = f"Shape of y: {y.shape} {y.dtype}"
    print(x_summary)
    print(y_summary)
    break

In [None]:
# Human-readable class names; the position in the list is the integer label
# (0 = "T-shirt/top", ..., 9 = "Ankle boot").
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

### AUM (Area Under the Margin)

In [None]:
class AreaUnderTheMarginRanking():
    """Records, for every sample and epoch, the target logit and the largest non-target logit.

    Attributes:
        hist_delta_AUM_current_epoch: tensor of shape [n_sample, 2] for the epoch in
            progress; column 0 is the logit of the target class, column 1 is the
            maximum logit among all other classes.
        hist_delta_AUM: tensor of shape [n_epoch, n_sample, 2] holding the finished epochs.
        reference_sample_idx: list initialised empty; not used by the methods of this class.
    """

    def __init__(self):
        # hist_delta_AUM_current_epoch dimensions: [n_sample, 2 (from in_logit & max(out_logits))]
        self.hist_delta_AUM_current_epoch = torch.zeros(size=(0, 2))
        # hist_delta_AUM dimensions: [n_epoch, n_sample, in_logit & max(out_logits)]
        self.hist_delta_AUM = torch.zeros(size=(0, 0, 2))
        self.reference_sample_idx = []

    def accumulate(self, batch_logits, batch_ids, batch_targets):
        """Append one batch of (target logit, max non-target logit) rows. Call after each batch prediction.

        Args:
            batch_logits: tensor of shape [batch, n_classes] with the model outputs.
            batch_ids: accepted for interface compatibility; it is not used, rows are
                stored in call order.
            batch_targets: integer tensor with one class index per sample of the batch.
        """
        # Only values are recorded, so cut the autograd graph before storing anything.
        logits = batch_logits.detach()
        targets = batch_targets.reshape(-1, 1).to(device=logits.device, dtype=torch.long)

        # Logit assigned to the labelled class, shape [batch, 1].
        target_logit = logits.gather(1, targets)

        # Mask the target column with -inf so the row-wise max only sees the other classes.
        target_mask = torch.zeros_like(logits, dtype=torch.bool).scatter_(1, targets, True)
        max_other_logit = logits.masked_fill(target_mask, float("-inf")).max(dim=1, keepdim=True).values

        # One concatenation per batch (instead of one per sample) on the storage's device/dtype.
        batch_rows = torch.cat([target_logit, max_other_logit], dim=1).to(self.hist_delta_AUM_current_epoch)
        self.hist_delta_AUM_current_epoch = torch.cat(
            [self.hist_delta_AUM_current_epoch, batch_rows], dim=0)

    def accumulate_epoch(self):
        """Push the rows collected during the epoch onto the history. Call at the end of each epoch."""
        finished_epoch = self.hist_delta_AUM_current_epoch.unsqueeze(dim=0)
        if len(self.hist_delta_AUM) == 0:
            # First epoch: the placeholder history has no sample dimension to concatenate with.
            self.hist_delta_AUM = finished_epoch
        else:
            self.hist_delta_AUM = torch.cat([self.hist_delta_AUM, finished_epoch], dim=0)
        # Start the next epoch with an empty buffer.
        self.hist_delta_AUM_current_epoch = torch.zeros(size=(0, 2))

In [None]:
# Module-level AUM tracker: `train` feeds it every batch via accumulate() and
# closes each epoch via accumulate_epoch().
deneme = AreaUnderTheMarginRanking()

### Model

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten, then 784 -> 512 -> 512 -> 10 with ReLU in between."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        hidden_units = 512
        layers = [
            nn.Linear(28*28, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw class logits (no softmax applied) for a batch of images."""
        return self.linear_relu_stack(self.flatten(x))

# Instantiate the network on the selected device and print its layer structure.
model = NeuralNetwork().to(device)
print(model)

In [None]:
# Cross-entropy on the raw logits, optimised with plain SGD (learning rate 1e-3).
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

#### Prediction History

In [None]:
# Empty history, allocated directly on the training device.
# Dimensions: epoch number x dataset size x class number.
prediction_history = torch.zeros(0, 60000, 10, device=device)

### Train Function

In [None]:
def train(dataloader, model, loss_fn, optimizer):
  """Run one training epoch and record the per-sample logits for the later analyses.

  Besides updating the model, this function has two side effects on notebook globals:
  it appends this epoch's logits to `prediction_history`, and it feeds every batch to
  the AUM tracker `deneme`, closing the epoch with `deneme.accumulate_epoch()`.

  Logits are stored in the order the dataloader yields them, so downstream cells that
  pair rows with the dataset's targets rely on the loader iterating in dataset order.

  Args:
    dataloader: iterable of (inputs, targets) batches exposing a `.dataset` attribute.
    model: network to train; switched to train mode.
    loss_fn: callable computing the loss from (logits, targets).
    optimizer: optimizer that updates the model parameters.
  """
  global prediction_history

  size = len(dataloader.dataset)
  model.train()

  epoch_logits = []   # detached logits of every batch, concatenated once at the end
  samples_seen = 0    # running offset used to build unique per-sample ids

  for batch, (X, y) in enumerate(dataloader):
    X, y = X.to(device), y.to(device)

    # Compute prediction error
    pred = model(X)

    # Store values only: keeping `pred` itself would keep the autograd graph of
    # every batch alive for as long as the prediction history exists.
    logits = pred.detach()
    epoch_logits.append(logits)

    # Ids are sample offsets in the epoch (the batch index alone would overlap
    # between consecutive batches).
    img_id = torch.arange(samples_seen, samples_seen + len(X), device=device)
    deneme.accumulate(logits, img_id, y)
    samples_seen += len(X)

    loss = loss_fn(pred, y)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if batch % 100 == 0:
      loss, current = loss.item(), batch * len(X)
      print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

  # Add this epoch's predictions to the full history as a new leading slice.
  epoch_pred = torch.cat(epoch_logits)
  prediction_history = torch.cat((prediction_history, torch.unsqueeze(epoch_pred, 0)))
  deneme.accumulate_epoch()

### Test Function

In [None]:
def test(dataloader, model, loss_fn):
    """Evaluate the model on a dataloader and print its accuracy and average batch loss.

    Args:
        dataloader: iterable of (inputs, targets) batches exposing a `.dataset` attribute.
        model: network to evaluate; switched to eval mode.
        loss_fn: callable computing the loss from (logits, targets).
    """
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    model.eval()

    total_loss = 0
    n_correct = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            logits = model(inputs)
            total_loss += loss_fn(logits, labels).item()
            hits = (logits.argmax(1) == labels).type(torch.float)
            n_correct += hits.sum().item()

    # Loss is averaged over batches, accuracy over individual samples.
    test_loss = total_loss / n_batches
    correct = n_correct / n_samples
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

### Training & Evaluating Model

In [None]:
epochs = 50

# Start every training run with empty histories so re-running this cell does not
# stack a new run on top of an old one. Both the prediction history and the AUM
# tracker are reset, because `train` appends to both.
# NOTE(review): the model weights and optimizer are NOT re-initialised here, so
# re-running continues training from the current weights.
prediction_history = torch.zeros(0, len(training_data), len(classes), device=device)
deneme = AreaUnderTheMarginRanking()

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

In [None]:
# Sanity check: classify the first test image and compare with its label.
model.eval()
x, y = test_data[0]

# Moving to the training device is a no-op when that device is the CPU.
x = x.to(device)

with torch.no_grad():
    pred = model(x)
    predicted = classes[pred[0].argmax(0)]
    actual = classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

### Data Cartography

In [None]:
# Logits -> class probabilities (softmax over the class axis) -> CPU -> numpy.
# Resulting axes: epoch num * dataset size * class num.
num_pred_hist = (
    torch.softmax(prediction_history, dim=2)
    .detach()
    .cpu()
    .numpy()
)

In [None]:
# Per-sample, per-class statistics across the epoch axis.
confidence_means = num_pred_hist.mean(axis=0)     # mean predicted probability
std = num_pred_hist.std(axis=0)                   # standard deviation of the predicted probability
# Ground-truth labels of the training set as a numpy array.
targets = training_data.targets.cpu().numpy()

In [None]:
# One row per training instance.
row_idx = np.arange(len(targets))

df = pd.DataFrame({"Label": targets})
# Pick the statistics of each instance's own (true) class as 1-D columns.
df["Confidence Mean"] = confidence_means[row_idx, targets]
df["Pred Std"] = std[row_idx, targets]
# Use the last recorded epoch (as the cleanlab cells do with num_pred_hist[-1]), so
# this does not depend on `epochs` matching the number of recorded epochs.
df["Last Pred"] = num_pred_hist[-1].argmax(axis=1)
df["Last Pred Corr"] = (df["Last Pred"] == df["Label"])

In [None]:
# Predicted label of every instance at every recorded epoch: shape (n_samples, n_epochs).
epoch_pred = num_pred_hist.argmax(axis=2).transpose()
# 1 where the prediction matched the label. Broadcasting the targets over the epoch
# axis avoids depending on `epochs` matching the number of recorded epochs.
correctness_matrix = (epoch_pred == targets[:, None]).astype(int)
# Fraction of epochs in which the instance was classified correctly.
# NOTE(review): the original author was unsure this matches the paper's definition
# of "correctness" - TODO confirm.
df["Correctness"] = correctness_matrix.mean(axis=1)

In [None]:
# Show the column dtypes of the per-instance frame.
df.dtypes

In [None]:
# Data map of the whole training set, coloured once by correctness and once by label.
fig, axes = plt.subplots(1, 2, figsize=(30, 7),sharex=True,sharey=True)
for ax, (hue_col, hue_palette) in zip(axes, [("Correctness", "RdYlGn"), ("Label", "tab10")]):
  sns.scatterplot(ax=ax, data=df, x="Pred Std", y="Confidence Mean", hue=hue_col, palette=hue_palette)
plt.show()

# One data map per class, five panels per row.
panels_per_row = 5
fig, axes = plt.subplots(len(classes)//panels_per_row, panels_per_row, figsize=(30, 12),sharex=True,sharey=True)
for label_id, class_name in enumerate(classes):
  panel = axes[divmod(label_id, panels_per_row)]
  class_rows = df[df["Label"]==label_id]
  sns.scatterplot(data=class_rows, ax=panel, x="Pred Std", y="Confidence Mean", hue="Correctness", palette="RdYlGn")
  panel.set_title(class_name)

In [None]:
# Row 0: densities for all classes together; rows 1..len(classes): one class each.
fig, axes = plt.subplots(len(classes)+1, 2, figsize=(30, (len(classes)+1)*7),sharex=True,sharey=True)

sns.kdeplot(ax=axes[0,0], data=df, x="Confidence Mean",hue="Label",palette="tab10")
sns.kdeplot(ax=axes[0,1], data=df, x="Pred Std",hue="Label",palette="tab10")

for i, class_name in enumerate(classes):
  # Offset by one row so the per-class plots do not draw over the overview in row 0.
  row = i + 1
  class_rows = df[df["Label"]==i]
  sns.kdeplot(ax=axes[row,0], data=class_rows, x="Confidence Mean")
  sns.kdeplot(ax=axes[row,1], data=class_rows, x="Pred Std")
  axes[row,0].set_title(class_name)
  axes[row,1].set_title(class_name)

In [None]:
# Single-column frame with the true label of every training instance.
labels=pd.DataFrame({"Label":targets})

In [None]:
# Preview: one row per instance, one column per epoch, holding the probability
# the model assigned to the instance's true label at that epoch.
true_label_probs = np.take_along_axis(num_pred_hist, np.atleast_3d(targets), axis=2)
epoch_columns = range(1, epochs+1)
pd.DataFrame(true_label_probs.squeeze().transpose(), columns=epoch_columns).head(10)

In [None]:
epoch_columns = range(1, epochs+1)

# True-label probability per instance (rows) and epoch (columns), with the label attached.
true_label_probs = np.take_along_axis(num_pred_hist, np.atleast_3d(targets), axis=2).squeeze().transpose()
label_confidence = pd.concat([labels, pd.DataFrame(true_label_probs, columns=epoch_columns)], axis=1)
# Mean per label; after the transpose: columns = labels, rows = epochs.
label_pred_grouped = label_confidence.groupby("Label").mean().transpose()

# Same layout for the 0/1 correctness indicators.
correctness_df = pd.DataFrame(correctness_matrix, columns=epoch_columns)
correctness_df["Label"] = targets
correctness_df_grouped = correctness_df.groupby("Label").mean().transpose()

In [None]:
#### Binary Accuracy vs Confidence
# First part: every class on shared axes, confidence on the left, accuracy on the right.

fig, axes = plt.subplots(1, 2, figsize=(24, 10),sharex=True)
confidence_ax, accuracy_ax = axes

fig.suptitle('History of Classes',size=40)

# Legend entries follow the column (label) order in which the lines are drawn.
legend = [classes[col] for col in label_pred_grouped.columns]

for col in label_pred_grouped.columns:
  sns.lineplot(ax=confidence_ax, data=label_pred_grouped[col])
confidence_ax.set_ylim(0, 1)
confidence_ax.legend(legend)
confidence_ax.set_title("Confidence of items by labels")

for col in correctness_df_grouped.columns:
  sns.lineplot(ax=accuracy_ax, data=correctness_df_grouped[col])
accuracy_ax.legend(legend)
accuracy_ax.set_title("Binary Accuracy of labels")

plt.show()

# Second part: one panel per class, three panels per row, both curves overlaid.
n_rows = (len(classes)+2)//3
fig, axes = plt.subplots(n_rows, 3, figsize=(24, 4*((len(classes)+1)//2)),sharex=True,sharey=True)
for panel, col in zip(axes.flat, label_pred_grouped.columns):
  sns.lineplot(ax=panel, data=label_pred_grouped[col])
  sns.lineplot(ax=panel, data=correctness_df_grouped[col])

  panel.legend(["Confidence Mean","Binary Accuracy"])
  panel.set_title(classes[col])

plt.show()

### AUM Scores

In [None]:
# Recorded logits as numpy, axes: epoch x sample x (target logit, max non-target logit).
num_AUM_history = deneme.hist_delta_AUM.cpu().numpy()
# Split the last axis and put samples on the rows: each array is sample x epoch.
label_logits = num_AUM_history[:, :, 0].T
highest_non_logits = num_AUM_history[:, :, 1].T
# Per-sample score: mean target logit minus mean of the highest non-target logit.
mean_label_logit = label_logits.mean(axis=1)
mean_highest_non_logit = highest_non_logits.mean(axis=1)
AUM_scores = mean_label_logit - mean_highest_non_logit

In [None]:
# Attach the per-instance AUM score to the analysis frame.
df["AUM"]=AUM_scores

In [None]:
# Density of the AUM scores (alternative view: df.AUM.plot.hist()).
sns.kdeplot(df["AUM"])

In [None]:
# Preview the frame after adding the AUM column.
df.head()

In [None]:
n_examples = 8
epoch_axis = range(1,epochs+1)

# Draw random row positions. randrange excludes the upper bound, whereas
# random.randint(0, n) can return n itself, which is one past the last row.
selected = [random.randrange(df.shape[0]) for _ in range(n_examples)]

# Top strip: the sampled images.
plt.figure(figsize=(40,10))
for i, num in enumerate(selected):
  plt.subplot(1,n_examples,i+1)
  plt.xticks([])
  plt.yticks([])
  plt.grid(False)
  plt.imshow(training_data.data[num], cmap=plt.cm.binary)
plt.show()

# Bottom strip: target logit (green) vs highest non-target logit (red) per epoch.
fig, axes = plt.subplots(1, n_examples, figsize=(40, 5),sharex=True,sharey=True)
for i, num in enumerate(selected):
  ax = axes[i]
  target_curve = label_logits[num]
  other_curve = highest_non_logits[num]

  # Label this panel explicitly; plt.xlabel would label the current axes instead.
  ax.set_xlabel(classes[targets[num]])
  sns.lineplot(ax=ax,x=epoch_axis,y=target_curve,color="Green")
  sns.lineplot(ax=ax,x=epoch_axis,y=other_curve,color="Red")

  label_name = classes[df.loc[num, "Label"]]
  last_pred_name = classes[df.loc[num, "Last Pred"]]
  ax.set_title(f"Label: {label_name}\nLast Pred: {last_pred_name}")

  # Shade the margin: green where the target logit leads, red where it does not.
  ax.fill_between(
    epoch_axis, target_curve, other_curve, where=(target_curve > other_curve),
    interpolate=True, color="green", alpha=0.25,
    label="Positive"
  )
  ax.fill_between(
    epoch_axis, target_curve, other_curve, where=(target_curve <= other_curve),
    interpolate=True, color="red", alpha=0.25,
    label="Negative"
  )

### Forgetting

In [None]:
# Check the history dimensions (epochs x instances x classes).
prediction_history.shape

In [None]:
# Difference between consecutive epochs of the 0/1 correctness indicators:
#   +1 -> correct, then wrong (forgot);  -1 -> wrong, then correct (learned).
previous_epoch = correctness_matrix[:, :-1]
next_epoch = correctness_matrix[:, 1:]
action = previous_epoch - next_epoch

forgots_in_epoch = (action == 1).astype(int)
learns_in_epoch = (action == -1).astype(int)

df["Forget Nums"] = forgots_in_epoch.sum(axis=1)
# An instance that is already correct in the first epoch counts as one learning event.
df["Learn Nums"] = learns_in_epoch.sum(axis=1) + correctness_matrix[:, 0]

# Index of the instances that were forgotten at least once.
forgots = df.index[df["Forget Nums"] != 0]

In [None]:
# Number of forgetting events overall, and number of instances with at least one.
total_forget_events = df["Forget Nums"].sum()
forgotten_instance_count = (df["Forget Nums"]!=0).sum()
print("Total forgets during training= ", total_forget_events)
print("Total forgotten instances during training= ", forgotten_instance_count)

In [None]:
fig, axes = plt.subplots(4, 2, figsize=(24, 30))
fig.suptitle('Forget Num Dists',size=40)

# Row 0: box plots of confidence and variability per number of forgetting events.
sns.boxplot(ax=axes[0,0], data=df,x="Forget Nums",y="Confidence Mean")
sns.boxplot(ax=axes[0,1],data=df,x="Forget Nums",y="Pred Std")

# Rows 1-3: one metric per row. Left: density with hue = forget count.
# Right: one filled density per forget count.
forget_levels = sorted(df["Forget Nums"].unique())
density_panels = [
  ("Confidence Mean", "Number of forgets-Correct Label Confidence Density"),
  ("Pred Std", "Number of forgets-Correct Label Pred Std"),
  ("AUM", "Number of forgets-Correct Label AUM"),
]
for row, (metric, panel_title) in enumerate(density_panels, start=1):
  sns.kdeplot(ax=axes[row,0], data=df, x=metric, hue="Forget Nums")

  for level in forget_levels:
    # Attach the label to the artist itself, so the legend stays correct even if a
    # curve cannot be drawn. `fill` replaces the deprecated `shade` argument.
    sns.kdeplot(ax=axes[row,1], data=df[df["Forget Nums"]==level][metric], fill=True, label=str(level))
  axes[row,1].legend(title="Forget Nums")
  axes[row,1].set_title(panel_title)

# TODO: repeat these plots with hue = "Label".
# TODO: data maps coloured by "Forget Nums" / "Learn Nums" (x="Pred Std", y="Confidence Mean").

In [None]:
# One data map per distinct number of forgetting events, two panels per row.
forget_nums = sorted(df["Forget Nums"].unique())
n_rows = (len(forget_nums)-1)//2+1
# squeeze=False keeps `axes` two-dimensional even when a single row is enough.
fig, axes = plt.subplots(n_rows, 2, figsize=(24, 13), squeeze=False)

fig.suptitle('Data map per Forget Nums',size=25)

for num, i in enumerate(forget_nums):
  r,c=num//2, num%2
  sns.scatterplot(ax=axes[r, c], data=df[df["Forget Nums"]==i], x="Pred Std", y="Confidence Mean", hue="Correctness", palette="RdYlGn", alpha=0.6,)
  axes[r, c].set_title(f"Forget Nums = {i}")
  axes[r, c].set_xlim(left=0, right=0.5)
  axes[r, c].set_ylim(bottom=0, top=1)

# Hide the leftover panel when the number of forget counts is odd.
for unused_ax in axes.flat[len(forget_nums):]:
  unused_ax.set_visible(False)

In [None]:
def plot_labeled(df,x_axis,y_axis,hue_, forget_scale=6000):
    """Scatter one point per row of `df` and write the row's class name next to it.

    Args:
        df: DataFrame with the columns `x_axis`, `y_axis`, `hue_`, "Label" and
            "Forget Nums". It is not modified; a scaled copy is plotted.
        x_axis: column name for the horizontal axis.
        y_axis: column name for the vertical axis.
        hue_: column name used to colour the points.
        forget_scale: factor applied to the "Forget Nums" column before plotting.
            The default of 6000 is presumably the number of training images per
            class - TODO confirm.
    """
    # Work on a copy so the caller's frame is not rescaled as a side effect.
    plot_df = df.copy()
    plot_df["Forget Nums"] *= forget_scale

    plt.figure(figsize=(14,7))
    ax = sns.scatterplot(x = x_axis, y = y_axis, data=plot_df, s=500, hue=hue_,)

    # Annotate each point with its class name, nudged slightly up and to the right.
    for _, row in plot_df.iterrows():
        ax.text(row[x_axis]+0.003, row[y_axis]+0.005, classes[int(row["Label"])])

    ax.set_xlim(0, 0.3)
    ax.set_ylim(0, 1)

    plt.title(f"{y_axis} - {x_axis}")
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)

# WARNING: averaging per-instance standard deviations within a class is not a
# statistically sound per-class standard deviation.
# TODO: compute it properly, and also produce this plot for the whole data set.
plot_labeled(df.groupby("Label", as_index =False).mean(), "Pred Std","Confidence Mean", "Forget Nums")

#### Mean true-label predictions of forgotten instances

In [None]:
# Mask of the instances that were forgotten at least once.
was_forgotten = df["Forget Nums"]!=0

# Forgotten instances -> mean per label -> transposed (rows = epochs, columns = labels).
label_pred_forget = label_confidence[was_forgotten]
label_pred_forget_grouped = label_pred_forget.groupby("Label").mean().transpose()

correctness_df_forget = correctness_df[was_forgotten]
correctness_df_forget_grouped = correctness_df_forget.groupby("Label").mean().transpose()

In [None]:
# Same two panels as the class history, restricted to instances forgotten at least once.
fig, axes = plt.subplots(1, 2, figsize=(24, 7))
confidence_ax, accuracy_ax = axes

fig.suptitle('History of Forgotten itmes')

# Legend entries follow the column (label) order in which the lines are drawn.
legend = [classes[col] for col in label_pred_forget_grouped.columns]

for col in label_pred_forget_grouped.columns:
  sns.lineplot(ax=confidence_ax, data=label_pred_forget_grouped[col])
confidence_ax.set_ylim(0, 1)
confidence_ax.legend(legend)
confidence_ax.set_title("Confidence of forgotten items by labels")

for col in correctness_df_forget_grouped.columns:
  sns.lineplot(ax=accuracy_ax, data=correctness_df_forget_grouped[col])
accuracy_ax.legend(legend)
accuracy_ax.set_title("Binary Accuracy of labels that have been forgotten at least once")

plt.show()

In [None]:
# One panel per label (three per row): confidence and accuracy of the forgotten instances.
n_rows = (len(classes)+2)//3
fig, axes = plt.subplots(n_rows, 3, figsize=(24, 5*((len(classes)+1)//2)),sharex=True,sharey=True)
fig.suptitle('History of Forgotten items by label', size = 40)
for panel, col in zip(axes.flat, label_pred_forget_grouped.columns):
  sns.lineplot(ax=panel, data=label_pred_forget_grouped[col])
  sns.lineplot(ax=panel, data=correctness_df_forget_grouped[col])

  panel.legend(["Confidence Mean","Binary Accuracy"])
  panel.set_title(classes[col])

#### Distribution of forgetting events over epochs

In [None]:
# Number of forgetting events per epoch transition, summed over all instances.
# Position k on the x-axis is the transition between recorded epochs k and k+1
# (0-based), so the tick labels are offset from the epoch numbers - TODO relabel.
sns.lineplot(data=forgots_in_epoch.sum(axis=0))  ##xtics should +=2

#### ENTROPY

In [None]:
# Entropy of the predicted class distribution: one row per instance, one column per epoch.
entropies = pd.DataFrame(entropy(num_pred_hist, axis=2).transpose())

# Summarise across epochs BEFORE the label column is attached; otherwise the
# integer label would be averaged in as if it were another epoch's entropy.
df["Entropy Mean"] = entropies.mean(axis=1)
df["Entropy Std"] = entropies.std(axis=1)      # how much the entropy changes over the epochs

# The label is only needed for the per-label grouping done later.
entropies["Label"] = targets

In [None]:
# Data map coloured by mean entropy; the reversed palette maps low values to green.
sns.scatterplot(data=df, x="Pred Std", y="Confidence Mean", hue="Entropy Mean", palette="RdYlGn_r", alpha=0.6)

In [None]:
# 2x2 comparison of the per-instance metrics, with independent axes per panel.
fig, axes = plt.subplots(2 , 2, figsize=(14, 14),sharex=False,sharey=False)

by_correctness = dict(data=df, hue="Correctness", palette="RdYlGn", alpha=0.6)
by_label = dict(data=df, hue="Label", palette="tab10", alpha=0.6)

sns.scatterplot(ax=axes[0,0], x="Pred Std", y="Confidence Mean", **by_correctness)
sns.scatterplot(ax=axes[0,1], x="AUM", y="Entropy Mean", **by_correctness)
sns.scatterplot(ax=axes[1,0], x="Entropy Std", y="Entropy Mean", **by_correctness)
sns.scatterplot(ax=axes[1,1], x="Entropy Std", y="Entropy Mean", **by_label)

In [None]:
# Mean entropy per label (one line each) across the epoch columns.
entropies.groupby("Label").mean().transpose().plot.line(title="Entropy means of labels at each epoch")

In [None]:
# Histograms of the two entropy summaries, side by side.
fig, axes = plt.subplots(1 , 2, figsize=(20, 10),sharex=False,sharey=False)
mean_ax, std_ax = axes
sns.histplot(df["Entropy Mean"], ax=mean_ax)
sns.histplot(df["Entropy Std"], ax=std_ax)

### Final State of the Data Frame

In [None]:
# Preview the frame with all metrics computed so far.
df.head()

# Less Important Experiments

### Confident Learning (optional, you can skip this section)

In [None]:
!pip install cleanlab

In [None]:
import requests
import io
import cleanlab
from cleanlab.filter import find_label_issues

#### Health Summary

In [None]:
# Dataset health report computed from the last recorded epoch's class probabilities.
# NOTE(review): these are in-sample probabilities recorded during training, not
# out-of-sample (cross-validated) ones - confirm this is acceptable for cleanlab.
cleanlab.dataset.health_summary(targets, num_pred_hist[-1], class_names=classes)

#### Label Scoring

In [None]:
# Flag likely label errors from the last recorded epoch's class probabilities.
# TODO: replace pred_probs with cross-validated predictions once cross validation is added.
# NOTE(review): `issues` is presumably a boolean mask over the training set; a later
# cell uses it to index training_data.data and targets - confirm.
issues = find_label_issues(labels=targets, pred_probs=num_pred_hist[-1])
print(issues)

In [None]:
# NOTE(review): `plot_examples` is not defined or imported anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel unless it is provided
# elsewhere - TODO confirm or remove.
plot_examples(issues[range(15)], 3, 5)

#### Plot Found Label Issues

In [None]:
# Plot the flagged label issues at positions 50..74 in a 5x5 grid.
# Select the flagged images and labels once, not on every loop iteration.
issue_images = training_data.data[issues]
issue_labels = targets[issues]

first_shown, last_shown = 50, 75
# Stop early when fewer than `last_shown` issues were flagged.
shown_positions = range(first_shown, min(last_shown, len(issue_labels)))

plt.figure(figsize=(10,10))
for cell, i in enumerate(shown_positions, start=1):
  plt.subplot(5,5,cell)
  plt.xticks([])
  plt.yticks([])
  plt.grid(False)
  plt.imshow(issue_images[i], cmap=plt.cm.binary)
  plt.xlabel(classes[issue_labels[i]])
plt.show()

#### Overlapping Classes

In [None]:
import graph_tool as gt

In [None]:
##Overlapping Classes
overlaps=cleanlab.dataset.find_overlapping_classes(
    labels=targets, 
    class_names=classes,
    pred_probs=num_pred_hist[-1])

overlaps["Joint Probability%"]=overlaps["Joint Probability"]*100

In [None]:
# Square class-by-class table of the joint probability (%): pivot the pair list,
# reindex rows and columns to the same sorted set of class names, fill gaps with 0.
vals = np.unique(overlaps[['Class Name A', 'Class Name B']])
overlap_adj=overlaps.pivot(index='Class Name A', columns='Class Name B', values='Joint Probability%').reindex(columns=vals, index=vals, fill_value=0).fillna(0)
# Add the transpose so each pair's value appears on both sides of the diagonal.
overlap_adj+=overlap_adj.transpose()

In [None]:
# Total joint probability (%) each class shares with all other classes.
overlap_adj.sum(axis=1)

In [None]:
# Heatmap of the table, colour scale capped at 5%, x-axis labels on top.
fig, ax = plt.subplots(figsize=(14,14))
heatmap_style = dict(annot=True, vmin=0, vmax=5, fmt='g', cmap="Reds", linewidths=2)
ax = sns.heatmap(overlap_adj, ax=ax, **heatmap_style)
ax.set_title('Joint Probability%',)
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
plt.xticks(rotation = 0)
plt.yticks(rotation = 0)

## Clustering

In [None]:
# Preview the features that go into the clustering below.
df.head()

In [None]:
from sklearn.cluster import KMeans

In [None]:
kmeans = KMeans(n_clusters = 3, random_state = 0)
# Cluster on every column except a previous run's assignments, so re-running this
# cell does not feed the old `clusters` column back in as a feature.
# NOTE(review): the features (including "Label") are used unscaled - confirm intended.
cluster_features = df.drop(columns="clusters", errors="ignore")
df["clusters"] = kmeans.fit_predict(cluster_features)

In [None]:
sns.set(rc={'figure.figsize':(14,7)})
# The variability column of `df` is named "Pred Std"; there is no "std" column.
sns.scatterplot(data=df, x="Pred Std", y="Confidence Mean", hue="clusters", palette="deep")