In [1]:
import numpy as np
import pandas as pd

from scipy.stats import beta
from scipy import integrate

def moving_average(x, n=2):
    '''
      Moving average over a sliding window of n consecutive values

      The averages are obtained from a cumulative sum: the sum of each
      window is the difference between two cumulative sums that lie
      n positions apart.

      References
      ----------
      [1] https://stackoverflow.com/a/14314054/2097158

      Parameters
      ----------
      x: 1-D numpy array (or array-like) of numbers
      n: window size, a positive integer

      Returns
      -------
      numpy array of floats holding one average per complete window,
      i.e. len(x) - n + 1 values (empty if n exceeds the number of values)

      Raises
      ------
      ValueError
          if n is smaller than 1
    '''

    # A non-positive window would either fail with an obscure broadcasting
    # error (n = 0) or silently return meaningless numbers (n < 0)
    if n < 1:
        raise ValueError(
            'window size n must be at least 1, got {}'.format(n)
        )

    # Accumulate as float so that integer input is not truncated
    ret = np.cumsum(x, dtype=float)

    # From position n onwards, remove the part of the running total
    # that lies before the current window
    ret[n:] = ret[n:] - ret[:-n]

    # The first n - 1 positions do not cover a complete window
    return ret[n - 1:] / n

In [2]:
# Two classes: 0 (negative) and 1 (positive)

# Number of rows for each class
N_0 = 1000
N_1 = 1000

# Random values generated from beta distributions
# Beta distributions were chosen, because they best represent
# distributions of values in the interval [0,1]
# Every draw is seeded with its own index, which makes the data reproducible.
# NOTE(review): both classes reuse the seeds 0..N-1, so the k-th draws of
# the two classes come from identically seeded generators - confirm that
# this coupling is intended.
y_0 = [beta.rvs(1.5, 4.5, random_state=k) for k in range(0, N_0)]
y_1 = [beta.rvs(3, 3, random_state=k) for k in range(0, N_1)]

# Suffix to include in filenames
file_name_suffix = 'balanced'

# Linearly spaced values in the interval [0,1], used as histogram bin edges
x = np.linspace(start=0., stop=1., num=50)

# Histograms (both use the same bin edges)
hist_0, bin_edges = np.histogram(y_0, bins=x)
hist_1, bin_edges = np.histogram(y_1, bins=x)

# bin centers (prob) are moving averages of bin edges
# For the row of bin i, every sample in bins 0..i counts as predicted
# negative (cumulative sums) and every sample in a later bin counts as
# predicted positive (total minus cumulative sum).
df = pd.DataFrame({
        'prob': moving_average(bin_edges),
        '0': hist_0,
        'TN': np.cumsum(hist_0),
        'FP': np.sum(hist_0) - np.cumsum(hist_0),
        '1': hist_1,
        'TP': np.sum(hist_1) - np.cumsum(hist_1),
        'FN': np.cumsum(hist_1),
     })

# The metric columns below are filled by plain assignment rather than
# df[col].fillna(..., inplace=True): the in-place call on a selected column
# is chained assignment, which does not update df under pandas Copy-on-Write.

# Precision (positive predictive value): TP / (TP + FP)
# 0/0 (nothing is predicted positive) is set to 1
df['Precision'] = (df['TP'] / (df['TP'] + df['FP'])).fillna(value=1.)

# Recall (true positive rate): TP / (TP + FN)
# 0/0 (class 1 is empty) is set to 0
df['Recall'] = (df['TP'] / (df['TP'] + df['FN'])).fillna(value=0.)

# F-1 Score: harmonic mean of precision and recall, 0/0 is set to 0
df['F1'] = (
    (2. * df['Recall'] * df['Precision']) /
    (df['Recall'] + df['Precision'])
).fillna(value=0.)

# Accuracy
df['Accuracy'] = (
    (df['TN'] + df['TP']) /
    (df['TN'] + df['TP'] + df['FN'] + df['FP'])
)

# Matthews Correlation Coefficient
# The denominator is evaluated in floating point: a product of four integer
# counts can exceed the int64 range when the classes are large.
# 0/0 (a row or column of the confusion matrix is empty) is set to 0
df['Matthews'] = (
    (df['TP'] * df['TN'] - df['FP'] * df['FN']) /
    np.sqrt(
        (df['TP'] + df['FP']).astype(float)
        * (df['TP'] + df['FN'])
        * (df['TN'] + df['FP'])
        * (df['TN'] + df['FN'])
    )
).fillna(value=0.)

In [3]:
# Preview the first rows of the metrics table (lowest thresholds)
df.head()

Unnamed: 0,prob,0,TN,FP,1,TP,FN,Precision,Recall,F1,Accuracy,Matthews
0,0.010204,23,23,977,0,1000,0,0.505817,1.0,0.671817,0.5115,0.10786
1,0.030612,44,67,933,1,999,1,0.517081,0.999,0.681446,0.533,0.18209
2,0.05102,38,105,895,1,998,2,0.527205,0.998,0.689941,0.5515,0.22886
3,0.071429,47,152,848,3,995,5,0.539881,0.995,0.699965,0.5735,0.273278
4,0.091837,51,203,797,3,992,8,0.5545,0.992,0.711366,0.5975,0.317386


In [4]:
# Preview the last rows of the metrics table (highest thresholds)
df.tail()

Unnamed: 0,prob,0,TN,FP,1,TP,FN,Precision,Recall,F1,Accuracy,Matthews
44,0.908163,0,1000,0,3,5,995,1.0,0.005,0.00995,0.5025,0.050063
45,0.928571,0,1000,0,3,2,998,1.0,0.002,0.003992,0.501,0.031639
46,0.94898,0,1000,0,2,0,1000,1.0,0.0,0.0,0.5,0.0
47,0.969388,0,1000,0,0,0,1000,1.0,0.0,0.0,0.5,0.0
48,0.989796,0,1000,0,0,0,1000,1.0,0.0,0.0,0.5,0.0


In [5]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Row of df whose bin center is drawn as the decision threshold
idx = 30

fig = make_subplots(rows=1, cols=1)

data_dict = {}

# One filled step curve per confusion-matrix region:
# (key, histogram column, legend name, line color, rows of df to draw)
# Rows up to and including idx are shown as predicted negative, rows from
# idx onwards as predicted positive; row idx belongs to both slices, so the
# filled areas meet at the threshold.
regions = [
    ('TN', '0', 'TN: True Negatives', 'firebrick', slice(None, idx + 1)),
    ('FP', '0', 'FP: False Positives', 'green', slice(idx, None)),
    ('FN', '1', 'FN: False Negatives', 'darkorange', slice(None, idx + 1)),
    ('TP', '1', 'TP: True Positives', 'cornflowerblue', slice(idx, None)),
]

for key, column, label, colour, rows in regions:
    data_dict[key] = {
        'x': df['prob'].iloc[rows],
        'y': df[column].iloc[rows],
        'name': label,
        'line': {'shape': 'hv', 'color': colour},
        'mode': 'lines',
        'fill': 'tozeroy'
    }
    fig.add_trace(data_dict[key], row=1, col=1)

# Leave 5% of head room above the tallest histogram bar
y_max = 1.05 * max(df['0'].max(), df['1'].max())

# Vertical dotted line at the threshold
threshold_prob = df['prob'].iloc[idx]

data_dict['threshold'] = {
    'x': [threshold_prob, threshold_prob],
    'y': [0., y_max],
    'name': 'threshold',
    'line': {'color': 'grey', 'width': 3, 'dash': 'dot'},
    'mode': 'lines'
}

fig.add_trace(data_dict['threshold'], row=1, col=1)

# Small margin so that curves at 0 and 1 are not clipped by the axis
epsilon = 1.e-2

fig.update_xaxes(
    title='Output Probability',
    range=[-epsilon, 1 + epsilon],
    row=1, col=1
)

fig.update_yaxes(
    title='Frequency',
    range=[0, y_max],
    row=1, col=1
)

# Font and fixed figure size
fig.update_layout(
    font={
        'family': 'Courier New, monospace',
        'size': 20,
        'color': 'Gray'
    },
    autosize=False,
    width=900,
    height=600
)

fig.show()

# Keep an interactive copy of the figure next to the notebook
fig.write_html('distr_{}.html'.format(file_name_suffix))

In [6]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Row of df whose bin center is drawn as the decision threshold
idx = 30

# Left panel: class histograms split at the threshold
# Right panel: classification metrics as a function of the threshold
fig = make_subplots(
    rows=1,
    cols=2,
    horizontal_spacing=0.15,
    subplot_titles=('', '')
)

data_dict = {}

# --- Left panel ----------------------------------------------------------

# One filled step curve per confusion-matrix region:
# (key, histogram column, legend name, line color, rows of df to draw)
# Rows up to and including idx are shown as predicted negative, rows from
# idx onwards as predicted positive; row idx belongs to both slices, so the
# filled areas meet at the threshold.
regions = [
    ('TN', '0', 'TN: True Negatives', 'firebrick', slice(None, idx + 1)),
    ('FP', '0', 'FP: False Positives', 'green', slice(idx, None)),
    ('FN', '1', 'FN: False Negatives', 'darkorange', slice(None, idx + 1)),
    ('TP', '1', 'TP: True Positives', 'cornflowerblue', slice(idx, None)),
]

for key, column, label, colour, rows in regions:
    data_dict[key] = {
        'x': df['prob'].iloc[rows],
        'y': df[column].iloc[rows],
        'name': label,
        'line': {'shape': 'hv', 'color': colour},
        'mode': 'lines',
        'fill': 'tozeroy'
    }
    fig.add_trace(data_dict[key], row=1, col=1)

# Leave 5% of head room above the tallest histogram bar
y_max = 1.05 * max(df['0'].max(), df['1'].max())

# Vertical dotted line at the threshold
threshold_prob = df['prob'].iloc[idx]

data_dict['threshold'] = {
    'x': [threshold_prob, threshold_prob],
    'y': [0., y_max],
    'name': 'Threshold',
    'line': {'color': 'grey', 'width': 3, 'dash': 'dot'},
    'mode': 'lines'
}

fig.add_trace(data_dict['threshold'], row=1, col=1)

# Small margin so that curves at 0 and 1 are not clipped by the axis
epsilon = 1.e-2

fig.update_xaxes(
    title='Output Probability',
    range=[-epsilon, 1 + epsilon],
    row=1, col=1
)

fig.update_yaxes(
    title='Frequency',
    range=[0, y_max],
    row=1, col=1
)

# --- Right panel ---------------------------------------------------------

# One thin marked line per metric: (column of df, legend name, line color)
metric_curves = [
    ('Precision', 'Precision', 'gray'),
    ('Recall', 'Recall', 'darkgray'),
    ('F1', 'F1 score', 'magenta'),
    ('Accuracy', 'Accuracy', 'navy'),
    ('Matthews', 'Matthews Corr.', 'darkcyan'),
]

for metric, label, colour in metric_curves:
    data_dict[metric] = {
        'x': df['prob'],
        'y': df[metric],
        'name': label,
        'text': df['prob'],
        'line': {
            'shape': 'linear',
            'color': colour,
            'width': 1,
            'smoothing': 0.
        },
        'marker': {'size': 3},
        'mode': 'lines+markers'
    }
    fig.add_trace(data_dict[metric], row=1, col=2)

# Wider margin for the metrics panel; both axes show [0, 1] plus the margin
epsilon = 5.e-2

fig.update_xaxes(
    title='Threshold',
    range=[-epsilon, 1 + epsilon],
    row=1, col=2
)

fig.update_yaxes(
    title='',
    range=[-epsilon, 1 + epsilon],
    row=1, col=2
)

# Font and fixed figure size
fig.update_layout(
    font={
        'family': 'Courier New, monospace',
        'size': 20,
        'color': 'Gray'
    },
    autosize=False,
    width=1200,
    height=600
)

fig.show()

# Keep an interactive copy of the figure next to the notebook
fig.write_html('Matthews_correlation_{}.html'.format(file_name_suffix))