<a href="https://colab.research.google.com/github/felipepenha/data-science-bits/blob/master/performance_metrics/F-1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

from scipy.stats import beta
from scipy import integrate

def moving_average(x, n=2):
    '''
      Simple moving average over a sliding window of `n` samples,
      computed from cumulative sums.

      References
      ----------
      [1] https://stackoverflow.com/a/14314054/2097158

      Parameters
      ----------
      x: numpy array
      n: window size

      Returns
      -------
      numpy float array holding the mean of each run of `n`
      consecutive values of `x`
    '''

    running = np.cumsum(x, dtype=float)

    # Sum over each full window = difference of running totals n apart
    window_sums = running.copy()
    window_sums[n:] = running[n:] - running[:-n]

    # The first n - 1 entries do not span a full window, so skip them
    return window_sums[n - 1:] / n

In [2]:
# Two classes: 0 (negative) and 1 (positive)

# Number of rows for each class
N_0 = 1000
N_1 = 100

# Random values generated from beta distributions
# Beta distributions were chosen, because they best represent
# distributions of values in the interval [0,1]
# Each value is drawn with its own fixed seed, so the data are reproducible.
# NOTE(review): both classes reuse the seeds 0..N_1-1 -- confirm that this
# is intended.
y_0 = [beta.rvs(1.5, 4.5, random_state=k) for k in range(0, N_0)]
y_1 = [beta.rvs(4, 2, random_state=k) for k in range(0, N_1)]

# Suffix to include in filenames
file_name_suffix = 'unbalanced_granular'

# Linearly spaced values in the interval [0,1], used as histogram bin edges
x = np.linspace(start=0., stop=1., num=450)

# Histograms (one count per bin, i.e. len(x) - 1 values each)
hist_0, bin_edges = np.histogram(y_0, bins=x)
hist_1, bin_edges = np.histogram(y_1, bins=x)

# bin centers (prob) are moving averages of bin edges
# For each row, values up to and including that bin are counted as
# predicted negative (TN, FN); the remaining values as predicted
# positive (FP, TP).
df = pd.DataFrame({
        'prob': moving_average(bin_edges),
        '0': hist_0,
        'TN': np.cumsum(hist_0),
        'FP': np.sum(hist_0) - np.cumsum(hist_0),
        '1': hist_1,
        'TP': np.sum(hist_1) - np.cumsum(hist_1),
        'FN': np.cumsum(hist_1),
     })

# The fills below are assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form acts on a temporary
# Series and leaves `df` untouched under pandas Copy-on-Write.

# Precision = TP / (TP + FP)
# 0/0 (nothing predicted positive) yields NaN, replaced by 1
df['Precision'] = (df['TP'] / (df['TP'] + df['FP'])).fillna(value=1.)

# Recall (True Positive Rate) = TP / (TP + FN)
# NaN (no positive samples at all) is replaced by 0
df['Recall'] = (df['TP'] / (df['TP'] + df['FN'])).fillna(value=0.)

# F-1 Score: harmonic mean of Recall and Precision
# NaN (Recall = Precision = 0) is replaced by 0
df['F-1'] = (
    (2. * df['Recall'] * df['Precision']) /
    (df['Recall'] + df['Precision'])
).fillna(value=0.)

# Arithmetic Mean of Recall and Precision
df['AM'] = ((df['Recall'] + df['Precision']) / 2.).fillna(value=0.)

# Geometric Mean of Recall and Precision
df['GM'] = np.sqrt(df['Recall'] * df['Precision']).fillna(value=0.)

In [3]:
# First rows of the metrics table (lowest values of 'prob')
df.head()

Unnamed: 0,prob,0,TN,FP,1,TP,FN,Precision,Recall,F-1,AM,GM
0,0.001114,2,2,998,0,100,0,0.091075,1.0,0.166945,0.545537,0.301786
1,0.003341,0,2,998,0,100,0,0.091075,1.0,0.166945,0.545537,0.301786
2,0.005568,2,4,996,0,100,0,0.091241,1.0,0.167224,0.54562,0.302061
3,0.007795,3,7,993,0,100,0,0.091491,1.0,0.167645,0.545746,0.302475
4,0.010022,5,12,988,0,100,0,0.091912,1.0,0.16835,0.545956,0.30317


In [4]:
# Last rows of the metrics table (highest values of 'prob')
df.tail()

Unnamed: 0,prob,0,TN,FP,1,TP,FN,Precision,Recall,F-1,AM,GM
444,0.989978,0,1000,0,0,0,100,1.0,0.0,0.0,0.5,0.0
445,0.992205,0,1000,0,0,0,100,1.0,0.0,0.0,0.5,0.0
446,0.994432,0,1000,0,0,0,100,1.0,0.0,0.0,0.5,0.0
447,0.996659,0,1000,0,0,0,100,1.0,0.0,0.0,0.5,0.0
448,0.998886,0,1000,0,0,0,100,1.0,0.0,0.0,0.5,0.0


In [5]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Single-panel figure: histograms of both classes, split at a threshold
# into the four confusion-matrix regions.
fig = make_subplots(rows=1, cols=1)

# Row of df at which the threshold line is drawn
idx = 200

# Both sides include row idx, so the filled areas meet at the threshold
at_or_below = slice(None, idx + 1)
at_or_above = slice(idx, None)

# (key, histogram column, rows, legend label, colour)
region_specs = [
    ('TN', '0', at_or_below, 'TN: True Negatives', 'firebrick'),
    ('FP', '0', at_or_above, 'FP: False Positives', 'green'),
    ('FN', '1', at_or_below, 'FN: False Negatives', 'darkorange'),
    ('TP', '1', at_or_above, 'TP: True Positives', 'cornflowerblue'),
]

data_dict = {
    key: {
        'x': df['prob'].iloc[rows],
        'y': df[column].iloc[rows],
        'name': label,
        'line': {'shape': 'hv', 'color': colour},
        'mode': 'lines',
        'fill': 'tozeroy',
    }
    for key, column, rows, label, colour in region_specs
}

# Leave 5% headroom above the tallest histogram bar
y_max = 1.05 * max(np.max(df[column]) for column in ('0', '1'))

# Vertical dotted line marking the threshold
threshold_prob = df['prob'].iloc[idx]

data_dict['threshold'] = {
    'x': [threshold_prob, threshold_prob],
    'y': [0., y_max],
    'name': 'threshold',
    'line': {'color': 'grey', 'width': 3, 'dash': 'dot'},
    'mode': 'lines',
}

# Traces are added in the order TN, FP, FN, TP, threshold
for key in ('TN', 'FP', 'FN', 'TP', 'threshold'):
    fig.add_trace(data_dict[key], row=1, col=1)

# Small margin around the [0, 1] interval on the x axis
epsilon = 1.e-2

fig.update_xaxes(
    title='Output Probability',
    range=[0 - epsilon, 1 + epsilon],
    row=1, col=1,
)

fig.update_yaxes(
    title='Frequency',
    range=[0, y_max],
    row=1, col=1,
)

# Font and fixed figure size
fig.update_layout(
    font={
        'family': 'Courier New, monospace',
        'size': 20,
        'color': 'Gray',
    },
    autosize=False,
    width=900,
    height=600,
)

fig.show()

# Save a standalone HTML copy of the figure
fig.write_html('distr_{}.html'.format(file_name_suffix))

In [6]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Two-panel figure:
#   left  -- histograms of both classes, split at a threshold into the
#            four confusion-matrix regions
#   right -- Precision, Recall and their means plotted against df['prob']
fig = make_subplots(
    rows=1,
    cols=2,
    horizontal_spacing=0.15,
    subplot_titles=('', ''),
)

# Row of df at which the threshold line is drawn
idx = 200

# Both sides include row idx, so the filled areas meet at the threshold
at_or_below = slice(None, idx + 1)
at_or_above = slice(idx, None)

# (key, histogram column, rows, legend label, colour)
region_specs = [
    ('TN', '0', at_or_below, 'TN: True Negatives', 'firebrick'),
    ('FP', '0', at_or_above, 'FP: False Positives', 'green'),
    ('FN', '1', at_or_below, 'FN: False Negatives', 'darkorange'),
    ('TP', '1', at_or_above, 'TP: True Positives', 'cornflowerblue'),
]

data_dict = {
    key: {
        'x': df['prob'].iloc[rows],
        'y': df[column].iloc[rows],
        'name': label,
        'line': {'shape': 'hv', 'color': colour},
        'mode': 'lines',
        'fill': 'tozeroy',
    }
    for key, column, rows, label, colour in region_specs
}

# Leave 5% headroom above the tallest histogram bar
y_max = 1.05 * max(np.max(df[column]) for column in ('0', '1'))

# Vertical dotted line marking the threshold
threshold_prob = df['prob'].iloc[idx]

data_dict['threshold'] = {
    'x': [threshold_prob, threshold_prob],
    'y': [0., y_max],
    'name': 'Threshold',
    'line': {'color': 'grey', 'width': 3, 'dash': 'dot'},
    'mode': 'lines',
}

# Left panel traces, in the order TN, FP, FN, TP, threshold
for key in ('TN', 'FP', 'FN', 'TP', 'threshold'):
    fig.add_trace(data_dict[key], row=1, col=1)

# Small margin around the [0, 1] interval on the left x axis
epsilon = 1.e-2

fig.update_xaxes(
    title='Output Probability',
    range=[0 - epsilon, 1 + epsilon],
    row=1, col=1,
)

fig.update_yaxes(
    title='Frequency',
    range=[0, y_max],
    row=1, col=1,
)

# (df column, legend label, colour) of each curve in the right panel
metric_specs = [
    ('Precision', 'Precision', 'gray'),
    ('Recall', 'Recall', 'darkgray'),
    ('AM', 'Arithmetic Mean', 'navy'),
    ('GM', 'Geometric Mean', 'blue'),
    ('F-1', 'F-1 Score', 'magenta'),
]

for key, label, colour in metric_specs:
    data_dict[key] = {
        'x': df['prob'],
        'y': df[key],
        'name': label,
        'text': df['prob'],
        'line': {
            'shape': 'linear',
            'color': colour,
            'width': 1,
            'smoothing': 0.,
        },
        'marker': {'size': 3},
        'mode': 'lines+markers',
    }
    fig.add_trace(data_dict[key], row=1, col=2)

# Wider margin for the right panel, applied to both of its axes
epsilon = 5.e-2

fig.update_xaxes(
    title='Threshold',
    range=[0 - epsilon, 1 + epsilon],
    row=1, col=2,
)

fig.update_yaxes(
    title='',
    range=[0 - epsilon, 1 + epsilon],
    row=1, col=2,
)

# Font and fixed figure size
fig.update_layout(
    font={
        'family': 'Courier New, monospace',
        'size': 20,
        'color': 'Gray',
    },
    autosize=False,
    width=1200,
    height=600,
)

fig.show()

# Save a standalone HTML copy of the figure
fig.write_html('F1_{}.html'.format(file_name_suffix))