In [1]:
__author__ = 'Denson Smith'

'''
Created on Fri Mar 31 14:35:34 2017

Function to compute the K-category correlation coefficient
 
@author: Denson Smith

'''
 

'\nCreated on Fri Mar 31 14:35:34 2017\n\nFunction to compute the K-category correlation coefficient\n \n@author: Denson Smith\n\n'

## Comparing two K-category assignments by a K-category correlation coefficient

### Abstract


Predicted assignments of biological sequences are often evaluated by Matthews correlation coefficient. However, Matthews correlation coefficient applies only to cases where the assignments belong to two categories, and cases with more than two categories are often artificially forced into two categories by considering what belongs and what does not belong to one of the categories, leading to the loss of information. Here, an extended correlation coefficient that applies to K-categories is proposed, and this measure is shown to be highly applicable for evaluating prediction of RNA secondary structure in cases where some predicted pairs go into the category “unknown” due to lack of reliability in predicted pairs or unpaired residues. Hence, predicting base pairs of RNA secondary structure can be a three-category problem. The measure is further shown to be well in agreement with existing performance measures used for ranking protein secondary structure predictions. 

The server and software are available at http://rk.kvl.dk/

The paper (J. Gorodkin, *Computational Biology and Chemistry* 28 (2004) 367–374) is available at http://www.sciencedirect.com/science/article/pii/S1476927104000799

Imports

In [2]:
import numpy as np
 
 
from sklearn.metrics import classification_report ,confusion_matrix

Define the function

In [3]:
def compute_RkCC(confusion_matrix):
    '''
    Compute the K-category correlation coefficient (R_K) of a confusion matrix.

    R_K generalises the Matthews correlation coefficient to K categories:
    http://www.sciencedirect.com/science/article/pii/S1476927104000799

    http://rk.kvl.dk/suite/04022321447260711221/

    With C the confusion matrix, N its grand total, r_k its row totals and
    c_k its column totals, the triple sums of the definition collapse to

        numerator   = N * trace(C) - sum_k r_k * c_k
        denominator = sqrt(N**2 - sum_k c_k**2) * sqrt(N**2 - sum_k r_k**2)
        R_K         = numerator / denominator

    The expression is symmetric in rows and columns, so the result does not
    depend on whether rows hold the true or the predicted categories.

    Parameters
    ----------
    confusion_matrix : array-like of shape (k, k)
        Square confusion matrix of counts. Anything accepted by
        ``np.asarray`` works (ndarray, nested lists, ...).

    Returns
    -------
    RkCC : float
        The correlation coefficient. When all samples fall into a single
        row or a single column the denominator is zero and the coefficient
        is undefined; 0.0 is returned in that case instead of ``nan``.

    Raises
    ------
    ValueError
        If ``confusion_matrix`` is not a square two-dimensional array.

    '''

    # float64 avoids silent integer overflow in the squared totals on
    # platforms where the default integer dtype is 32 bits wide.
    counts = np.asarray(confusion_matrix, dtype=np.float64)

    if counts.ndim != 2 or counts.shape[0] != counts.shape[1]:
        raise ValueError(
            'confusion_matrix must be a square 2-D array, got shape %s'
            % (counts.shape,))

    row_totals = counts.sum(axis=1)
    col_totals = counts.sum(axis=0)
    n_total = counts.sum()

    # sum_{k,l,m} (C_kk * C_ml - C_lk * C_km)
    RkCC_numerator = n_total * np.trace(counts) - np.dot(col_totals, row_totals)

    # sum_k (sum_l C_lk) * (sum_{f != k} sum_g C_gf)  ==  sum_k c_k * (N - c_k)
    RkCC_denominator_1 = n_total ** 2 - np.dot(col_totals, col_totals)

    # sum_k (sum_l C_kl) * (sum_{f != k} sum_g C_fg)  ==  sum_k r_k * (N - r_k)
    RkCC_denominator_2 = n_total ** 2 - np.dot(row_totals, row_totals)

    # Degenerate matrix (one category absorbs every sample, or no samples at
    # all): the coefficient is undefined, report "no correlation".
    if RkCC_denominator_1 <= 0 or RkCC_denominator_2 <= 0:
        return 0.0

    RkCC = RkCC_numerator / (np.sqrt(RkCC_denominator_1) * np.sqrt(RkCC_denominator_2))

    return float(RkCC)

In [4]:
print('Using scikit-learn')

y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]
target_names = ['class 0', 'class 1', 'class 2']

labels = [0, 1, 2]

# Store the result under its own name. Assigning it to `confusion_matrix`
# would overwrite the function imported from sklearn.metrics, and running
# this cell a second time would then fail with
# "TypeError: 'numpy.ndarray' object is not callable".
cm_sklearn = confusion_matrix(y_true, y_pred, labels=labels)

print(cm_sklearn)

print(classification_report(y_true, y_pred, target_names=target_names))

RkCC = compute_RkCC(cm_sklearn)

print('Rk correlation coefficient = %.4f' % RkCC)

print('')
print("Example from http://rk.kvl.dk/doc/dataformat.html:")
print("213 21 12 89 459 90 29 39 958")
# The nine numbers printed above, laid out row by row as a 3 x 3 matrix.
cm_example = np.array([[213, 21, 12], [89, 459, 90], [29, 39, 958]])

RkCC = compute_RkCC(cm_example)

print(cm_example)
print('Rk correlation coefficient = %.4f' % RkCC)


Using scikit-learn
[[1 0 0]
 [1 0 0]
 [0 1 2]]
             precision    recall  f1-score   support

    class 0       0.50      1.00      0.67         1
    class 1       0.00      0.00      0.00         1
    class 2       1.00      0.67      0.80         3

avg / total       0.70      0.60      0.61         5

Rk correlation coefficient = 0.4009

Example from http://rk.kvl.dk/doc/dataformat.html:
213 21 12 89 459 90 29 39 958
[[213  21  12]
 [ 89 459  90]
 [ 29  39 958]]
Rk correlation coefficient = 0.7550
