In [1]:
import pandas as pd
import numpy as np
import os
import re

In [2]:
# Load the reference serotype table; the unnamed leading column in the file is
# discarded and the remaining four columns are renamed positionally.
df = pd.read_csv('enterobase_serotype.csv').drop('Unnamed: 0', axis='columns')
count = len(df)
df.columns = ['genome_name', 'given_H', 'given_O', 'serotype_tag']

# Turn the antigen codes into labels by casting to int and prefixing 'H'/'O'.
# Only rows that have a code appear on the right-hand side, so index alignment
# on assignment leaves every other row as NaN.
df['given_H'] = 'H' + df.loc[df['given_H'].notnull(), 'given_H'].astype(int).astype(str)
df['given_O'] = 'O' + df.loc[df['given_O'].notnull(), 'given_O'].astype(int).astype(str)

# Flag blacklisted genomes: after the outer merge, every row that is not
# exclusive to the serotype table has a matching blacklist entry.
df2 = pd.read_csv('blacklist.csv')
df = df.merge(df2, on='genome_name', how='outer', indicator=True)
df['blacklisted'] = df['_merge'] != 'left_only'
df = df.drop('_merge', axis='columns')
print('%d rows removed from blacklisted genome' % (count - len(df)))
info_df = df

0 rows removed from blacklisted genome


In [3]:
def _percent(part, total):
    """Return ``part`` as a percentage of ``total``, or 0.0 when ``total`` is 0."""
    return part / total * 100 if total else 0.0


def _summarize_antigen(df, antigen, important_cols):
    """Print the statistics for one antigen and return its tallies.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to evaluate; must contain ``given_<antigen>`` and
        ``predicted_<antigen>`` plus every column in ``important_cols``.
    antigen : str
        Antigen letter used to build the column names and the printed labels
        (``'O'`` or ``'H'`` as called from ``summarize_result``).
    important_cols : list of str
        Columns kept in the returned frame of incorrect rows.

    Returns
    -------
    tuple
        ``(num_given, num_correct, incorrect_df)`` where ``incorrect_df`` holds
        the rows whose given and predicted values are both present but differ.
    """
    given = df['given_' + antigen]
    predicted = df['predicted_' + antigen]

    # Every percentage below is relative to the rows that have a given serotype.
    has_given = given.notnull()
    num_given = int(has_given.sum())
    print("number of given_%s serotypes is %d" % (antigen, num_given))

    unpredicted = has_given & predicted.isnull()
    num_unpredicted = int(unpredicted.sum())
    print("number of unpredicted_%s serotypes is %d or %.2f%%"
          % (antigen, num_unpredicted, _percent(num_unpredicted, num_given)))

    correct = has_given & ~unpredicted & (given == predicted)
    num_correct = int(correct.sum())
    print("number of correctly predicted_%s serotypes is %d or %.2f%%"
          % (antigen, num_correct, _percent(num_correct, num_given)))

    incorrect = has_given & ~unpredicted & ~correct
    incorrect_df = df[incorrect][important_cols]
    num_incorrect = incorrect_df.shape[0]
    print("number of incorrectly predicted_%s serotypes is %d or %.2f%%"
          % (antigen, num_incorrect, _percent(num_incorrect, num_given)))

    return num_given, num_correct, incorrect_df


def summarize_result(result_df):
    """Print a concordance report for serotype predictions and return the row groups.

    Rows flagged as blacklisted are excluded from every category and statistic;
    only the first printed line ("all predictions") counts them. A missing
    ``blacklisted`` flag (NaN) is treated as "not blacklisted". Percentages
    whose denominator is zero are reported as 0.00 instead of raising
    ``ZeroDivisionError``.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Must contain ``given_O``, ``predicted_O``, ``given_H``, ``predicted_H``,
        ``blacklisted`` and the remaining columns listed in ``important_cols``.

    Returns
    -------
    tuple
        ``(no_df, incorrect_df, correct_df, semicorrect_df, incorrect_o_df,
        incorrect_h_df, result_df)``. The first six are subsets of the
        non-blacklisted rows restricted to ``important_cols``; the last element
        is the input frame itself, unmodified.
    """
    important_cols = ['genome_name', 'given_O', 'predicted_O', 'O_info', 'given_H', 'predicted_H', 'H_info', 'wzx', 'wzy', 'wzm', 'wzt', 'fliC', 'fllA', 'flkA', 'flmA', 'flnA', 'serotype_tag']

    # `.eq(True)` yields a clean boolean mask even when the column is object
    # dtype with NaN (rows that found no partner in a left merge), where a bare
    # `~column` would raise a TypeError.
    blacklisted = result_df['blacklisted'].eq(True)
    df = result_df[~blacklisted]
    print('all predictions(%d):' % result_df.shape[0])

    o_given_missing = df['given_O'].isnull()
    h_given_missing = df['given_H'].isnull()
    o_match = df['given_O'] == df['predicted_O']
    h_match = df['given_H'] == df['predicted_H']

    # No prediction: both predicted antigens are missing.
    s1 = df['predicted_O'].isnull() & df['predicted_H'].isnull()
    no_df = df[s1][important_cols]
    print('%d no predictions' % no_df.shape[0])

    # Incorrect: not in no_df, and at least one antigen has both a given and a
    # predicted value that disagree.
    s2 = (
        (~o_given_missing & df['predicted_O'].notnull() & ~o_match) |
        (~h_given_missing & df['predicted_H'].notnull() & ~h_match)
    )
    incorrect_df = df[~s1 & s2][important_cols]
    print('%d incorrect predictions' % incorrect_df.shape[0])

    # Correct: not in the groups above, and every given serotype is matched by
    # its prediction (an antigen with no given value does not count against it).
    s3 = (o_given_missing | o_match) & (h_given_missing | h_match)
    correct_df = df[~s1 & ~s2 & s3][important_cols]
    print('%d correct predictions' % correct_df.shape[0])

    # Semicorrect: not in the groups above; one antigen is matched (or has no
    # given value) while the other has no prediction.
    s4 = (o_given_missing | o_match) | (h_given_missing | h_match)
    semicorrect_df = df[~s1 & ~s2 & ~s3 & s4][important_cols]
    print('%d semicorrect predictions' % semicorrect_df.shape[0])

    # Whatever is left over; only the count is reported.
    remaining_df = df[~s1 & ~s2 & ~s3 & ~s4]
    print('%d remaining predictions' % remaining_df.shape[0])

    # Per-antigen statistics, then totals across both antigens.
    given_o, correct_o, incorrect_o_df = _summarize_antigen(df, 'O', important_cols)
    given_h, correct_h, incorrect_h_df = _summarize_antigen(df, 'H', important_cols)

    given_count = given_o + given_h
    correct_count = correct_o + correct_h
    incorrect_count = incorrect_o_df.shape[0] + incorrect_h_df.shape[0]
    print("Overall concordance=%.2f%%(%d/%d)" % (_percent(correct_count, given_count), correct_count, given_count))
    print("Overall discrepancies=%.2f%%(%d/%d)" % (_percent(incorrect_count, given_count), incorrect_count, given_count))

    return no_df, incorrect_df, correct_df, semicorrect_df, incorrect_o_df, incorrect_h_df, result_df

In [4]:
# Load the prediction output and rename its columns positionally.
df = pd.read_csv('output/2017-11-09_14.20.30.622082/output.csv')
df.columns = (
    ['genome_name', 'predicted_O', 'O_info', 'predicted_H', 'H_info']
    + ['wzx', 'wzy', 'wzm', 'wzt']
    + ['fliC', 'fllA', 'flkA', 'flmA', 'flnA']
)
# Replace the '-' placeholder in the predicted antigen columns with NaN so
# that isnull()/notnull() checks treat those predictions as missing.
df['predicted_O'] = df['predicted_O'].where(df['predicted_O'] != '-')
df['predicted_H'] = df['predicted_H'].where(df['predicted_H'] != '-')
result_df = df

In [5]:
# Attach the reference serotype information to every prediction row (left
# merge keeps all predictions) and put given/predicted columns side by side.
merge_df = result_df.merge(info_df, on='genome_name', how='left')[[
    'genome_name',
    'given_O', 'predicted_O', 'O_info',
    'given_H', 'predicted_H', 'H_info',
    'wzx', 'wzy', 'wzm', 'wzt',
    'fliC', 'fllA', 'flkA', 'flmA', 'flnA',
    'serotype_tag', 'blacklisted',
]]
df = merge_df
display(merge_df)

Unnamed: 0,genome_name,given_O,predicted_O,O_info,given_H,predicted_H,H_info,wzx,wzy,wzm,wzt,fliC,fllA,flkA,flmA,flnA,serotype_tag,blacklisted
0,ESC_AA7875AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
1,ESC_AA7899AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
2,ESC_AA7929AA_AS,O157,O26,Alignment found,,H11,Alignment found,True,True,-,-,True,-,-,-,-,O157,True
3,ESC_AA7930AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
4,ESC_AA7942AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
5,ESC_AA7966AA_AS,O113,O113,Alignment found,H21,H21,Alignment found,True,True,-,-,True,-,-,-,-,O113:H21,False
6,ESC_AA7970AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
7,ESC_AA7978AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
8,ESC_AA7989AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
9,ESC_AA8002AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False


In [6]:
# Print the concordance report for the merged table and keep the returned
# tuple: (no_df, incorrect_df, correct_df, semicorrect_df, incorrect_o_df,
# incorrect_h_df, result_df).
results = summarize_result(merge_df)

all predictions(5826):
39 no predictions
280 incorrect predictions
4278 correct predictions
744 semicorrect predictions
0 remaining predictions
number of given_O serotypes is 5209
number of unpredicted_O serotypes is 821 or 15.76%
number of correctly predicted_O serotypes is 4227 or 81.15%
number of incorrectly predicted_O serotypes is 161 or 3.09%
number of given_H serotypes is 2566
number of unpredicted_H serotypes is 40 or 1.56%
number of correctly predicted_H serotypes is 2384 or 92.91%
number of incorrectly predicted_H serotypes is 142 or 5.53%
Overall concordance=85.03%(6611/7775)
Overall discrepancies=3.90%(303/7775)


In [7]:
# Save the last element of `results` (the frame that was passed to
# summarize_result, blacklisted rows included). The row index is written too,
# as a leading unnamed column.
results[-1].to_csv('enterobase_result.csv')

In [8]:
# Display the last element of `results`, i.e. the frame that was passed to
# summarize_result.
results[-1]

Unnamed: 0,genome_name,given_O,predicted_O,O_info,given_H,predicted_H,H_info,wzx,wzy,wzm,wzt,fliC,fllA,flkA,flmA,flnA,serotype_tag,blacklisted
0,ESC_AA7875AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
1,ESC_AA7899AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
2,ESC_AA7929AA_AS,O157,O26,Alignment found,,H11,Alignment found,True,True,-,-,True,-,-,-,-,O157,True
3,ESC_AA7930AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
4,ESC_AA7942AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
5,ESC_AA7966AA_AS,O113,O113,Alignment found,H21,H21,Alignment found,True,True,-,-,True,-,-,-,-,O113:H21,False
6,ESC_AA7970AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
7,ESC_AA7978AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
8,ESC_AA7989AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
9,ESC_AA8002AA_AS,O157,O157,Alignment found,,H7,Alignment found,True,True,-,-,True,-,-,-,-,O157,False
