# Compare two different gather CSV files to find differences, if any

In [14]:
import pandas as pd
from itertools import zip_longest

In [2]:
# List the CSV files in the working directory so the inputs loaded below can be
# checked by name. NOTE(review): bare `ls` relies on IPython's automagic/shell
# alias and is not portable outside an IPython kernel on a Unix-like system.
ls *.csv

SRR606249.prefetch.csv
SRR606249.x.combined-matches.gather.csv
srr.fg.csv
srr.fmg.csv


In [4]:
# Load both gather result tables in one pass; the tuple order fixes which file
# becomes gather1 and which becomes gather2.
gather1, gather2 = (
    pd.read_csv(csv_path)
    for csv_path in ('SRR606249.x.combined-matches.gather.csv', 'srr.fmg.csv')
)

## Compare column names, since the two files may label the same fields differently


In [10]:
# Column names present in gather1 but missing from gather2.
set(gather1.columns).difference(gather2.columns)

{'filename', 'md5', 'name', 'potential_false_negative'}

In [11]:
# Column names present in gather2 but missing from gather1.
set(gather2.columns).difference(gather1.columns)

{'match_filename', 'match_md5', 'match_name'}

## Convert the pandas DataFrames into sorted lists of rows that are simple to compare

In [12]:
# Flatten the three columns of interest from gather1 into plain Python lists,
# one per row, ordered from largest to smallest (lists compare element by
# element, so f_unique_to_query is the primary sort key).
gather1_rows = sorted(
    (list(row) for _, row in gather1[['f_unique_to_query', 'name', 'remaining_bp']].iterrows()),
    reverse=True,
)
# Preview the top three rows.
gather1_rows[:3]

[[0.0218961439346692,
  'GCF_000013645.1 Paraburkholderia xenovorans LB400 strain=LB400, ASM1364v1',
  194064000],
 [0.017151665627243,
  'GCF_000009705.1 Nostoc sp. PCC 7120 = FACHB-418 strain=PCC 7120, ASM970v1',
  186776000],
 [0.0165656660743913,
  'GCF_000196115.1 Rhodopirellula baltica SH 1 strain=1, ASM19611v1',
  179737000]]

In [9]:
# Same flattening for gather2, which stores the match name under 'match_name'
# rather than 'name'. Rows are ordered from largest to smallest.
gather2_rows = sorted(
    (list(row) for _, row in gather2[['f_unique_to_query', 'match_name', 'remaining_bp']].iterrows()),
    reverse=True,
)
# Preview the top three rows.
gather2_rows[:3]

[[0.0218961439346692,
  'GCF_000013645.1 Paraburkholderia xenovorans LB400 strain=LB400, ASM1364v1',
  415611000],
 [0.017151665627243,
  'GCF_000009705.1 Nostoc sp. PCC 7120 = FACHB-418 strain=PCC 7120, ASM970v1',
  408323000],
 [0.0165656660743913,
  'GCF_000196115.1 Rhodopirellula baltica SH 1 strain=1, ASM19611v1',
  401284000]]

## Compare!

In [15]:
# Walk both sorted row lists in lockstep. zip_longest pads the shorter list with
# a (None, None, None) placeholder so that unmatched trailing rows still show up.
for x, y in zip_longest(gather1_rows, gather2_rows, fillvalue=(None, None, None)):
    # round() raises TypeError on the None placeholder, so only compare the
    # fractions when both sides hold a real row; a padded row never matches.
    if x[0] is None or y[0] is None:
        same_fraction = False
    else:
        same_fraction = round(x[0], 3) == round(y[0], 3)
    # Columns: gather1 fraction, gather2 fraction, fractions equal to 3 decimal
    # places, names equal.
    print (x[0], y[0], same_fraction, x[1] == y[1])
    if x[1] != y[1]:
        # Show both names when they differ (gather1 first, flagged with ***).
        print('***\t', x[1])
        print('\t', y[1])

0.0218961439346692 0.0218961439346692 True True
0.017151665627243 0.017151665627243 True True
0.0165656660743913 0.0165656660743913 True True
0.0155419319157949 0.0155419319157949 True True
0.0148876834190367 0.0148876834190367 True True
0.0127601990986432 0.0127601990986432 True True
0.0122071473118153 0.0122071473118153 True True
0.0121388983679088 0.0121388983679088 True True
0.01191297082946 0.01191297082946 True True
0.011882376475295 0.011882376475295 True True
0.0116423284656931 0.0116423284656931 True False
***	 GCF_000012825.1 Bacteroides vulgatus ATCC 8482 strain=ATCC 8482, ASM1282v1
	 GCF_002959625.1 Bacteroides vulgatus strain=ATCC 8492, ASM295962v1
0.0111622324464892 0.0111622324464892 True True
0.0108939434945812 0.0108939434945812 True True
0.010658602308697 0.010658602308697 True True
0.0100255345186684 0.0100255345186684 True True
0.0098066672157961 0.0098066672157961 True True
0.0092536154289681 0.0092536154289681 True True
0.0090700493039784 0.0090700493039784 True T