# Compare two different gather CSV files to find differences, if any

In [1]:
import pandas as pd
from itertools import zip_longest

In [2]:
ls *.csv

SRR606249.gather.csv
SRR606249.prefetch.csv
SRR606249.x.combined-matches.gather.picklist.csv
srr.fg.csv
srr.fmg.csv


In [3]:
# Load the two gather result files that are being compared.
gather1 = pd.read_csv(
    filepath_or_buffer='SRR606249.x.combined-matches.gather.picklist.csv',
)
gather2 = pd.read_csv(
    filepath_or_buffer='srr.fmg.csv',
)

## Compare column names, since the two files sometimes use different names for the same column


In [4]:
# Column names that appear in gather1 but not in gather2.
set(gather1.columns).difference(gather2.columns)

{'filename', 'md5', 'name', 'potential_false_negative'}

In [5]:
# Column names that appear in gather2 but not in gather1.
set(gather2.columns).difference(gather1.columns)

{'match_filename', 'match_md5', 'match_name'}

In [6]:
# Rename gather2's match_* columns to the names gather1 uses, so that the same
# column labels can be used to select data from both frames.
gather2 = gather2.rename(
    columns={
        'match_name': 'name',
        'match_md5': 'md5',
        'match_filename': 'filename',
    }
)

## Convert pandas dataframes into something simple to compare

In [7]:
# Reduce gather1 to plain [f_unique_to_query, name, remaining_bp] lists and
# sort them in descending order so the two files can be compared row by row.
# `.values.tolist()` yields the same per-row lists as looping over
# `iterrows()`, without constructing a pandas Series for every row.
gather1_rows = gather1[['f_unique_to_query', 'name', 'remaining_bp']].values.tolist()
gather1_rows.sort(reverse=True)
gather1_rows[:3]

[[0.0218961439346692,
  'GCF_000013645.1 Paraburkholderia xenovorans LB400 strain=LB400, ASM1364v1',
  415611000],
 [0.017151665627243,
  'GCF_000009705.1 Nostoc sp. PCC 7120 = FACHB-418 strain=PCC 7120, ASM970v1',
  408323000],
 [0.0165656660743913,
  'GCF_000196115.1 Rhodopirellula baltica SH 1 strain=1, ASM19611v1',
  401284000]]

In [8]:
# Reduce gather2 to plain [f_unique_to_query, name, remaining_bp] lists and
# sort them in descending order so the two files can be compared row by row.
# `.values.tolist()` yields the same per-row lists as looping over
# `iterrows()`, without constructing a pandas Series for every row.
gather2_rows = gather2[['f_unique_to_query', 'name', 'remaining_bp']].values.tolist()
gather2_rows.sort(reverse=True)
gather2_rows[:3]

[[0.0218961439346692,
  'GCF_000013645.1 Paraburkholderia xenovorans LB400 strain=LB400, ASM1364v1',
  415611000],
 [0.017151665627243,
  'GCF_000009705.1 Nostoc sp. PCC 7120 = FACHB-418 strain=PCC 7120, ASM970v1',
  408323000],
 [0.0165656660743913,
  'GCF_000196115.1 Rhodopirellula baltica SH 1 strain=1, ASM19611v1',
  401284000]]

## Compare!

In [9]:
# Walk the two sorted row lists in lockstep; each row is
# [f_unique_to_query, name, remaining_bp]. zip_longest pads the shorter list
# with _MISSING_ROW so that a difference in row count is reported instead of
# being silently dropped.
_MISSING_ROW = (None, None, None)
for x, y in zip_longest(gather1_rows, gather2_rows, fillvalue=_MISSING_ROW):
    if x is _MISSING_ROW or y is _MISSING_ROW:
        # One file ran out of rows. Stop here with a clear message: calling
        # round() on the None placeholder would raise an unrelated TypeError.
        print('***\t', x[1])
        print('\t', y[1])
        raise AssertionError(
            'row counts differ: gather1 has %d rows, gather2 has %d rows'
            % (len(gather1_rows), len(gather2_rows))
        )
    print (x[0], y[0], round(x[0], 3) == round(y[0], 3), x[1] == y[1])
    if x[1] != y[1]:
        print('***\t', x[1])
        print('\t', y[1])
        # Raise explicitly rather than `assert 0` so the failure carries a
        # message and still fires when Python runs with assertions disabled.
        raise AssertionError('match names differ: %r != %r' % (x[1], y[1]))

0.0218961439346692 0.0218961439346692 True True
0.017151665627243 0.017151665627243 True True
0.0165656660743913 0.0165656660743913 True True
0.0155419319157949 0.0155419319157949 True True
0.0148876834190367 0.0148876834190367 True True
0.0127601990986432 0.0127601990986432 True True
0.0122071473118153 0.0122071473118153 True True
0.0121388983679088 0.0121388983679088 True True
0.01191297082946 0.01191297082946 True True
0.011882376475295 0.011882376475295 True True
0.0116423284656931 0.0116423284656931 True True
0.0111622324464892 0.0111622324464892 True True
0.0108939434945812 0.0108939434945812 True True
0.010658602308697 0.010658602308697 True True
0.0100255345186684 0.0100255345186684 True True
0.0098066672157961 0.0098066672157961 True True
0.0092536154289681 0.0092536154289681 True True
0.0090700493039784 0.0090700493039784 True True
0.0088888365908475 0.0088888365908475 True True
0.0086040737559276 0.0086040737559276 True True
0.0076415283056611 0.0076415283056611 True True
0.

# Do some additional checking: compare every shared column

In [10]:
def extract_column_to_list(df, name):
    """Return the values of column ``name`` of ``df`` as a plain Python list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from.
    name : str
        Label of the column to extract.

    Returns
    -------
    list
        One element per row, in row order; an empty list for an empty frame.

    Raises
    ------
    KeyError
        If ``name`` is not a column of ``df``.
    """
    # Series.tolist() converts the column directly; iterating the frame with
    # iterrows() would build a one-element Series per row to get the same list.
    return df[name].tolist()

# Spot-check the helper: show the first five 'moltype' values from gather1.
extract_column_to_list(gather1, 'moltype')[:5]

['DNA', 'DNA', 'DNA', 'DNA', 'DNA']

In [11]:
def round5(x):
    """Round ``x`` to 5 decimal places."""
    return round(x, 5)


# Per-column normalizers, applied to the values from both files before they
# are compared.
fix_columns = {
    'moltype': lambda x: x.upper(),   # compare case-insensitively
    'std_abund': round5,
    'f_unique_weighted': round5,
    'query_md5': lambda x: x[:8],     # compare only the first 8 characters
}

xx_same = []   # shared columns whose sorted values match in both files
xx_diff = []   # shared columns whose sorted values differ

# For each differing column: (sorted gather1 values, sorted gather2 values).
diff_values = {}

# Visit the shared columns in gather1's column order. Iterating the raw set
# intersection would visit them in string-hash order, which changes between
# kernel restarts and makes the report below non-reproducible.
gather2_columns = set(gather2.columns)
for name in gather1.columns:
    if name not in gather2_columns:
        continue
    col1 = extract_column_to_list(gather1, name)
    col2 = extract_column_to_list(gather2, name)
    if name in fix_columns:
        fn = fix_columns[name]
        col1 = [ fn(i) for i in col1 ]
        col2 = [ fn(i) for i in col2 ]
    # Sort both sides so the comparison ignores row order.
    col1.sort()
    col2.sort()
    same = col1 == col2
    if not same:
        xx_diff.append(name)
        diff_values[name] = (col1, col2)
    else:
        xx_same.append(name)

# Print one bullet per column; an empty list prints no bullet at all.
print("same:")
for name in xx_same:
    print("*", name)
print("diff:")
for name in xx_diff:
    print("*", name)

same:
* f_unique_weighted
* remaining_bp
* std_abund
* f_match_orig
* moltype
* md5
* total_weighted_hashes
* ksize
* query_name
* intersect_bp
* f_orig_query
* query_bp
* scaled
* n_unique_weighted_found
* name
* query_abundance
* query_filename
* median_abund
* unique_intersect_bp
* gather_result_rank
* f_unique_to_query
* f_match
* query_md5
* average_abund
diff:
* filename
* max_containment_ani
* match_containment_ani
* sum_weighted_found
* average_containment_ani
* query_n_hashes
* query_containment_ani


## display the differences

In [12]:
# For every column flagged as different, print the first five sorted values
# from gather1 and then from gather2.
for name in xx_diff:
    print('column:', name)
    for values in diff_values[name]:
        print('\t', values[:5])

column: filename
	 ['/Users/t/dev/2024-debug-gather-difference/combined-matches-k31.sig.zip', '/Users/t/dev/2024-debug-gather-difference/combined-matches-k31.sig.zip', '/Users/t/dev/2024-debug-gather-difference/combined-matches-k31.sig.zip', '/Users/t/dev/2024-debug-gather-difference/combined-matches-k31.sig.zip', '/Users/t/dev/2024-debug-gather-difference/combined-matches-k31.sig.zip']
	 ['signatures/0376e0e4e8f660a71512eec2bcbbc793.sig.gz', 'signatures/03c6e78f14ab8af588f748653db2d6fb.sig.gz', 'signatures/05abaf28975f70b7399c549052f64e01.sig.gz', 'signatures/08a7e8122383feb4f03413e87457084a.sig.gz', 'signatures/09035b7cd317402c14fc924318845177.sig.gz']
column: max_containment_ani
	 [0.8749386857759329, 0.8904077865681246, 0.8953922532404506, 0.904302420512725, 0.9056884895571736]
	 [0.8557860744055599, 0.858813070934693, 0.8721298182460694, 0.8735847304990553, 0.8856904039640281]
column: match_containment_ani
	 [0.8749386857759329, 0.8904077865681246, 0.8953922532404506, 0.9043024205

In [13]:
# Look more closely at 'sum_weighted_found', which the column comparison
# reported as different: print the sorted values from both files side by side.
# zip_longest (rather than zip) keeps going past the end of the shorter list,
# padding with None, so unmatched trailing values are shown instead of hidden.
x, y = diff_values['sum_weighted_found']
for i, j in zip_longest(x, y):
    print(i, j, i==j)

62498 62498 True
236385 236385 True
901311 901311 True
1008705 1008705 True
1118456 1118456 True
1193684 1193684 True
1254446 1254446 True
1309980 1309980 True
1386711 1386711 True
1514077 1514077 True
1582287 1582287 True
1668260 1668260 True
1950653 1950653 True
1993145 1993145 True
2052525 2052525 True
2283220 2283220 True
2357379 2357379 True
2681284 2681284 True
2881423 2881423 True
2973486 2973486 True
3035985 3035985 True
3083836 3083836 True
3167290 3167290 True
3257426 3257426 True
3348580 3348580 True
3474044 3474044 True
3551638 3551638 True
3788839 3788839 True
3854935 3854935 True
3932129 3932129 True
4123436 4123436 True
4215229 4215229 True
4236597 4236597 True
4322125 4322125 True
4389460 4389460 True
4424879 4424879 True
4565008 4565008 True
4612731 4612731 True
4691026 4691026 True
4772858 4772858 True
4848277 4848277 True
5029371 5029371 True
5048037 5048037 True
5106644 5106644 True
5169630 5169630 True
5273487 5273487 True
5380082 5380082 True
5492824 5492824 True
