# DNAnexus Login

In [None]:
!pip install dxpy

import dxpy as dx

import os
from getpass import getpass

# Never paste the API token into this cell: it would be saved with the notebook.
# The token is read from the DX_AUTH_TOKEN environment variable (a name chosen for
# this notebook); if that is unset or empty, the user is prompted for it without echo.
DX_SECURITY_CONTEXT = {
    "auth_token_type": "Bearer",
    "auth_token": os.environ.get("DX_AUTH_TOKEN") or getpass("DNAnexus API token: "),
}

dx.set_security_context(DX_SECURITY_CONTEXT)

# Displayed as the cell output so a failed login is visible immediately.
dx.whoami()

# Comparison

In [None]:
# Read the full contents of the old and the new gene-to-transcript (g2t) files
# from DNAnexus, in that order.
old_genetranscript_dxfile, new_genetranscript_dxfile = (
    dx.DXFile(file_id).read()
    for file_id in (
        'file-GV4P970433Gj6812zGVBZvB4',  # old g2t, tagged 230421 (presumably a date - confirm)
        'file-GgBG6p8433Gg7XK69fG7P9j7',  # new g2t
    )
)

In [None]:
import pandas as pd

G2T_COLUMNS = ['hgnc', 'transcript', 'clinical', 'canonical']


def _g2t_to_frame(raw_text):
  """Turn the raw tab-separated g2t text into a four-column DataFrame.

  The text is split on newlines and each line on tabs. Rows left with a
  missing value in any column (for example the empty line after a trailing
  newline) are dropped.
  """
  records = [line.split('\t') for line in raw_text.split('\n')]
  return pd.DataFrame(records, columns=G2T_COLUMNS).dropna()


old_gt = _g2t_to_frame(old_genetranscript_dxfile)
new_gt = _g2t_to_frame(new_genetranscript_dxfile)

In [None]:
import collections

def _index_transcripts(g2t_frame):
  """Build the two per-gene lookups used by the comparison cells.

  Returns a pair of defaultdict(list):
    * HGNC id -> every transcript listed for that gene
    * HGNC id -> transcripts whose 'clinical' field is 'clinical_transcript'

  Values are whitespace-stripped. Lists are used for the clinical lookup as
  well, so a gene flagged with more than one clinical transcript keeps them all.
  """
  transcripts_by_hgnc = collections.defaultdict(list)
  clinical_by_hgnc = collections.defaultdict(list)

  columns = (g2t_frame['hgnc'], g2t_frame['transcript'], g2t_frame['clinical'])
  for raw_hgnc, raw_tx, raw_flag in zip(*columns):
    gene_id = raw_hgnc.strip()
    transcript_id = raw_tx.strip()

    transcripts_by_hgnc[gene_id].append(transcript_id)
    if raw_flag.strip() == 'clinical_transcript':
      clinical_by_hgnc[gene_id].append(transcript_id)

  return transcripts_by_hgnc, clinical_by_hgnc


old_hgnc_to_transcripts, old_hgnc_tx_to_clinical = _index_transcripts(old_gt)
new_hgnc_to_transcripts, new_hgnc_tx_to_clinical = _index_transcripts(new_gt)

In [None]:
# old and new g2t comparison: which HGNC ids were dropped from, or added to, the new file

missing_in_new = set(old_hgnc_to_transcripts).difference(new_hgnc_to_transcripts)
added_in_new = set(new_hgnc_to_transcripts).difference(old_hgnc_to_transcripts)

# HGNC ids present in the old file only
if not missing_in_new:
  print('No HGNCs in old genepanel that is removed in new genepanel')
else:
  print(f'Missing HGNCs in new genepanel: {missing_in_new}')
  print(f'Number of HGNC missing in new genepanel that were in old genepanel: {len(missing_in_new)}')

# HGNC ids present in the new file only
if not added_in_new:
  print('No new added HGNC in new genepanel compared to old genepanel')
else:
  print(f'Added HGNCs in new genepanel: {added_in_new}')
  print(f'Number of new HGNC added in new genepanel that are not in old genepanel: {len(added_in_new)}')

No HGNCs in old genepanel that is removed in new genepanel
No new added HGNC in new genepanel compared to old genepanel


In [None]:
# New genepanel file, read in full from DNAnexus.
# Make sure this genepanel version matches the g2t version loaded above.

new_genepanel_handle = dx.DXFile('file-GgBG75Q433Gk4pY5qpxbgVyz')
new_genepanel = new_genepanel_handle.read()

In [None]:
# Parse the tab-separated genepanel text; rows left with a missing value in any
# column (e.g. short or empty lines) are dropped before the gene list is taken.
GENEPANEL_COLUMNS = ['Gemini Name', 'Panel', 'Gene', "Panel ID"]

genepanel_rows = [line.split('\t') for line in new_genepanel.split('\n')]
new_genepanel_df = pd.DataFrame(genepanel_rows, columns=GENEPANEL_COLUMNS).dropna()
new_genepanel_genes = new_genepanel_df['Gene'].tolist()

In [None]:
# genes in genepanel but not in g2t
diff = set(new_genepanel_genes).difference(new_hgnc_to_transcripts)

if not diff:
  print('All genes in genepanels are present in g2t')
else:
  print(f'{diff} genes are in new genepanels but not in new g2t')
  print(f'{len(diff)} genes in new genepanels but not in new g2t')

All genes in genepanels are present in g2t


In [None]:
# existing gene comparison
#
# For every HGNC id in the old g2t, describe how its transcript list and its
# clinical transcript(s) differ in the new g2t. Every entry of `note` has the
# same seven fields, in this order:
#   hgnc, comment, clinical comment, new tx, old tx, old clinical, new clinical

note = []

for hgnc, transcripts in old_hgnc_to_transcripts.items():
  if hgnc not in new_hgnc_to_transcripts:
    # Pad with None so this row is as wide as the regular rows below, instead
    # of relying on DataFrame construction to pad a ragged three-field row.
    note.append([hgnc, 'this gene does not exist in new g2t.', None, None, None, None, None])
    continue

  # compare between tx
  new_transcripts = new_hgnc_to_transcripts[hgnc]
  old_tx_set = set(transcripts)
  new_tx_set = set(new_transcripts)

  if old_tx_set == new_tx_set:
    comment = "tx remain the same. "
  else:
    comment = "tx have changed! "

    added_tx = new_tx_set - old_tx_set
    if added_tx:
      comment += f"{added_tx} are newly added. "

    removed_tx = old_tx_set - new_tx_set
    if removed_tx:
      comment += f"{removed_tx} are removed. "

  # compare between clinical tx
  # .get() is used instead of indexing so the defaultdicts do not gain empty
  # entries; False stands for "no clinical transcript recorded for this gene".
  old_clinical = old_hgnc_tx_to_clinical.get(hgnc, False)
  new_clinical = new_hgnc_tx_to_clinical.get(hgnc, False)

  if not old_clinical and not new_clinical:
    clinical_comment = "both old and new have no clinical transcript assigned. "
  elif not new_clinical:
    clinical_comment = f"old clinical is {old_clinical} but new clinical is not assigned. "
  elif not old_clinical:
    clinical_comment = f"old clinical is not assigned but new clinical is {new_clinical}. "
  elif set(old_clinical) == set(new_clinical):
    # both sides have clinical transcripts; order within the lists is ignored
    clinical_comment = "old and new clinical tx remain the same. "
  else:
    clinical_comment = f"clinical tx changed from {old_clinical} to {new_clinical}"

  note.append([hgnc, comment, clinical_comment, new_transcripts, transcripts, old_clinical, new_clinical])

In [None]:
# Column order mirrors the field order of each entry collected in `note`.
comparison_columns = [
    'hgnc',
    'comment',
    'clinical comment',
    'new tx',
    'old tx',
    'old clinical',
    'new clinical',
]

comparison_df = pd.DataFrame(note, columns=comparison_columns)
comparison_df.to_csv('g2t_comparison.csv', index=False)  # save the df as csv