In [2]:
%matplotlib inline
from Bio import SeqIO    
import numpy as np
import pandas as pd    
import matplotlib.pyplot as plt
import os
import sys


In [3]:
def read_parsed_igblast_file(filename):
    '''Takes in a parsed IgBlastn file and returns a pandas DataFrame.

    Every non-blank line of the file is passed to parse_line(), which returns
    a row label and a 5-element list of values. The rows are collected first
    and the DataFrame is built once at the end, instead of enlarging the frame
    line by line.

    Parameters:
        filename - the name of the .txt file returned from the parse_igblast.py code.

    Returns:
        A DataFrame indexed by the row label with the columns
        V_gene, J_gene, CDR3_seq, CD3_AA and CDR3_len (CDR3_len holds the
        integer lengths produced by parse_line). If a label occurs more than
        once, the last line wins and keeps the position of the first
        occurrence. An empty file yields an empty frame with the same columns.
    '''

    columns = ['V_gene', 'J_gene', 'CDR3_seq', 'CD3_AA', 'CDR3_len']

    # dict keeps first-insertion order and overwrites on a repeated label,
    # which mirrors what repeated `frame.loc[label] = row` did.
    rows_by_label = {}

    with open(filename, 'r') as f:
        for line in f:
            # A blank (e.g. trailing) line has no fields for parse_line to index.
            if not line.strip():
                continue
            label, row = parse_line(line)
            rows_by_label[label] = row

    antibodies = pd.DataFrame(list(rows_by_label.values()),
                              index=list(rows_by_label.keys()),
                              columns=columns)

    return antibodies
            

In [4]:
def parse_line(line):
    '''Turns one line of the parsed IgBlastn file into a row label and a row of values.

    The line is split on whitespace. Field 0 becomes the label; the returned
    row is [field 2, field 4, CDR3 nucleotides, CDR3 amino acids, CDR3 length],
    where the CDR3 nucleotides are field 9 with a literal 'C' appended, the
    amino acids are translate() of that string, and the length is the number
    of amino acids.

    NOTE(review): presumably the 'C' restores a base missing from field 9 in
    the parsed file - confirm against parse_igblast.py.

    Parameters:
        line - the line being read from the txt file containing parsed antibody information.

    Raises:
        IndexError if the line has fewer than 10 whitespace-separated fields.
    '''

    fields = line.split()

    cdr3_nt = fields[9] + 'C'
    cdr3_aa = translate(cdr3_nt)

    return fields[0], [fields[2], fields[4], cdr3_nt, cdr3_aa, len(cdr3_aa)]

In [5]:
# Codon -> one-letter amino acid; stop codons map to '_'.
# Built once at definition time instead of on every translate() call.
_CODON_TABLE = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}


def translate(seq, unknown=''):
    '''Translates a nucleotide string to an AA string.

    The sequence is read in frame from position 0, three bases at a time.
    Stop codons are written as '_'. Input case is ignored. A trailing
    fragment shorter than three bases is dropped.

    Parameters:
        seq - the nucleotide string to be translated.
        unknown - text emitted for a complete codon that is not in the codon
            table (for example one containing 'N'). The default '' skips such
            codons, so the result can be shorter than len(seq) // 3; pass
            e.g. 'X' to keep one character per codon.

    Returns:
        The amino-acid string ('' for an empty sequence).
    '''

    seq = seq.upper()

    residues = []
    # Stop at len(seq) - 2 so only complete codons are looked up.
    for start in range(0, len(seq) - 2, 3):
        residues.append(_CODON_TABLE.get(seq[start:start + 3], unknown))
    return ''.join(residues)

In [44]:
# Load the parsed IgBlast output in 'parsed_igblast.txt' (path is relative to
# the notebook's working directory) into a DataFrame.
# NOTE(review): another cell reads this same path into SRR2150329; presumably
# the file was regenerated between runs - confirm, or use one file per run.
SRR2150229_new = read_parsed_igblast_file('parsed_igblast.txt')

In [45]:
# Display the table returned by read_parsed_igblast_file for a visual check.
SRR2150229_new

Unnamed: 0,V_gene,J_gene,CDR3_seq,CD3_AA,CDR3_len
SRR2150229.1,IGHV3-33*01,IGHJ6*03,GCGAGAGGGGGAGGTTATGAGGCATACTACTACTACTACTACATGG...,ARGGGYEAYYYYYYMDV,17.0
SRR2150229.2,IGHV1-69*01,IGHJ4*02,GCGAGAGACTGTAGTAGTACCGATTGCGCCAGAGAC,ARDCSSTDCARD,12.0
SRR2150229.3,IGHV2-5*02,IGHJ4*02,GCACACAGACGTGGGCGGTACGGTGACTACGCTGGCTTTGACTAC,AHRRGRYGDYAGFDY,15.0
SRR2150229.4,IGHV4-31*03,IGHJ1*01,GCGAGAGCGATTCGGGATAGCAGCAGCTGGTACGGCCGATACTTCC...,ARAIRDSSSWYGRYFQH,17.0
SRR2150229.5,IGHV3-23*01,IGHJ4*02,GCAGGGGGCCCCGACTACGGTGTCGACTACTTTGACTTC,AGGPDYGVDYFDF,13.0
SRR2150229.6,IGHV5-51*01,IGHJ5*02,GCGAGAAGGGGTATAACAGGGGACTGGTTCGACCCC,ARRGITGDWFDP,12.0
SRR2150229.7,IGHV1-18*01,IGHJ6*03,GCGAGAGCAATCGGATATTGTAGTGATACCGCCTGCCATTCCTACT...,ARAIGYCSDTACHSYYYYYMDV,22.0
SRR2150229.8,IGHV4-34*01,IGHJ4*02,GCGAGAGCCCGTAGCGGAGACGGTGGTAACTCCGGGTACTACTTTG...,ARARSGDGGNSGYYFDY,17.0
SRR2150229.9,IGHV1-69*13,IGHJ6*03,GCGAGAGAGGGGGAAAAAATTGGAGCTATTATTCCGACCACTACTA...,AREGEKIGAIIPTTTTTTWT,20.0
SRR2150229.10,IGHV1-8*01,IGHJ6*02,GCGAGAGGAGTTGGAGGTAGTGGTTATTACATATACTACTACTACG...,ARGVGGSGYYIYYYYGMDV,19.0


In [46]:
# Count rows whose translated CDR3 (4th column by position, 'CD3_AA') contains
# '_' (the stop-codon marker written by translate) or has at most 5 residues.
# One pass over the column replaces the per-row `iloc[i][3]` lookups.
count = sum(
    1
    for cdr3_aa in SRR2150229_new.iloc[:, 3]
    if '_' in cdr3_aa or len(cdr3_aa) <= 5
)

print (count)
print (len(SRR2150229_new.index))

600
6386


In [14]:
# Load the parsed IgBlast output in 'parsed_igblast.txt' into a DataFrame.
# NOTE(review): this is the same path that is read into SRR2150229_new in
# another cell; presumably the file was overwritten with a different run's
# output in between - confirm, or give each run its own file name.
SRR2150329 = read_parsed_igblast_file('parsed_igblast.txt')


In [9]:
# Display the SRR2150126 table.
# NOTE(review): SRR2150126 is not assigned in any cell visible here; this
# relies on kernel state from a cell that is missing or was run earlier -
# confirm the notebook survives Restart & Run All.
SRR2150126

Unnamed: 0,V_gene,J_gene,CDR3_seq,CD3_AA,CDR3_len
SRR2150126.1,IGHV2-5*02,IGHJ4*02,GCACACAACCCCCATAGTGGACTCCTTGACTAC,AHNPHSGLLDY,11.0
SRR2150126.2,IGHV2-5*02,IGHJ4*02,GCACACAACCCCCATAGTGGACTCCTTGACTAC,AHNPHSGLLDY,11.0
SRR2150126.3,IGHV4-31*03,IGHJ3*02,GCGAGAGAGAACCCCCCGGGTATAGCAGTGGCTGGTACCCGTGCTT...,ARENPPGIAVAGTRAFDI,18.0
SRR2150126.4,IGHV3-30*18,IGHJ6*02,GCGAAAGATCTCTTTTCGGGAGTGGGACCCAAAAACGGCTACTACT...,AKDLFSGVGPKNGYYYYGMDV,21.0
SRR2150126.5,IGHV4-31*03,IGHJ3*02,GCGAGAGAGAACCCCCCGGGTATAGCAGTGGCTGGTACCCGTGCTT...,ARENPPGIAVAGTRAFDI,18.0
SRR2150126.6,IGHV4-59*01,IGHJ6*03,GCGAGAGTGAGTGGCTACGATTTTTGGAGTGGTTATGCACCCAACT...,ARVSGYDFWSGYAPNYYYYYMDV,23.0
SRR2150126.7,IGHV4-39*01,IGHJ3*02,GCGAGATTTGTGGTGGTGAAGGCTTTTTGATACC,ARFVVVKAF_Y,11.0
SRR2150126.8,IGHV1-18*01,IGHJ5*02,GCGAGGGGGGTACAACTGGAACGACATTGGGTTGGTTGGTTCGACCCC,ARGVQLERHWVGWFDP,16.0
SRR2150126.9,IGHV3-11*01,IGHJ5*02,GCGAGAGAGAGGACTACCGAGTTCGACCCC,ARERTTEFDP,10.0
SRR2150126.10,IGHV4-34*01,IGHJ4*02,GCGATCACCTTGGCCTATTGTAGTAGTTCCAGTTGCGTTGACTCC,AITLAYCSSSSCVDS,15.0


In [13]:
# Count rows whose translated CDR3 (4th column by position, 'CD3_AA') contains
# '_' (the stop-codon marker written by translate).
# One pass over the column replaces the per-row `iloc[i][3]` lookups.
count = sum(1 for cdr3_aa in SRR2150126.iloc[:, 3] if '_' in cdr3_aa)

print (count)
print (len(SRR2150126.index))

244
2284


In [11]:
# Display the SRR2150229 table.
# NOTE(review): only SRR2150229_new is assigned in the visible cells;
# SRR2150229 presumably comes from an earlier run of the loader - confirm.
SRR2150229

Unnamed: 0,V_gene,J_gene,CDR3_seq,CD3_AA,CDR3_len
SRR2150229.1,IGHV3-33*01,IGHJ6*03,GCGAGAGGGGGAGGTTATGAGGCATACTACTACTACTACTACATGG...,ARGGGYEAYYYYYYMDV,17.0
SRR2150229.2,IGHV1-69*01,IGHJ4*02,GCGAGAGACTGTAGTAGTACCGATTGCGCCAGAGAC,ARDCSSTDCARD,12.0
SRR2150229.3,IGHV2-5*02,IGHJ4*02,GCACACAGACGTGGGCGGTACGGTGACTACGCTGGCTTTGACTAC,AHRRGRYGDYAGFDY,15.0
SRR2150229.4,IGHV4-31*03,IGHJ1*01,GCGAGAGCGATTCGGGATAGCAGCAGCTGGTACGGCCGATACTTCC...,ARAIRDSSSWYGRYFQH,17.0
SRR2150229.5,IGHV3-23*01,IGHJ4*02,GCAGGGGGCCCCGACTACGGTGTCGACTACTTTGACTTC,AGGPDYGVDYFDF,13.0
SRR2150229.6,IGHV5-51*01,IGHJ5*02,GCGAGAAGGGGTATAACAGGGGACTGGTTCGACCCC,ARRGITGDWFDP,12.0
SRR2150229.7,IGHV1-18*01,IGHJ6*03,GCGAGAGCAATCGGATATTGTAGTGATACCGCCTGCCATTCCTACT...,ARAIGYCSDTACHSYYYYYMDV,22.0
SRR2150229.8,IGHV4-34*01,IGHJ4*02,GCGAGAGCCCGTAGCGGAGACGGTGGTAACTCCGGGTACTACTTTG...,ARARSGDGGNSGYYFDY,17.0
SRR2150229.9,IGHV1-69*13,IGHJ6*03,GCGAGAGAGGGGGAAAAAATTGGAGCTATTATTCCGACCACTACTA...,AREGEKIGAIIPTTTTTTWT,20.0
SRR2150229.10,IGHV1-8*01,IGHJ6*02,GCGAGAGGAGTTGGAGGTAGTGGTTATTACATATACTACTACTACG...,ARGVGGSGYYIYYYYGMDV,19.0


In [12]:
# Count rows whose translated CDR3 (4th column by position, 'CD3_AA') contains
# '_' (the stop-codon marker written by translate).
# One pass over the column replaces the per-row `iloc[i][3]` lookups.
count = sum(1 for cdr3_aa in SRR2150229.iloc[:, 3] if '_' in cdr3_aa)

print (count)
print (len(SRR2150229.index))

1413
5844


In [15]:
# Display the SRR2150329 table returned by read_parsed_igblast_file.
SRR2150329

Unnamed: 0,V_gene,J_gene,CDR3_seq,CD3_AA,CDR3_len
SRR2150329.1,IGHV3-23*01,IGHJ4*01,GCGAAAGATCCTCCGTTGTGGCGAGGAGATCCCTACTACTTTGACTTC,AKDPPLWRGDPYYFDF,16.0
SRR2150329.2,IGHV4-39*01,IGHJ4*02,GCGAGACATCCCTCCTTCTCTAACTACGGCCAC,ARHPSFSNYGH,11.0
SRR2150329.3,IGHV3-23*01,IGHJ5*02,GCGAAAGATCTCATCGGATGGTTCGACCCC,AKDLIGWFDP,10.0
SRR2150329.4,IGHV2-26*01,IGHJ3*02,GCACGGATAGAGCAGCAGCTGGTACCTGATGCTTTTGATATC,ARIEQQLVPDAFDI,14.0
SRR2150329.5,IGHV4-34*01,IGHJ6*02,GCGAGAGTTCGACCTTTGGGGAAGTACCAGCTGCTAGAGTACTACT...,ARVRPLGKYQLLEYYYYGMDV,21.0
SRR2150329.6,IGHV3-33*01,IGHJ4*02,GCGAGAGTGGGGTATATACCCTGTGGCTGGTACTAAAGGACTAC,ARVGYIPCGWY_RT,14.0
SRR2150329.7,IGHV4-59*01,IGHJ4*02,GCGAGAGGGGACATTGTTTTGGAGTGGTTATCCGCTTACTTTGACTAC,ARGDIVLEWLSAYFDY,16.0
SRR2150329.8,IGHV4-59*01,IGHJ4*02,GCGAGAGGGGACATTGTTTTGGAGTGGTTATCCGCTTACTTTGACTAC,ARGDIVLEWLSAYFDY,16.0
SRR2150329.9,IGHV3-74*01,IGHJ5*02,GCAAGAGATCGATGGCAAGGTGTAGGCTGGTTCGACCCC,ARDRWQGVGWFDP,13.0
SRR2150329.10,IGHV3-23*01,IGHJ3*02,GCGAAAGGAATCCAGCAGCTGGCTGCTGGGGCTTTTGATATC,AKGIQQLAAGAFDI,14.0


In [16]:
# Count rows whose translated CDR3 (4th column by position, 'CD3_AA') contains
# '_' (the stop-codon marker written by translate).
# One pass over the column replaces the per-row `iloc[i][3]` lookups.
count = sum(1 for cdr3_aa in SRR2150329.iloc[:, 3] if '_' in cdr3_aa)

print (count)
print (len(SRR2150329.index))

1274
5605


In [19]:
# Stack the three per-run tables into one frame, keeping each run's row labels.
# pd.concat replaces the chained DataFrame.append calls (append was removed in
# pandas 2.0) and copies the data once instead of once per append.
patient_148 = pd.concat([SRR2150126, SRR2150229, SRR2150329])


In [25]:
# Count rows whose translated CDR3 (4th column by position, 'CD3_AA') contains
# '_' (the stop-codon marker written by translate) or has at most 5 residues.
# One pass over the column replaces the per-row `iloc[i][3]` lookups.
count = sum(
    1
    for cdr3_aa in patient_148.iloc[:, 3]
    if '_' in cdr3_aa or len(cdr3_aa) <= 5
)

print (count)
print (len(patient_148.index))

3294
13733


In [20]:
# Display the combined table built from the three per-run frames.
patient_148

Unnamed: 0,V_gene,J_gene,CDR3_seq,CD3_AA,CDR3_len
SRR2150126.1,IGHV2-5*02,IGHJ4*02,GCACACAACCCCCATAGTGGACTCCTTGACTAC,AHNPHSGLLDY,11.0
SRR2150126.2,IGHV2-5*02,IGHJ4*02,GCACACAACCCCCATAGTGGACTCCTTGACTAC,AHNPHSGLLDY,11.0
SRR2150126.3,IGHV4-31*03,IGHJ3*02,GCGAGAGAGAACCCCCCGGGTATAGCAGTGGCTGGTACCCGTGCTT...,ARENPPGIAVAGTRAFDI,18.0
SRR2150126.4,IGHV3-30*18,IGHJ6*02,GCGAAAGATCTCTTTTCGGGAGTGGGACCCAAAAACGGCTACTACT...,AKDLFSGVGPKNGYYYYGMDV,21.0
SRR2150126.5,IGHV4-31*03,IGHJ3*02,GCGAGAGAGAACCCCCCGGGTATAGCAGTGGCTGGTACCCGTGCTT...,ARENPPGIAVAGTRAFDI,18.0
SRR2150126.6,IGHV4-59*01,IGHJ6*03,GCGAGAGTGAGTGGCTACGATTTTTGGAGTGGTTATGCACCCAACT...,ARVSGYDFWSGYAPNYYYYYMDV,23.0
SRR2150126.7,IGHV4-39*01,IGHJ3*02,GCGAGATTTGTGGTGGTGAAGGCTTTTTGATACC,ARFVVVKAF_Y,11.0
SRR2150126.8,IGHV1-18*01,IGHJ5*02,GCGAGGGGGGTACAACTGGAACGACATTGGGTTGGTTGGTTCGACCCC,ARGVQLERHWVGWFDP,16.0
SRR2150126.9,IGHV3-11*01,IGHJ5*02,GCGAGAGAGAGGACTACCGAGTTCGACCCC,ARERTTEFDP,10.0
SRR2150126.10,IGHV4-34*01,IGHJ4*02,GCGATCACCTTGGCCTATTGTAGTAGTTCCAGTTGCGTTGACTCC,AITLAYCSSSSCVDS,15.0


In [42]:
def check_clonality(patient):
    '''Counts how often each value in the 4th column of `patient` occurs.

    The column is selected by position (index 3); in the frames built by
    read_parsed_igblast_file that is the translated CDR3, 'CD3_AA'. Every
    value seen more than once is printed together with its count.

    Parameters:
        patient - a DataFrame with at least four columns whose 4th column
            holds hashable values (e.g. strings).

    Returns:
        A dict mapping each distinct value to its number of occurrences,
        ordered by first appearance. Empty for a frame without rows.
    '''
    from collections import Counter

    # A single pass over the column; dict() keeps the plain-dict return type.
    CDR3s = dict(Counter(patient.iloc[:, 3]))

    for key, times_seen in CDR3s.items():
        if times_seen > 1:
            print (key, " ", times_seen)
    return CDR3s

In [43]:
# Report repeated CDR3 amino-acid strings in the combined frame.
# check_clonality prints the repeats and, as the last expression of the cell,
# its returned dict of all counts is displayed as well.
check_clonality(patient_148)

   21
ARHAGGKGYSLNWFDP   2
AKDRGLRLGELSVSGVFYI   2
ARGRRSSGWDRGYYYYGMDV   2
L_S   7
ARAPITIFGTFDY   2
AKAQGGGNPFTNWFDP   2
ARDFLLGGSGIMDV   8
ARENRVSGWCDY   2
AQYSSGWY   3
ATAGGGSGWPSFWYYYMDV   5
LTIFGVVP   4
ARDLNSRPMSKKAFDI   2
SALH_VCVVYVTTTTTNG_DPLQPLP   2
ARCPIVLVPAAIRGGYYYYYYMDV   2
AREVRGLYGFGMTTGST   2
TTPVRLLRGVIQDY   4
_GTGCSWCFHYD_FPQSTPAPS   4
TTTGPSLPFCGGDCYDAFDI   3
ARHETVGEWLFRLPDY   8
AKRLPGDNTIAPGVYFDY   2
ARHPLRGQWLVRGGYYYYGMDV   3
ARVQGDDFWTPNQRFYYYMDV   6
TVL_S   2
VRQGDFTSGFFYFYYYMDV   3
ARDLEGLQFGGDSGYGMDV   2
ARDPPYYDILTGYYNNLDY   3
ARDWFGEH   4
ARIKSSGWFMDV   2
DKYQLLLGSLTTTTTWT   7
ARGDYGGPNGAFDI   5
ARRGRWFGELSGVGGYFDY   4
ASYYCTSISCSVVDY   5
ARERVADERGSGYYYYGMDV   3
ALGGYDDNGVQRPDLDY   2
AKDSAYCSDLSCPRGWSYGGFDS   2
ARLRADYGDYGSYYMDV   3
TNGDCRLCS   3
ARLEADTALRRYYYGMDV   2
ARGGRSSGWYYFDY   3
AWKDYYGSGNYHFDY   3
ARSLWYQLPVFDY   3
TLEGRVVVGASTMIDFPNPLQPLP   2
ARHGHYYDSSGYFSGFDY   4
VAAGGGSGWPSFWYLYMDL   2
ARDLTPKGIAAA_GPLTTGST   6
ASSYNYDFWSGYKEGAFDI   2
ARHCTH

{'': 21,
 'ARDGSYCNSTSCPSVAFDV': 1,
 'ARHAGGKGYSLNWFDP': 2,
 'AGVYYDSSGYYMYAFDI': 1,
 'TKSA_CVLPPLPLISETHSSPF': 1,
 'AKDRGLRLGELSVSGVFYI': 2,
 'ARVKYDSSGYYPLFDY': 1,
 'CSWCSHCNRYTQSTPVPS': 1,
 'SLPYQRQG_SLPSTQALV': 1,
 'ARHSASYGDHDY': 1,
 'ARGRRSSGWDRGYYYYGMDV': 2,
 'VRGNGDSALEVLCVLC_TKHRDDPSHPFKPLS': 1,
 'L_S': 7,
 '_LDS_GTGCSRCSHCNRCTQSTPGLP': 1,
 'PGASVHSLCYHCKR_SIPSTQALV': 1,
 'ARGEWLRPTSGYWAVGNCAGST': 1,
 'VKPWAYVGSQCWC_SIPSTQALV': 1,
 'ARAPITIFGTFDY': 2,
 'YMVLLLLLMYETHSSPF': 1,
 'PGTSVHSLCYHC_GSSIPSTQALV': 1,
 'AKAQGGGNPFTNWFDP': 2,
 'ARDFLLGGSGIMDV': 8,
 'STLEGGL_LVLPL__IYPIHSSPF': 1,
 'ARDRVEKIDEDYYDSSGDPTY': 1,
 'ARAENTAIPGGLLLLLHGR': 1,
 'VRDIFGVVIIMPEEGVY': 1,
 'ARGESYMGIPYFDS': 1,
 'ARENRVSGWCDY': 2,
 'AQYSSGWY': 3,
 'TCLLIW_LDS_GTGCSRCSHCNRCTQSTPAF': 1,
 'PSRSLRSMCYHHYH__LRPTPAPS': 1,
 'ARGVGDLLLLHGR': 1,
 'TTTTNG_DPLQPLP': 1,
 'ATAGGGSGWPSFWYYYMDV': 5,
 'AREPPMVRGPWLPETKYFQH': 1,
 'MVLLLLLMYETHSSPS': 1,
 'LTIFGVVP': 4,
 'LPWNFCEYFTFPLPALIHPIHSSLC': 1,
 'ARDLNSRPMSKKAFDI'