In [1]:
import os

import numpy as np
import pandas as pd
from collections import Counter
import functools

In [2]:
# Root folder of the sequencing exports; it is listed below to discover the
# group sub-folders, each of which holds the per-sample files.
seq_data_root = 'sample_sequences'

In [3]:
# Every entry directly under seq_data_root is treated as a group name; the
# loading loop later lists each one as a directory.
# NOTE(review): os.listdir returns entries in arbitrary order and includes
# plain files as well as directories, so a stray file here (e.g. an editor
# or OS metadata file) would end up in `groups` too.
groups = os.listdir(seq_data_root)
groups

['FU', 'HD', 'BL']

In [4]:
# Empty accumulator for the combined per-sample table. `cols` fixes the
# leading column order; the loading loop adds each file's rows to it.
cols = ['cdr3aa', 'v', 'count', 'sample']
data = pd.DataFrame(columns=cols)
data

Unnamed: 0,cdr3aa,v,count,sample


In [5]:
# Empty accumulator for the per-sample sequence lists (one row per sample);
# filled by the loading loop.
voc_data = pd.DataFrame()

In [6]:
# Load every per-sample table found under <seq_data_root>/<group>/ and extend
# the two running tables:
#   data     - one row per sequence per sample (the file's columns plus 'sample')
#   voc_data - one row per sample; 'seq' holds that sample's list of
#              (cdr3aa, count) pairs
#
# Frames and rows are gathered in plain lists and concatenated once after the
# loop. DataFrame.append was removed in pandas 2.0, and growing a frame inside
# the loop re-copies every accumulated row on each iteration.
sample_frames = []
voc_rows = []

for group in groups:
    temp_dir = os.path.join(seq_data_root, group)

    for file in os.listdir(temp_dir):
        # The sample id is the second dot-separated piece of the file name.
        sample_id = file.split('.')[1]
        temp_df = pd.read_csv(os.path.join(temp_dir, file), delimiter='\t')
        temp_df['sample'] = sample_id
        sample_frames.append(temp_df)

        sequences = list(zip(temp_df['cdr3aa'], temp_df['count']))
        voc_rows.append({'sample_id': sample_id, 'seq': sequences})

# Keep the existing `data` / `voc_data` frames first so their column order
# (and any rows they already hold) is preserved, exactly as repeated appends
# would have done.
data = pd.concat([data, *sample_frames], ignore_index=True)
voc_data = pd.concat(
    [voc_data, pd.DataFrame(voc_rows, columns=['sample_id', 'seq'])],
    ignore_index=True,
)

In [7]:
# Inspect the per-sample table: one row per sample, with `seq` holding that
# sample's list of (cdr3aa, count) pairs.
voc_data

Unnamed: 0,sample_id,seq
0,1231_FU_17-8-TCRD_S64_L001_R1,"[(CACDTVGGNTDKLIF, 538), (CACDQLTGGYAAQLFF, 34..."
1,1056_2659_SA74_S74_L001_R1,"[(CACDVLKSSWDTRQMFF, 5755), (CACDPLGDHTDKLIF, ..."
2,1124_FU_TCRD_S4_R1,"[(CACDTLGDTGKLIF, 14744), (CACDTIASGISSWDTRQMF..."
3,1120_FU-d_S52_L001_R1,"[(CALGERRWGIRYTDKLIF, 6489), (CACDSIVLGSQGSWDT..."
4,3001_FU_11-6-TCRD_S48_L001_R1,"[(CACDVLSSILGDSGKLIF, 25941), (CACDIVTGGLDHGAS..."
...,...,...
145,3020_BL_12-3-TCRD_S49_L001_R1,"[(CACDTVTPGANTDKLIF, 9393), (CACDSVLGTLTAQLFF,..."
146,1013_2565_SA82_S82_L001_R1,"[(CACDRLLGDNADKLIF, 16607), (CARVRLGDTTWDTRQMF..."
147,1121_BL-d_S91_L001_R1,"[(CACDTVSVGIRVTDKLIF, 11465), (CACDLLGALTDKLIF..."
148,1054_2630_SA69_S69_L001_R1,"[(CAFPSWGIGTDKLIF, 11314), (CALGVASYQEDIGLIF, ..."


In [17]:
# Inspect the combined per-sample table.
# NOTE(review): the saved output includes a `freq` column and this cell's
# execution count jumps from 7 to 17; no cell visible in this export creates
# `freq`, so it presumably comes from cells that are not shown — confirm the
# notebook still reproduces under Restart & Run All.
data

Unnamed: 0,cdr3aa,v,count,sample,freq
0,CACDTVGGNTDKLIF,TRDV2,538,1231_FU_17-8-TCRD_S64_L001_R1,0.334785
1,CACDQLTGGYAAQLFF,TRDV2,346,1231_FU_17-8-TCRD_S64_L001_R1,0.215308
2,CACDTIPPGGYTDKLIF,TRDV2,106,1231_FU_17-8-TCRD_S64_L001_R1,0.065961
3,CACDNLGAYTDKLIF,TRDV2,60,1231_FU_17-8-TCRD_S64_L001_R1,0.037337
4,CACDRLGDQGTDKLIF,TRDV2,50,1231_FU_17-8-TCRD_S64_L001_R1,0.031114
...,...,...,...,...,...
43873,CALGELKLEGGALLLSGGPEYTDKLIF,TRDV1,11,1122_BL_TCRD_S2_R1,0.000118
43874,CACDGVGDTHDKLIF,TRDV2,10,1122_BL_TCRD_S2_R1,0.000107
43875,CACDTVGLPKPTDKLIF,TRDV2,10,1122_BL_TCRD_S2_R1,0.000107
43876,CACDTVLRDSSWDTRQMFF,TRDV2,10,1122_BL_TCRD_S2_R1,0.000107


In [29]:
# Vocabulary across all samples: one row per distinct cdr3aa sequence with its
# `count` summed over every row of `data` in which it appears.
vocabulary = (
    data
    .groupby('cdr3aa')['count']
    .sum()
    .reset_index()
)

In [30]:
# Inspect the vocabulary table (distinct cdr3aa sequences with summed counts).
vocabulary

Unnamed: 0,cdr3aa,count
0,CAAALDKLIF,228
1,CAAAPLPSVGGHTDKLIF,15
2,CAAAPSSPVGFQDTNTDKLIF,410
3,CAAAQLGVADKLIF,56
4,CAAARYTRGQYTDKLIF,95
...,...,...
26465,YAGRQLGASMYTDKLIF,8
26466,YALGDTPRTLQGYTDKLIF,2
26467,YALGELAPKIATPWTDKLIF,6
26468,YALGELRGIQDTDKLIF,12


In [31]:
# Give every vocabulary sequence an integer id, numbered from 1 in the order
# the sequences appear in `vocabulary`.
word_dict = {
    seq: seq_id
    for seq_id, seq in enumerate(vocabulary.cdr3aa, start=1)
}

In [32]:
# Inspect the sequence -> id mapping.
# NOTE(review): this renders the whole dict (one entry per vocabulary row);
# consider displaying only a small slice to keep the saved notebook compact.
word_dict

{'CAAALDKLIF': 1,
 'CAAAPLPSVGGHTDKLIF': 2,
 'CAAAPSSPVGFQDTNTDKLIF': 3,
 'CAAAQLGVADKLIF': 4,
 'CAAARYTRGQYTDKLIF': 5,
 'CAAAVSTDKLIF': 6,
 'CAADRGGIEDKLIF': 7,
 'CAADTVLLGDTGDKLIF': 8,
 'CAAEGVVSSWDTRQMFF': 9,
 'CAAELHFLRRLWGISPGSTDKLIF': 10,
 'CAAEPFLAEGELAKLIF': 11,
 'CAAERFGGALTAQLFF': 12,
 'CAAFGGLSRGKYTDKLIF': 13,
 'CAAFWPAGGYRVTDKLIF': 14,
 'CAAGAGGRLYTDKLIF': 15,
 'CAAGAPPGDKLIF': 16,
 'CAAGDSYRGTTFTDKLIF': 17,
 'CAAGEGLTGGYLVYTDKLIF': 18,
 'CAAGFPRWGKYTDKLIF': 19,
 'CAAGFYGWGSGRLYTDKLIF': 20,
 'CAAGLLWYGDKLIF': 21,
 'CAAGLPTGYEADKLIF': 22,
 'CAAGNWGIPYTDKLIF': 23,
 'CAAGPFLAEGELAKFIF': 24,
 'CAAGPFLAEGELAKLIF': 25,
 'CAAGPFLAEGELAKLTF': 26,
 'CAAGPFLAEGELAKLVF': 27,
 'CAAGPFLAEGEPAKLIF': 28,
 'CAAGPSPGTGGSDKLIF': 29,
 'CAAGQGANWGQYTDKLIF': 30,
 'CAAGQGANWGQYTDKPIF': 31,
 'CAAGRASTDKLIF': 32,
 'CAAGRVRDAADKLIF': 33,
 'CAAGTGGALYTDKLIF': 34,
 'CAAGTGGGPAFTDKLIF': 35,
 'CAAGVYLGLGILPPRRQLFF': 36,
 'CAAGVYTDKLIF': 37,
 'CAAGWGGPTNKYTDKLIF': 38,
 'CAAHEMGVVSDKLIF': 39,
 'CAAHPGGET

In [59]:
# Inverted index: sequence id -> list of (sample, count) pairs, one pair per
# row of `data` holding that sequence, in `data` row order.
#
# The rows are grouped by sequence in a single pass over `data`. Filtering the
# whole frame once per vocabulary word costs (number of words) x (number of
# rows) comparisons; this produces the same lists in one sweep.
postings = {}
for seq, sample, count in zip(data['cdr3aa'], data['sample'], data['count']):
    postings.setdefault(seq, []).append((sample, count))

# A word with no matching row maps to an empty list, as a boolean filter that
# matches nothing would yield.
inverted_idx = {
    item_id: postings.get(word, [])
    for word, item_id in word_dict.items()
}

In [60]:
# Inspect the id -> [(sample, count), ...] index.
# NOTE(review): this renders the whole index (one entry per vocabulary id);
# consider displaying only a few ids to keep the saved notebook compact.
inverted_idx

{1: [('1168_FU-d_S73_L001_R2', 3),
  ('1084_FU-d_S71_L001_R1', 63),
  ('1153_FU_TCRD_S7_R1', 41),
  ('1153_BL-d_S69_L001_R1', 72),
  ('1008_BL-d_S70_L001_R1', 49)],
 2: [('14-2-TCRD_S52_L001_R1', 15)],
 3: [('1082_2697_SA92_S92_L001_R1', 410)],
 4: [('1136_BL_8-7-TCRD_S42_L001_R1', 56)],
 5: [('3-4-TCRD_S32_L001_R1', 95)],
 6: [('2-2-TCRD_S22_L001_R1', 96)],
 7: [('14-2-TCRD_S52_L001_R1', 120)],
 8: [('1145_FU-d_S53_L001_R1', 259),
  ('1146_FU-d_S95_L001_R2', 18),
  ('1152_FU-d_S94_L001_R1', 14),
  ('1145_BL-d_S67_L001_R1', 139),
  ('1120_BL-d_S56_L001_R1', 56),
  ('1149_BL-d_S68_L001_R1', 26),
  ('1112_BL-d_S66_L001_R1', 10),
  ('1008_BL-d_S70_L001_R1', 3)],
 9: [('14-1-TCRD_S51_L001_R1', 59)],
 10: [('3-3-TCRD_S31_L001_R1', 1094)],
 11: [('1054_2630_SA69_S69_L001_R1', 162)],
 12: [('2-7-TCRD_S27_L001_R1', 10)],
 13: [('1160_FU_TCRD_S8_R1', 410)],
 14: [('1217_BL_16-7-TCRD_S58_L001_R1', 154)],
 15: [('1095_2738_SA88_S88_L001_R1', 192),
  ('1095_2718_SA95_S95_L001_R1', 796)],
 16: [('1

In [61]:
# Sanity check: the counts in one sequence's posting list should add up to that
# sequence's total in `vocabulary`.
# NOTE(review): the saved output beneath this cell is the posting list itself
# rather than a number, so it appears to predate this code — re-run the cell.
sum(count for _sample, count in inverted_idx[word_dict['CAAALDKLIF']])

[('1168_FU-d_S73_L001_R2', 3),
 ('1084_FU-d_S71_L001_R1', 63),
 ('1153_FU_TCRD_S7_R1', 41),
 ('1153_BL-d_S69_L001_R1', 72),
 ('1008_BL-d_S70_L001_R1', 49)]