In [97]:
import os

import numpy as np
import pandas as pd
from collections import Counter
import functools

In [98]:
seq_data_root = "sample_sequences"

In [99]:
groups = os.listdir(seq_data_root)
groups

['FU', 'HD', 'BL']

In [100]:
cols = ['cdr3aa', 'v', 'count', 'sample']
data = pd.DataFrame(columns=cols)
data

Unnamed: 0,cdr3aa,v,count,sample


In [166]:
voc_data = pd.DataFrame()

In [167]:
for group in groups:
    temp_dir = os.path.join(seq_data_root, group)
    
    for file in os.listdir(temp_dir):
        sample_id = file.split('.')[1]
        temp_df = pd.read_csv(os.path.join(temp_dir, file), delimiter='\t')
        temp_df['sample'] = sample_id
        data = data.append(temp_df, ignore_index=True)
        
        sequences = list(zip(temp_df['cdr3aa'], temp_df['count']))
        temp_voc = pd.DataFrame({'sample_id': sample_id, 'seq': [sequences]})
        voc_data = voc_data.append(temp_voc, ignore_index=True)

In [168]:
voc_data

Unnamed: 0,sample_id,seq
0,1231_FU_17-8-TCRD_S64_L001_R1,"[(CACDTVGGNTDKLIF, 538), (CACDQLTGGYAAQLFF, 34..."
1,1056_2659_SA74_S74_L001_R1,"[(CACDVLKSSWDTRQMFF, 5755), (CACDPLGDHTDKLIF, ..."
2,1124_FU_TCRD_S4_R1,"[(CACDTLGDTGKLIF, 14744), (CACDTIASGISSWDTRQMF..."
3,1120_FU-d_S52_L001_R1,"[(CALGERRWGIRYTDKLIF, 6489), (CACDSIVLGSQGSWDT..."
4,3001_FU_11-6-TCRD_S48_L001_R1,"[(CACDVLSSILGDSGKLIF, 25941), (CACDIVTGGLDHGAS..."
...,...,...
145,3020_BL_12-3-TCRD_S49_L001_R1,"[(CACDTVTPGANTDKLIF, 9393), (CACDSVLGTLTAQLFF,..."
146,1013_2565_SA82_S82_L001_R1,"[(CACDRLLGDNADKLIF, 16607), (CARVRLGDTTWDTRQMF..."
147,1121_BL-d_S91_L001_R1,"[(CACDTVSVGIRVTDKLIF, 11465), (CACDLLGALTDKLIF..."
148,1054_2630_SA69_S69_L001_R1,"[(CAFPSWGIGTDKLIF, 11314), (CALGVASYQEDIGLIF, ..."


In [180]:
vocabulary = Counter(functools.reduce(lambda x, y: x + y, voc_data.seq[2]))

In [181]:
vocabulary

Counter({'CACDTLGDTGKLIF': 1,
         14744: 1,
         'CACDTIASGISSWDTRQMFF': 1,
         7614: 1,
         'CACDLITGGYAFTDKLIF': 1,
         4277: 1,
         'CACDTVGPTGGDTDKLIF': 1,
         4078: 1,
         'CACDTVLGDSSWDTRQMFF': 1,
         3604: 1,
         'CACDKLGALRGTDKLIF': 1,
         3062: 1,
         'CACDTVGEPYTDKLIF': 1,
         2109: 1,
         'CACDNLGEYTDKLIF': 1,
         2010: 1,
         'CACDTVGGLLGDKSDKLIF': 1,
         1961: 1,
         'CACDYVLGDTHTDKLIF': 1,
         1806: 1,
         'CACDTVGDTPSSWDTRQMFF': 1,
         1784: 1,
         'CACVEVGVLTDKLIF': 1,
         1507: 1,
         'CACDTLGPNTDKLIF': 1,
         1496: 1,
         'CACDSVGLRGIEYTDKLIF': 1,
         1478: 1,
         'CACDTVMRDTSDKLIF': 1,
         1386: 1,
         'CACDTAPGPGAPSWDTRQMFF': 1,
         1285: 1,
         'CACDTVGAYTDKLIF': 1,
         1146: 1,
         'CACDTVGSLHTDKLIF': 1,
         960: 1,
         'CACDLLGGYGDTDKLIF': 1,
         933: 1,
         'CACDPLGGTYTDKLIF':

In [153]:
word_dict = {}
item_id = 1

for word in vocabulary.keys():
    word_dict[word] = item_id
    item_id += 1

In [154]:
word_dict

{'CACDTVGGNTDKLIF': 1,
 'CACDQLTGGYAAQLFF': 2,
 'CACDTIPPGGYTDKLIF': 3,
 'CACDNLGAYTDKLIF': 4,
 'CACDRLGDQGTDKLIF': 5,
 'CACDTVKLRGYGAHTDKLIF': 6,
 'CACDTVRGIRNTDKLIF': 7,
 'CACDTVGIQNTDKLIF': 8,
 'CACDSLFPTRPLTDKLIF': 9,
 'CACDDVLGRSPLMTDKLIF': 10,
 'CACDGLYPARGADSDKLIF': 11,
 'CACDRLGDKAELIF': 12,
 'CACDTVRRGIQPTTDKLIF': 13,
 'CACDPVLGDTINTDKLIF': 14,
 'CACDDLELAVLGDTHEYTDKLIF': 15,
 'CACDTVGIKDKLIF': 16,
 'CACDSVLGFGDTDKLIF': 17,
 'CACDTAGVPRTDKLIF': 18,
 'CACDALGDTDKLIF': 19,
 'CACDTVLGDANTDKLIF': 20,
 'CACDSGGPSSWDTRQMFF': 21,
 'CACDTLGTGGYPHTDKLIF': 22,
 'CACDTLTGDTDKLIF': 23,
 'CACDHLDHTDKLIF': 24,
 'CACDTIVLGDTELRGDKLIF': 25,
 'CACDTVLGDGLLIF': 26,
 'CACDVLGEKRVYTDKLIF': 27,
 'CACDSILGDSGTDKLIF': 28,
 'CACDTLPGGYGNTDKLIF': 29,
 'CALGDPRPSPRYWGNTDKLIF': 30,
 'CACDPLGGGLLQNTDKLIF': 31,
 'CACDSLPEVLGDPRGDKLIF': 32,
 'CACDSMGDIRSPWEADKLIF': 33,
 'CACDTAGGLSWDTRQMFF': 34,
 'CACDEMAYTDKLIF': 35,
 'CACDILGANRLIEDTDKLIF': 36,
 'CACDPVGKSSWDTRQMFF': 37,
 'CACDTVLHTDKLIF': 38,
 'CAFSSYIP

In [155]:
inverted_idx = {}
for word, item_id in word_dict.items():
    inverted_idx[item_id] = list(voc_data[voc_data.seq.apply(lambda row: word in row)].index)

In [156]:
inverted_idx

{1: [0, 5, 14, 34, 53, 57, 58, 75, 89, 91, 103, 104, 107, 130],
 2: [0, 107, 130],
 3: [0, 107],
 4: [0, 107],
 5: [0, 107],
 6: [0, 107],
 7: [0, 107],
 8: [0, 107],
 9: [0, 107],
 10: [0, 107],
 11: [0, 107],
 12: [0, 107],
 13: [0, 107],
 14: [0, 107],
 15: [0],
 16: [0, 107],
 17: [0],
 18: [0, 107],
 19: [0,
  6,
  14,
  21,
  38,
  39,
  41,
  56,
  57,
  64,
  70,
  80,
  84,
  85,
  92,
  93,
  100,
  108,
  111,
  115,
  119,
  122,
  126,
  127,
  130],
 20: [0, 116],
 21: [0],
 22: [0, 39, 42, 109, 125, 130],
 23: [0, 107],
 24: [0, 107],
 25: [0, 107],
 26: [0, 107],
 27: [0],
 28: [0],
 29: [0],
 30: [0, 107],
 31: [0, 42, 109, 128, 134],
 32: [0],
 33: [0, 107],
 34: [0, 29, 53, 54, 84, 107, 109, 130],
 35: [0, 107],
 36: [0],
 37: [0, 107],
 38: [0, 76, 87, 130, 132],
 39: [0, 107],
 40: [0],
 41: [0, 107],
 42: [0, 107],
 43: [0, 6, 65, 66, 81, 107, 117, 148],
 44: [0, 39, 42, 109, 130],
 45: [0, 66],
 46: [0],
 47: [0],
 48: [0, 134],
 49: [0, 132],
 50: [0, 107],
 51: