In [1]:
# Silence TensorFlow's C++ INFO/WARNING logs. The variable has to be set
# before TensorFlow is first imported to take effect in this kernel; it is
# also inherited by the TCRclub.py processes launched below with os.system.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import argparse
import tensorflow as tf
import numpy as np
import random
from numba import cuda
import pandas as pd
from tensorflow import keras
from tensorflow.keras import backend as K
from autoencoder.cVAE import Sampling1, CenterLossLayer, amino_onehot_encoding
from collections import Counter, defaultdict
import torch
from pyseat.SEAT import SEAT

2.9.0 2.9.0


## Load the prepared TCR file and RNA file

### TCR file

1. Two columns are required: "barcode" and "cdr3", where cdr3 refers to the beta chain.
2. Please filter out all cdr3 sequences longer than 30 amino acids; longer sequences cannot be handled well by our encoder-classifier.
3. Other columns can be added by users; they will not be used by TCRclub.
4. If the file contains cells from multiple samples, include a column labelled "sample" to indicate the source of each T cell.



In [2]:
# Read the prepared TCR table; rows are indexed by the "barcode" column.
tcr_path = "./example_data/processed_tcr.csv"
tcr_file = pd.read_csv(tcr_path, index_col="barcode", sep=",")
tcr_file.head()

Unnamed: 0_level_0,cdr3,raw_clonotype_id,v_gene,j_gene,cdr3_nt,sample
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bcc.su001.post.tcell_AAACCTGAGGACATTA,CASSVEIGIGYTF,clonotype6,TRBV9,TRBJ1-2,TGTGCCAGCAGCGTAGAAATAGGGATCGGCTACACCTTC,su001.post
bcc.su001.post.tcell_AAACCTGCACGCATCG,CASSLSGGSSYNEQFF,clonotype5,TRBV7-2,TRBJ2-1,TGTGCCAGCAGCCTTAGCGGGGGGAGCTCCTACAATGAGCAGTTCTTC,su001.post
bcc.su001.post.tcell_AAACCTGCATGCCTTC,CAISELGGATRTGELFF,clonotype162,TRBV10-3,TRBJ2-2,TGTGCCATCAGTGAGTTAGGAGGGGCCACAAGGACCGGGGAGCTGT...,su001.post
bcc.su001.post.tcell_AAACCTGCATTGCGGC,CSGRASGGSVQETQYF,clonotype25,TRBV29-1,TRBJ2-5,TGCAGCGGTAGGGCTAGCGGGGGGAGTGTCCAAGAGACCCAGTACTTC,su001.post
bcc.su001.post.tcell_AAACCTGTCACCCGAG,CASSETSGGPWDEQFF,clonotype59,TRBV6-1,TRBJ2-1,TGTGCCAGCAGTGAAACTAGCGGGGGGCCCTGGGATGAGCAGTTCTTC,su001.post


In [3]:
# Sanity check: show any rows whose cdr3 is longer than 30 characters
# (the result should be empty for a correctly prepared file).
cdr3_lengths = tcr_file["cdr3"].str.len()
tcr_file[cdr3_lengths > 30]

Unnamed: 0_level_0,cdr3,raw_clonotype_id,v_gene,j_gene,cdr3_nt,sample
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


### RNA file

1. Each row corresponds to a T cell, and every column except the first corresponds to a gene. The first column must be named "barcode".
2. We suggest using the top 10% of highly expressed genes extracted from the original scRNA expression file. Users can also select the input genes according to their own criteria.

In [4]:
# Read the prepared RNA expression table; rows are indexed by the "barcode" column.
rna_path = "./example_data/processed_rna.csv"
rna_file = pd.read_csv(rna_path, index_col="barcode", sep=",")
rna_file.head()

Unnamed: 0_level_0,HES4,ISG15,TNFRSF18,TNFRSF4,RP11-345P4.7,PLEKHG5,TNFRSF9,TNFRSF1B,RP4-798A10.2,ID3,...,IGLV1-44,LIF,PVALB,BAIAP2L2,RP3-508I15.18,SCUBE1,PHF21B,CH507-513H4.5,AP000233.4,LINC00158
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bcc.su001.post.tcell_AAACCTGAGGACATTA,0.0,1.19711,2.843382,3.357818,0.0,0.0,1.19711,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bcc.su001.post.tcell_AAACCTGCACGCATCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.815051,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bcc.su001.post.tcell_AAACCTGCATGCCTTC,0.0,1.749696,2.869884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bcc.su001.post.tcell_AAACCTGCATTGCGGC,0.0,1.279837,2.173406,2.173406,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bcc.su001.post.tcell_AAACCTGTCACCCGAG,0.0,2.033181,3.040468,2.974884,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Notice

The order of T cells should be the same in both files.

In [5]:
# Both tables must list the same barcodes in the same order.
tcr_barcodes = tcr_file.index.tolist()
rna_barcodes = rna_file.index.tolist()
tcr_barcodes == rna_barcodes

True

## Run TCRclub

### 1. T cells from a sample/patient

Use this mode when all T cells come from the same sample or patient, or when you do not need to distinguish their sources.

In [5]:
# Run TCRclub with its default settings on the example data.
# The value displayed for this cell is the return code from os.system.
os.system(
    "python3 TCRclub.py"
    " --tcr_file ./example_data/processed_tcr.csv"
    " --rna_file ./example_data/processed_rna.csv"
)

2.9.0 2.9.0
TCR embeddings are prepared.
Starting clustering
Start training 1 times.
Finish training 1 times.
Start training 2 times.
Finish training 2 times.
Start training 3 times.
Finish training 3 times.
Start training 4 times.
Finish training 4 times.
Start training 5 times.
Finish training 5 times.
Start training 6 times.
Finish training 6 times.
Start training 7 times.
Finish training 7 times.
Start training 8 times.
Finish training 8 times.
Start training 9 times.
Finish training 9 times.
Start training 10 times.
Finish training 10 times.
Start training 11 times.
Finish training 11 times.
Start training 12 times.
Finish training 12 times.
Start training 13 times.
Finish training 13 times.
Start training 14 times.
Finish training 14 times.
Start training 15 times.
Finish training 15 times.
Start training 16 times.
Finish training 16 times.
Start training 17 times.
Finish training 17 times.
Start training 18 times.
Finish training 18 times.
Start training 19 times.
Finish trainin

0

### 2. T cells from multiple samples/patients

If you want to distinguish the sources of these T cells, add the argument "--multiple_sample". The TCR file should then contain the "sample" column described above.

In [5]:
# Run TCRclub on the example data with the --multiple_sample flag.
# The value displayed for this cell is the return code from os.system.
os.system(
    "python3 TCRclub.py"
    " --tcr_file ./example_data/processed_tcr.csv"
    " --rna_file ./example_data/processed_rna.csv"
    " --multiple_sample"
)

2.6.0 2.6.0
please note that multiple_sample is selected.
TCR embeddings are prepared.
Starting clustering
Start training 1 times.
Finish training 1 times.
Start training 2 times.
Finish training 2 times.
Start training 3 times.
Finish training 3 times.
Start training 4 times.
Finish training 4 times.
Start training 5 times.
Finish training 5 times.
Start training 6 times.
Finish training 6 times.
Start training 7 times.
Finish training 7 times.
Start training 8 times.
Finish training 8 times.
Start training 9 times.
Finish training 9 times.
Start training 10 times.
Finish training 10 times.
Start training 11 times.
Finish training 11 times.
Start training 12 times.
Finish training 12 times.
Start training 13 times.
Finish training 13 times.
Start training 14 times.
Finish training 14 times.
Start training 15 times.
Finish training 15 times.
Start training 16 times.
Finish training 16 times.
Start training 17 times.
Finish training 17 times.
Start training 18 times.
Finish training 18 

0

### 3. Fixed initialization option

The previous runs use the default parameter initialization, which is random. As stated in our paper, we also provide a fixed initialization. Add the argument "--fixed_initialization" to enable it. Note that when you choose this option, you should also set "--repeat_times" to 1, because there is no need to run multiple times with a fixed initialization.

In [8]:
# Run TCRclub once with the fixed initialization enabled.
# The value displayed for this cell is the return code from os.system.
os.system(
    "python3 TCRclub.py"
    " --tcr_file ./example_data/processed_tcr.csv"
    " --rna_file ./example_data/processed_rna.csv"
    " --fixed_initialization"
    " --repeat_times 1"
)

2.6.0 2.6.0
TCR embeddings are prepared.
Starting clustering
Initialization of C is fixed!
Start training 1 times.
Finish training 1 times.
Directly generate results because initialization is fixed.
Output file is ready.


0

## Results

The T-cell clubs will be saved in the "consensus_result.csv" file within the output directory specified by the out parameter. 

The output file contains the input TCR table with a new rightmost column named "club"; T cells with the same club ID are considered to belong to the same club.

In [8]:
# Load the consensus clustering output, indexed by barcode, and preview it.
result_path = "./outputs/consensus_result.csv"
result = pd.read_csv(result_path, index_col="barcode")
result.head()

Unnamed: 0_level_0,cdr3,raw_clonotype_id,v_gene,j_gene,cdr3_nt,sample,club
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bcc.su001.post.tcell_AAACCTGAGGACATTA,CASSVEIGIGYTF,clonotype6,TRBV9,TRBJ1-2,TGTGCCAGCAGCGTAGAAATAGGGATCGGCTACACCTTC,su001.post,169.0
bcc.su001.post.tcell_AAACCTGCACGCATCG,CASSLSGGSSYNEQFF,clonotype5,TRBV7-2,TRBJ2-1,TGTGCCAGCAGCCTTAGCGGGGGGAGCTCCTACAATGAGCAGTTCTTC,su001.post,32.0
bcc.su001.post.tcell_AAACCTGCATGCCTTC,CAISELGGATRTGELFF,clonotype162,TRBV10-3,TRBJ2-2,TGTGCCATCAGTGAGTTAGGAGGGGCCACAAGGACCGGGGAGCTGT...,su001.post,29.0
bcc.su001.post.tcell_AAACCTGCATTGCGGC,CSGRASGGSVQETQYF,clonotype25,TRBV29-1,TRBJ2-5,TGCAGCGGTAGGGCTAGCGGGGGGAGTGTCCAAGAGACCCAGTACTTC,su001.post,235.0
bcc.su001.post.tcell_AAACCTGTCACCCGAG,CASSETSGGPWDEQFF,clonotype59,TRBV6-1,TRBJ2-1,TGTGCCAGCAGTGAAACTAGCGGGGGGCCCTGGGATGAGCAGTTCTTC,su001.post,144.0
