In [13]:
import pandas as pd
from clustcr import Clustering

# Import a file that contains at least a CDR3 column and a V gene column
# (only its 'sequence_aa' column is read further down in this notebook)
data = pd.read_csv("../TCRs.csv")
# Initiate a Clustering object with its default settings
clustering = Clustering()
# NOTE(review): V gene information can be included by setting the include_vgene
# parameter to True, but no cell in this notebook does so -- the fit call further
# down receives sequences only. Confirm whether that is intended.

In [17]:
# Load the sequences listed in the DeepCAT TumorCDR3 training file (one per line)
# into `array`, the name the later cells use for the tumor list.
filename = "../DeepCAT/TrainingData/TumorCDR3.txt"

with open(filename, 'r') as file:
    # Iterate the handle directly instead of materialising readlines(); strip the
    # surrounding whitespace and drop blank lines so that an empty string is never
    # passed on as a "sequence" to clustering or matching.
    array = [seq for seq in (line.strip() for line in file) if seq]


In [37]:
# Load the sequences listed in the DeepCAT NormalCDR3 training file (one per line)
# into `normal`, the name the later cells use for the normal list.
filename = "../DeepCAT/TrainingData/NormalCDR3.txt"

with open(filename, 'r') as file:
    # Iterate the handle directly instead of materialising readlines(); strip the
    # surrounding whitespace and drop blank lines so that an empty string is never
    # passed on as a "sequence" to clustering or matching.
    normal = [seq for seq in (line.strip() for line in file) if seq]


In [38]:
import numpy as np
# Pool everything that will be clustered together: the tumor list (`array`), the
# normal list (`normal`) and the 'sequence_aa' column of the CSV loaded into `data`.
# NOTE(review): sequences that occur in more than one source are not de-duplicated
# here -- confirm that the clustering step is expected to receive repeats.
TCRs = np.concatenate((array, normal, data['sequence_aa']))

In [46]:
# Cluster the pooled sequences; `output` is the result object whose `clusters_df`
# table the following cells annotate and export.
# NOTE(review): fit() receives the sequences only -- no V gene information is passed.
output = clustering.fit(TCRs)

Clustering 146778 TCRs using two-step approach.
Total time to run ClusTCR: 111.284s


In [40]:
# Preview the clustering result table (one row per clustered sequence with its
# cluster id). The attribute is `clusters_df` -- the spelling every later cell in
# this notebook uses -- not `cluster_df`.
output.clusters_df

Unnamed: 0,junction_aa,cluster
0,CASSIGEPDEQYF,0
1,CASSIGEPYEQYF,0
2,CASSLGEGYEQYF,0
3,CASSVGETYEQYF,0
4,CASSIGETYEQYF,0
...,...,...
64866,CASSGTGTVRTNEKLFF,6504
64867,CASSLGTGPSGANVLTF,6505
64868,CASSLGTGSSGANVLTF,6505
64869,CASSYDREGSTNEKLFF,6506


In [49]:
# Cancer

# Default labels; rows/clusters that match the tumor list are overwritten below.
output.clusters_df['cancer'] = "Not Cancer Associated"
output.clusters_df['stringentCancer'] = "No"

# Annotate the individual TCRs by matching.
# A boolean mask plus a single .loc assignment replaces the chained
# `df[col][positions] = value` form: chained assignment writes through a temporary
# Series (SettingWithCopyWarning, and no effect at all under pandas Copy-on-Write),
# and it also treated np.where positions as index labels.
in_tumor_list = output.clusters_df['junction_aa'].isin(array)
output.clusters_df.loc[in_tumor_list, 'stringentCancer'] = "Yes"

# Annotate the entire cluster: every member of a cluster that contains at least one
# matched sequence is labelled "Cancer".
tumor_cluster_ids = output.clusters_df.loc[in_tumor_list, 'cluster'].unique()
in_tumor_cluster = output.clusters_df['cluster'].isin(tumor_cluster_ids)
output.clusters_df.loc[in_tumor_cluster, 'cancer'] = "Cancer"

In [42]:
# Display the result table after the cancer annotation (the notebook rendering
# elides the middle rows).
output.clusters_df

Unnamed: 0,junction_aa,cluster,cancer
0,CASSIGEPDEQYF,0,Cancer
1,CASSIGEPYEQYF,0,Cancer
2,CASSLGEGYEQYF,0,Cancer
3,CASSVGETYEQYF,0,Cancer
4,CASSIGETYEQYF,0,Cancer
...,...,...,...
64866,CASSGTGTVRTNEKLFF,6504,Not Cancer Associated
64867,CASSLGTGPSGANVLTF,6505,Not Cancer Associated
64868,CASSLGTGSSGANVLTF,6505,Not Cancer Associated
64869,CASSYDREGSTNEKLFF,6506,Cancer


In [50]:
# Normal

# Default labels; rows/clusters that match the normal list are overwritten below.
output.clusters_df['normal'] = "AbsentNormal"
output.clusters_df['stringentNormal'] = "No"

# Annotate the individual TCRs by matching.
# A boolean mask plus a single .loc assignment replaces the chained
# `df[col][positions] = value` form: chained assignment writes through a temporary
# Series (SettingWithCopyWarning, and no effect at all under pandas Copy-on-Write),
# and it also treated np.where positions as index labels.
in_normal_list = output.clusters_df['junction_aa'].isin(normal)
output.clusters_df.loc[in_normal_list, 'stringentNormal'] = "Yes"

# Annotate the entire cluster: every member of a cluster that contains at least one
# matched sequence is labelled "InNormal".
normal_cluster_ids = output.clusters_df.loc[in_normal_list, 'cluster'].unique()
in_normal_cluster = output.clusters_df['cluster'].isin(normal_cluster_ids)
output.clusters_df.loc[in_normal_cluster, 'normal'] = "InNormal"

In [51]:
# Display the result table with both the cancer and the normal annotation columns
# (the notebook rendering elides the middle rows).
output.clusters_df

Unnamed: 0,junction_aa,cluster,cancer,stringentCancer,normal,stringentNormal
0,CASSIGEPDEQYF,0,Cancer,Yes,InNormal,No
1,CASSIGEPYEQYF,0,Cancer,Yes,InNormal,Yes
2,CASSLGEGYEQYF,0,Cancer,No,InNormal,Yes
3,CASSVGETYEQYF,0,Cancer,No,InNormal,Yes
4,CASSIGETYEQYF,0,Cancer,No,InNormal,Yes
...,...,...,...,...,...,...
64866,CASSGTGTVRTNEKLFF,6504,Not Cancer Associated,No,InNormal,Yes
64867,CASSLGTGPSGANVLTF,6505,Not Cancer Associated,No,InNormal,Yes
64868,CASSLGTGSSGANVLTF,6505,Not Cancer Associated,No,InNormal,Yes
64869,CASSYDREGSTNEKLFF,6506,Cancer,Yes,AbsentNormal,No


In [52]:
# Save the annotated cluster table as CSV; index=False leaves the row index out of
# the file. The target directory ../data/processed must already exist.
output.clusters_df.to_csv('../data/processed/AllSampleCluster.csv', index=False)