In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re

In [208]:
# Get list of sample clonotype summary files.
# The search is recursive, so matching files at any depth below the data root are returned.
SUMMARY_GLOB = "/nfs/users/nfs_b/bb9/workspace/rotation2/team115_lustre/data/existing_bcr_data/MalariaSamplesBen/**/*_Summary.txt"
summary_files = glob.glob(SUMMARY_GLOB, recursive=True)

# Extract sample day and number from filename.
# The dot before "txt" is escaped so only a literal ".txt" suffix is accepted.
p = re.compile(r'Day(\d+)/LEA_S(\d+)_Summary\.txt')

# Search each path once. The glob is broader than the regex, so a file that
# does not follow the Day<N>/LEA_S<N>_Summary.txt layout is reported by name
# instead of failing later with an opaque "NoneType has no attribute 'group'".
matches = [p.search(f) for f in summary_files]
unparsed = [f for f, m in zip(summary_files, matches) if m is None]
if unparsed:
    raise ValueError(
        "Cannot extract day/sample number from: {}".format(", ".join(unparsed))
    )
days = [m.group(1) for m in matches]
sample_nums = [m.group(2) for m in matches]

# Read in all summary files (first 30 columns of each) and stack them into a
# single frame whose outer index levels record the day and sample of each file.
summary_df = pd.concat(
    [pd.read_table(f, header=0, usecols=list(range(30))) for f in summary_files],
    # Materialise the (day, sample_num) pairs: a bare zip() is a one-shot
    # iterator, whereas a list is a proper sequence of keys.
    keys=list(zip(days, sample_nums)),
    names=["day", "sample_num"]
)
# Name the innermost level (each file's own row index) as well. The public
# `index.names` setter accepts a plain list, so no private pandas class is needed.
summary_df.index.names = ['day', 'sample_num', 'seq_i']

In [209]:
def _shape_by_sample(frame):
    """Return the (n_rows, n_columns) shape of each sample's rows, keyed by sample_num."""
    return frame.groupby(level="sample_num").apply(lambda grp: grp.shape)


# Rows per sample before filtering
print(_shape_by_sample(summary_df))
# Every Functionality label present in the data
print(summary_df["Functionality"].unique())
# Keep only the productive labels; rows with any other label are dropped
productive_labels = ['productive', 'productive (see comment)']
is_productive = summary_df["Functionality"].isin(productive_labels)
summary_df = summary_df[is_productive]
# Rows per sample after filtering
print(_shape_by_sample(summary_df))

sample_num
1      (1455, 30)
10     (8114, 30)
11     (1122, 30)
12      (951, 30)
13     (2047, 30)
14     (1489, 30)
15     (1723, 30)
16     (3060, 30)
17    (23490, 30)
18    (11201, 30)
19    (20764, 30)
2      (1325, 30)
21    (15197, 30)
22    (26972, 30)
23    (50806, 30)
3      (2196, 30)
4       (488, 30)
5       (698, 30)
6       (985, 30)
7      (7545, 30)
8      (2155, 30)
9      (8393, 30)
dtype: object
['productive' 'productive (see comment)' 'unknown (see comment)'
 'unproductive' 'unproductive (see comment)' 'No results']
sample_num
1      (1445, 30)
10     (7974, 30)
11     (1099, 30)
12      (939, 30)
13     (2028, 30)
14     (1474, 30)
15     (1706, 30)
16     (3034, 30)
17    (23235, 30)
18    (11072, 30)
19    (20514, 30)
2      (1309, 30)
21    (15035, 30)
22    (26706, 30)
23    (50187, 30)
3      (2171, 30)
4       (485, 30)
5       (692, 30)
6       (975, 30)
7      (7503, 30)
8      (2145, 30)
9      (8310, 30)
dtype: object


In [210]:
# Turn every index level into an ordinary column, then discard the
# per-file row position, which is not needed as a column.
summary_df = summary_df.reset_index().drop("seq_i", axis=1)

In [211]:
def _first_gene(assignment):
    """Return the gene of the first listed assignment, without allele suffix or "Homsap " prefix."""
    first_call = assignment.split(", or ")[0]
    gene = first_call.split("*")[0]
    return gene.replace("Homsap ", "")


# Get V J gene assignments by taking the first assignment, collapsing alleles.
summary_df["V-GENE"] = [_first_gene(call) for call in summary_df["V-GENE and allele"]]
summary_df["J-GENE"] = [_first_gene(call) for call in summary_df["J-GENE and allele"]]
# Check there aren't any weird CDR3 lengths
summary_df["CDR3-IMGT length"].unique()

array(['23', '14', '13', '20', '18', '15', '10', '8', '11', '7', '9', '12',
       '16', '6', '17', '22', '19', '21', '5', '24', '25', '29', '27',
       '26', '28', '31', '41', '30', '34', '4', '32', '3', '35', '33',
       '54', '37', '1', '2', '40', '44', '49', '36'], dtype=object)

In [212]:
# Define clonotype filter
# - V gene, J gene, CDR3 length
clonotype_filter = ["V-GENE", "J-GENE", "CDR3-IMGT length"]


def clonotype_format(x):
    """Join the fields of one row into a dot-separated clonotype label.

    Parameters
    ----------
    x : iterable
        Field values for one sequence, in ``clonotype_filter`` order.

    Returns
    -------
    str
        Each field rendered with ``"{}".format`` and joined by ".".
        Any number of fields is accepted, so the label always contains every
        column listed in ``clonotype_filter`` (a fixed three-slot format
        string would silently drop a fourth field).
    """
    return ".".join("{}".format(field) for field in x)

# Build one clonotype label per row from the columns named in the filter
clonotype_fields = summary_df.loc[:, clonotype_filter]
summary_df["clonotype"] = clonotype_fields.apply(clonotype_format, axis=1)

In [213]:
# Display the table, which now carries the derived V-GENE, J-GENE and clonotype columns
summary_df

Unnamed: 0,day,sample_num,Sequence number,Sequence ID,digest,Functionality,V-GENE and allele,V-REGION score,V-REGION identity %,V-REGION identity nt,...,Orientation,Functionality comment,V-REGION potential ins/del,J-GENE and allele comment,V-REGION insertions,V-REGION deletions,Sequence,V-GENE,J-GENE,clonotype
0,0,16,1,MS7_20154111092026411880__0_0_0_0_0_1_0_0_0_0_...,IGHG1,productive,"Homsap IGHV3-23*01 F, or Homsap IGHV3-23*04 F ...",845.0,92.39,182/197 nt,...,+,,,,,,cctatgccatgacctgggtccgccaggctccagggaagggtctgga...,IGHV3-23,IGHJ3,IGHV3-23.IGHJ3.23
1,0,16,2,MS7_20154111042072910943__1_0_0_0_0_0_0_0_0_0_...,IGHA1,productive,Homsap IGHV1-18*01 F,852.0,93.78,181/193 nt,...,+,,,,,,tggtgtcagctgggtgcgacaggcccctggacaagggcttgagtgg...,IGHV1-18,IGHJ4,IGHV1-18.IGHJ4.14
2,0,16,3,MS7_2015411101802120649__0_0_0_0_0_0_1_0_0_0_0...,IGHG2,productive,Homsap IGHV1-2*02 F,960.0,100.00,193/193 nt,...,+,,,,,,ctatatgcactgggtgcgacaggcccctggacaagggcttgagtgg...,IGHV1-2,IGHJ5,IGHV1-2.IGHJ5.13
3,0,16,4,MS7_20154111121956212740__0_0_0_0_0_0_0_0_0_0_...,IGHM,productive,Homsap IGHV2-70*01 F,1177.0,97.19,242/249 nt,...,+,,,,,,acacagaccctcacactgacctgcaccttctctgggttctcactca...,IGHV2-70,IGHJ6,IGHV2-70.IGHJ6.20
4,0,16,5,MS7_2015411107256174723__0_0_0_0_0_1_0_0_0_0_0...,IGHG1,productive,Homsap IGHV4-30-4*01 F,828.0,85.40,193/226 nt,...,+,,,,,,cagtgtctctggtgactacatcagccaaggtgattactactggact...,IGHV4-30-4,IGHJ4,IGHV4-30-4.IGHJ4.18
5,0,16,6,MS7_20154111102474611152__0_0_0_0_0_0_0_0_0_0_...,IGHM,productive,Homsap IGHV3-49*04 F,911.0,94.58,192/203 nt,...,+,,,,,,attatgctatgagttgggtccgccaggctccagggaaggggctgga...,IGHV3-49,IGHJ3,IGHV3-49.IGHJ3.15
6,0,16,7,MS7_201541110596759208__0_0_0_0_0_0_0_0_0_0_0_...,IGHM,productive,Homsap IGHV3-48*03 F,917.0,96.45,190/197 nt,...,+,,,,,,gttatgaaatgaactgggtccgccaggctccagggaaggggctgga...,IGHV3-48,IGHJ4,IGHV3-48.IGHJ4.14
7,0,16,8,MS7_2015412117448121042__0_0_0_0_0_1_0_0_0_0_0...,IGHG1,productive,"Homsap IGHV3-23*01 F, or Homsap IGHV3-23*04 F ...",854.0,92.89,183/197 nt,...,+,,,,,,cttatgccatggcctgggtccgccaggctccagggaagggtctgga...,IGHV3-23,IGHJ3,IGHV3-23.IGHJ3.23
8,0,16,9,MS7_20154111191727013216__0_1_0_0_0_0_0_0_0_0_...,IGHA2,productive,"Homsap IGHV4-4*02 F, or Homsap IGHV4-4*03 F or...",930.0,91.03,203/223 nt,...,+,,,,,,cgctgtctctggtggctccctcagcagtagtgagtggataagttgg...,IGHV4-4,IGHJ5,IGHV4-4.IGHJ5.10
9,0,16,10,MS7_201541211749108038__1_0_0_0_0_0_0_0_0_0_0_...,IGHA1,productive,"Homsap IGHV1-2*02 F, or Homsap IGHV1-2*05 F",861.0,94.30,182/193 nt,...,+,,,,,,ctatattcactgcgtgcgacaggcccctggacaagggcttgagtgg...,IGHV1-2,IGHJ4,IGHV1-2.IGHJ4.14


In [219]:
def _group_shape(group):
    """Return the (n_rows, n_columns) shape of one clonotype's rows."""
    return group.shape


# Shape of the rows belonging to each clonotype (first entry = number of sequences)
summary_df.groupby("clonotype").apply(_group_shape)

clonotype
IGHV1-18.IGHJ1.10      (6, 35)
IGHV1-18.IGHJ1.11      (4, 35)
IGHV1-18.IGHJ1.12     (21, 35)
IGHV1-18.IGHJ1.13     (35, 35)
IGHV1-18.IGHJ1.14     (85, 35)
IGHV1-18.IGHJ1.15     (23, 35)
IGHV1-18.IGHJ1.16     (34, 35)
IGHV1-18.IGHJ1.17     (58, 35)
IGHV1-18.IGHJ1.18      (9, 35)
IGHV1-18.IGHJ1.19     (16, 35)
IGHV1-18.IGHJ1.20      (5, 35)
IGHV1-18.IGHJ1.21      (7, 35)
IGHV1-18.IGHJ1.22      (1, 35)
IGHV1-18.IGHJ1.23      (2, 35)
IGHV1-18.IGHJ1.24      (2, 35)
IGHV1-18.IGHJ1.25      (1, 35)
IGHV1-18.IGHJ1.5       (3, 35)
IGHV1-18.IGHJ1.6       (1, 35)
IGHV1-18.IGHJ1.8       (2, 35)
IGHV1-18.IGHJ1.9       (3, 35)
IGHV1-18.IGHJ2.10      (3, 35)
IGHV1-18.IGHJ2.11      (4, 35)
IGHV1-18.IGHJ2.12      (5, 35)
IGHV1-18.IGHJ2.13     (42, 35)
IGHV1-18.IGHJ2.14     (18, 35)
IGHV1-18.IGHJ2.15     (10, 35)
IGHV1-18.IGHJ2.16     (59, 35)
IGHV1-18.IGHJ2.17     (14, 35)
IGHV1-18.IGHJ2.18     (23, 35)
IGHV1-18.IGHJ2.19     (15, 35)
                        ...   
IGHV7-4-1.IGHJ4.8      (8, 35