In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re

In [55]:
# Get list of sample clonotype summary files (any depth below the data directory).
# glob yields paths in arbitrary filesystem order, so sort them to keep the row
# order of the combined table reproducible from run to run.
summary_files = sorted(
    glob.iglob(
        "/nfs/users/nfs_b/bb9/workspace/rotation2/team115_lustre/data/existing_bcr_data/MalariaSamplesBen/**/*_Summary.txt",
        recursive=True,
    )
)

# Extract sample day and number from the file path,
# e.g. ".../Day28/LEA_S12_Summary.txt" -> day "28", sample "12".
# The dot before "txt" is escaped so that it only matches a literal ".".
p = re.compile(r'Day(\d+)/LEA_S(\d+)_Summary\.txt')

# Search each path once and fail loudly on paths that do not follow the naming
# scheme, instead of crashing later with "'NoneType' has no attribute 'group'".
matches = [p.search(path) for path in summary_files]
unparsed = [path for path, match in zip(summary_files, matches) if match is None]
if unparsed:
    raise ValueError(
        "Cannot extract day/sample number from summary file path(s): {}".format(unparsed)
    )

# Both lists are kept as strings and stay aligned with summary_files.
days = [match.group(1) for match in matches]
sample_nums = [match.group(2) for match in matches]

# Read in all summary files (first 30 columns only) and stack them into one table.
# Each file's rows are keyed by that file's (day, sample number) pair, which
# become the two outer index levels.
summary_df = pd.concat(
    [pd.read_table(f, header=0, usecols=list(range(30))) for f in summary_files],
    # Materialise the pairs: a bare zip() is a one-shot iterator.
    keys=list(zip(days, sample_nums)),
    names=["day", "sample_num"],
)
# Name all three index levels; the innermost one is each file's own row index.
# A plain list is sufficient here, so the private pd.core.base.FrozenList
# (not a public API, and relocated in later pandas releases) is not needed.
summary_df.index.names = ["day", "sample_num", "seq_i"]

# Shape (rows, columns) of each sample's table before filtering
shapes_before = summary_df.groupby(level="sample_num").apply(lambda sample: sample.shape)
print(shapes_before)

# Every distinct value found in the Functionality column
functionality_values = summary_df["Functionality"].unique()
print(functionality_values)

# Keep only rows labelled productive; every other label
# (unproductive, unknown, "No results") is dropped.
productive_labels = ['productive', 'productive (see comment)']
is_productive = summary_df["Functionality"].isin(productive_labels)
summary_df = summary_df[is_productive]

# Shape (rows, columns) of each sample's table after filtering
shapes_after = summary_df.groupby(level="sample_num").apply(lambda sample: sample.shape)
print(shapes_after)

def _first_gene(assignment):
    """Return the gene name of the first assignment in a gene-and-allele string.

    Takes the text before the first ", or ", cuts it at the first "*" (which
    discards the allele and anything after it), and removes the "Homsap "
    prefix, e.g. "Homsap IGHV3-23*01 F, or Homsap IGHV3-23*04 F" -> "IGHV3-23".
    """
    first_call = assignment.split(", or ")[0]
    gene_with_species = first_call.split("*")[0]
    return gene_with_species.replace("Homsap ", "")


# Move every index level (day, sample_num, seq_i) into ordinary columns,
# then discard the per-file row counter.
index_levels = summary_df.index.names
summary_df = summary_df.reset_index(level=index_levels)
summary_df = summary_df.drop("seq_i", axis=1)

# V and J gene per row: first listed assignment, alleles collapsed.
summary_df["V-GENE"] = [_first_gene(call) for call in summary_df["V-GENE and allele"]]
summary_df["J-GENE"] = [_first_gene(call) for call in summary_df["J-GENE and allele"]]

# Shorter alias for the CDR3 length column
summary_df["CDR3_len"] = summary_df["CDR3-IMGT length"]

# day and sample_num were parsed from file names as strings; make them numeric.
numeric_cols = ["day", "sample_num"]
summary_df[numeric_cols] = summary_df[numeric_cols].apply(pd.to_numeric)

sample_num
1      (1455, 30)
10     (8114, 30)
11     (1122, 30)
12      (951, 30)
13     (2047, 30)
14     (1489, 30)
15     (1723, 30)
16     (3060, 30)
17    (23490, 30)
18    (11201, 30)
19    (20764, 30)
2      (1325, 30)
21    (15197, 30)
22    (26972, 30)
23    (50806, 30)
3      (2196, 30)
4       (488, 30)
5       (698, 30)
6       (985, 30)
7      (7545, 30)
8      (2155, 30)
9      (8393, 30)
dtype: object
['productive' 'productive (see comment)' 'unknown (see comment)'
 'unproductive' 'unproductive (see comment)' 'No results']
sample_num
1      (1445, 30)
10     (7974, 30)
11     (1099, 30)
12      (939, 30)
13     (2028, 30)
14     (1474, 30)
15     (1706, 30)
16     (3034, 30)
17    (23235, 30)
18    (11072, 30)
19    (20514, 30)
2      (1309, 30)
21    (15035, 30)
22    (26706, 30)
23    (50187, 30)
3      (2171, 30)
4       (485, 30)
5       (692, 30)
6       (975, 30)
7      (7503, 30)
8      (2145, 30)
9      (8310, 30)
dtype: object


In [52]:
# Clonotype definition: rows sharing the same V gene, J gene and CDR3 length.
# These are the columns (in order) that make up the clonotype key.
clonotype_filter = ["V-GENE", "J-GENE", "CDR3_len"]


def clonotype_format(x):
    """Build the clonotype label "<first>.<second>.<third>" from an iterable.

    The first three items of ``x`` are joined with dots; any further items are
    ignored. Intended to be applied row-wise to the ``clonotype_filter`` columns.
    """
    template = "{}.{}.{}"
    return template.format(*x)

In [56]:
# Known mAbs: load the table and derive the same clonotype key columns.
mAb_df = pd.read_csv("/nfs/users/nfs_b/bb9/workspace/rotation2/team115_lustre/data/existing_bcr_data/MalariaSamplesBen/IgBlast_bnAbs.csv")

# Gene name = first comma-separated entry, cut at the first "*" (drops the allele).
for gene_col, source_col in (("V-GENE", "V gene"), ("J-GENE", "J gene")):
    mAb_df[gene_col] = [call.split(",")[0].split("*")[0] for call in mAb_df[source_col]]

# CDR3 length is the number of characters in the amino acid sequence.
# NOTE(review): confirm this length is defined the same way as "CDR3-IMGT length"
# in the summary files, otherwise the clonotype keys will not line up.
mAb_df["CDR3_len"] = mAb_df["CDR3 amino acid seq"].map(len)
mAb_clonotypes = mAb_df[clonotype_filter].apply(clonotype_format, axis=1)
mAb_df["clonotype"] = mAb_clonotypes

# Same clonotype key for the existing repertoire data
summary_clonotypes = summary_df[clonotype_filter].apply(clonotype_format, axis=1)
summary_df["clonotype"] = summary_clonotypes

# Sample sheet: one table describing the sequenced samples
sample_info_df = pd.read_excel("/nfs/users/nfs_b/bb9/workspace/rotation2/team115_lustre/data/existing_bcr_data/MalariaSamplesBen/Malaria_Samples_SeqInfo.xlsx")

# Attach patient code and cell type by matching sample_num to "Tag Index".
# Left merge: every sequence row is kept even when no sample info matches.
sample_annotations = sample_info_df[["Tag Index", "patient_code", "cell_type"]]
summary_df = summary_df.merge(
    sample_annotations,
    how="left",
    left_on="sample_num",
    right_on="Tag Index",
)

# Attach the name of any known mAb that has the same clonotype key.
# NOTE(review): a left merge repeats a sequence row once per matching right-hand
# row, so several mAbs sharing one clonotype would inflate row counts
# downstream -- confirm whether that is intended.
known_mab_names = mAb_df[["Ab.Name", "clonotype"]]
summary_df = summary_df.merge(known_mab_names, how="left", on="clonotype")
summary_df

Unnamed: 0,day,sample_num,Sequence number,Sequence ID,digest,Functionality,V-GENE and allele,V-REGION score,V-REGION identity %,V-REGION identity nt,...,V-REGION deletions,Sequence,V-GENE,J-GENE,CDR3_len,clonotype,Tag Index,patient_code,cell_type,Ab.Name
0,0,16,1,MS7_20154111092026411880__0_0_0_0_0_1_0_0_0_0_...,IGHG1,productive,"Homsap IGHV3-23*01 F, or Homsap IGHV3-23*04 F ...",845.0,92.39,182/197 nt,...,,cctatgccatgacctgggtccgccaggctccagggaagggtctgga...,IGHV3-23,IGHJ3,23,IGHV3-23.IGHJ3.23,16,2207,plasma,
1,0,16,2,MS7_20154111042072910943__1_0_0_0_0_0_0_0_0_0_...,IGHA1,productive,Homsap IGHV1-18*01 F,852.0,93.78,181/193 nt,...,,tggtgtcagctgggtgcgacaggcccctggacaagggcttgagtgg...,IGHV1-18,IGHJ4,14,IGHV1-18.IGHJ4.14,16,2207,plasma,
2,0,16,3,MS7_2015411101802120649__0_0_0_0_0_0_1_0_0_0_0...,IGHG2,productive,Homsap IGHV1-2*02 F,960.0,100.00,193/193 nt,...,,ctatatgcactgggtgcgacaggcccctggacaagggcttgagtgg...,IGHV1-2,IGHJ5,13,IGHV1-2.IGHJ5.13,16,2207,plasma,
3,0,16,4,MS7_20154111121956212740__0_0_0_0_0_0_0_0_0_0_...,IGHM,productive,Homsap IGHV2-70*01 F,1177.0,97.19,242/249 nt,...,,acacagaccctcacactgacctgcaccttctctgggttctcactca...,IGHV2-70,IGHJ6,20,IGHV2-70.IGHJ6.20,16,2207,plasma,
4,0,16,5,MS7_2015411107256174723__0_0_0_0_0_1_0_0_0_0_0...,IGHG1,productive,Homsap IGHV4-30-4*01 F,828.0,85.40,193/226 nt,...,,cagtgtctctggtgactacatcagccaaggtgattactactggact...,IGHV4-30-4,IGHJ4,18,IGHV4-30-4.IGHJ4.18,16,2207,plasma,
5,0,16,6,MS7_20154111102474611152__0_0_0_0_0_0_0_0_0_0_...,IGHM,productive,Homsap IGHV3-49*04 F,911.0,94.58,192/203 nt,...,,attatgctatgagttgggtccgccaggctccagggaaggggctgga...,IGHV3-49,IGHJ3,15,IGHV3-49.IGHJ3.15,16,2207,plasma,
6,0,16,7,MS7_201541110596759208__0_0_0_0_0_0_0_0_0_0_0_...,IGHM,productive,Homsap IGHV3-48*03 F,917.0,96.45,190/197 nt,...,,gttatgaaatgaactgggtccgccaggctccagggaaggggctgga...,IGHV3-48,IGHJ4,14,IGHV3-48.IGHJ4.14,16,2207,plasma,
7,0,16,8,MS7_2015412117448121042__0_0_0_0_0_1_0_0_0_0_0...,IGHG1,productive,"Homsap IGHV3-23*01 F, or Homsap IGHV3-23*04 F ...",854.0,92.89,183/197 nt,...,,cttatgccatggcctgggtccgccaggctccagggaagggtctgga...,IGHV3-23,IGHJ3,23,IGHV3-23.IGHJ3.23,16,2207,plasma,
8,0,16,9,MS7_20154111191727013216__0_1_0_0_0_0_0_0_0_0_...,IGHA2,productive,"Homsap IGHV4-4*02 F, or Homsap IGHV4-4*03 F or...",930.0,91.03,203/223 nt,...,,cgctgtctctggtggctccctcagcagtagtgagtggataagttgg...,IGHV4-4,IGHJ5,10,IGHV4-4.IGHJ5.10,16,2207,plasma,
9,0,16,10,MS7_201541211749108038__1_0_0_0_0_0_0_0_0_0_0_...,IGHA1,productive,"Homsap IGHV1-2*02 F, or Homsap IGHV1-2*05 F",861.0,94.30,182/193 nt,...,,ctatattcactgcgtgcgacaggcccctggacaagggcttgagtgg...,IGHV1-2,IGHJ4,14,IGHV1-2.IGHJ4.14,16,2207,plasma,



# Questions to ask

How is immunity to the vaccine candidate achieved?

- due to antibodies... which we detect via expansion and mutational frequency


- V(D)J gene frequencies, Ig isotype usage, and BCR clone size, stratified by timepoint and patient
- Look for expansion of clonotypes in day 63 and 140 vs. day 0
- Compare expanded clonotypes to known Ab sequences
- Look for differences between patients at day 0
- Calculate mutational frequencies of clonotypes

Plots:

- Area/bar/parallel coords chart showing expansion of clonotypes (known V genes?) over time

## Comparison across individuals at pre-prime (day 0)

RH5 and Duffy samples

- Distribution of clonotypes between individuals
- How diverse is the repertoire? Gini index?
- Run comparison across cell types

## Clonotypes at early post prime (day 28)

Look for:

- clonotype freq
- isotype distribution
- mutational freq (1-% identity) vs. IMGT V gene reference
    - linear regression of BCR mutational frequency against clonotype V gene status
    - mutational freq stratified by isotype
    
In the compartments:

- For memory cells vs day 0
- For plasmablasts vs naive repertoire (IgD/IgM unmutated sequences taken from PBMC samples at Day 0)

- Are expanded/mutated clonotypes the same as the
    - known anti-RH5 Ab sequences
    - ones observed in AMA1 and Duffy trials
    - ones observed in influenza and other infection challenge

## Clonotypes at early post boost (day 63)

Look for:

- clonotype freq
- isotype distribution
- mutational freq (1-% identity) vs. IMGT V gene reference
    - linear regression of BCR mutational frequency against clonotype V gene status
    - mutational freq stratified by isotype
    
In the compartments:

- For memory cells vs day 0 and day 28
- For plasmablasts vs plasmablasts at day 0 and day 28
    - activation of same clonotypes during prime and boost?
- For plasmablasts vs naive repertoire at day 0 and day 28
- For plasmablasts vs memory repertoire at day 0 and day 28
    - recall of memory cells generated at the prime?

- clonotype overlap between (TODO: specify which compartments/timepoints are compared)

## Clonotypes at long term (day 140)

Look for:

- clonotype freq

In the compartments:

- For plasmablasts vs memory repertoire at day 63
    - how much of the boost response is detectable in long term memory?

## Identification of malaria specific mAbs

- Overlap with known anti-RH5 mAbs (isolated from day 63), or clonotypes appearing in other infections/vaccinations in the literature
- Can we expect cross-reactivity in clonotypes that would affect the inferences based on the BCR repertoire data? 



In [66]:
# Day 0: distribution of clonotypes between individuals
# Shape (rows, columns) of the day 0 data for each patient.
day0_df = summary_df.query("day == 0")
day0_df.groupby("patient_code").apply(lambda patient: patient.shape)

patient_code
1017    (25989, 40)
1019    (22552, 40)
2207    (31446, 40)
dtype: object

In [123]:
%matplotlib notebook

foo = summary_df.query("day == 0").groupby(["patient_code", "V-GENE"]).size().unstack().fillna(0)
foo["patient_code"] = foo["patient_code"]foo["patient_code"] 
print(foo)

foo.plot(kind="line", stacked=True, legend=None)
# Normalise by the number of day 0 reads associated with each patient?


V-GENE        IGHV1-18  IGHV1-2  IGHV1-24  IGHV1-3  IGHV1-45  IGHV1-46  \
patient_code                                                             
1017            2533.0    384.0     327.0    147.0       0.0     175.0   
1019            1945.0    311.0     319.0    176.0       0.0     205.0   
2207            2778.0   3213.0     386.0     83.0      11.0     659.0   

V-GENE        IGHV1-58  IGHV1-68  IGHV1-69  IGHV1-69-2    ...      IGHV4-34  \
patient_code                                              ...                 
1017             189.0       1.0     913.0         0.0    ...        2160.0   
1019             189.0       0.0     900.0         1.0    ...        2162.0   
2207             145.0       0.0     562.0         0.0    ...         780.0   

V-GENE        IGHV4-38-2  IGHV4-39  IGHV4-4  IGHV4-59  IGHV4-61  IGHV5-10-1  \
patient_code                                                                  
1017               492.0    1092.0    382.0    1490.0     444.0        12.0

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fd99b55f6d8>

In [None]:
# Scratch cell: the bare help() call was removed because, without an argument,
# it opens the interactive help prompt and waits for input, which stalls a
# full re-run of the notebook. Use help(obj) on a specific object while developing.