## Analysis of Cas13a guides
by Duo peng  
  


In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re
from pathlib import Path

## Read the guides table

In [2]:
# Raw guides table: a headerless, tab-separated file in the current working directory.
guide_table = Path.cwd() / "guides_table.tsv"
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The default chunked parser emitted a warning on this
# call (presumably a mixed-type DtypeWarning -- confirm on the next full run).
df_guides = pd.read_csv(guide_table, sep='\t', header=None, low_memory=False)


  df_guides = pd.read_csv(guide_table, sep='\t', header=None)


In [3]:
# Name the columns of the headerless table.
df_guides.columns = ["segment (from sequence desc.)", 'guide', 'segment:targeted_variant_count',"subtype_targeted_count", "strain_targeted_count", "sequenceID (bvbrc)","start","target","spacer","strand","GC_content","A_content"]

# Remove rows whose guide is a placeholder (20 x "n", 20 x "a") or the literal
# string "target", then renumber the rows 0..n-1. Resetting the index right away
# also yields a fresh frame, so the column assignments below do not act on a slice.
df_guides = df_guides[
    ~df_guides['guide'].isin(["nnnnnnnnnnnnnnnnnnnn", "aaaaaaaaaaaaaaaaaaaa", "target"])
].reset_index(drop=True)

# Normalize the segment label to an integer-like string (e.g. "6") and keep missing
# values as NaN: NaN is temporarily filled with 0 so the int cast works, then "0"
# is turned back into NaN. NOTE: a genuine segment value of 0 would also become NaN.
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on df_guides[col] is chained assignment, which acts on
# a temporary object and leaves the DataFrame unchanged under pandas Copy-on-Write.
df_guides["segment (from sequence desc.)"] = (
    df_guides["segment (from sequence desc.)"]
    .fillna(0)
    .astype(int)
    .astype(str)
    .replace("0", np.nan)
)

# Represent missing per-segment counts with pd.NA.
df_guides["segment:targeted_variant_count"] = df_guides["segment:targeted_variant_count"].fillna(pd.NA)

# Rows with a missing segment label are kept; they are not dropped here.

In [4]:
# Sanity check: is the per-segment count string missing in the last row?
# A relative position replaces the hard-coded 14249235, which is one past the last
# row index (14249234) shown when the table is displayed and would be out of bounds.
# NOTE(review): confirm the last row is the row this check was meant to inspect.
pd.isna(df_guides['segment:targeted_variant_count'].iloc[-1])

True

Compute the total number of segment variants targeted by each guide by summing the counts in the `segment:targeted_variant_count` column.

In [5]:
def compute_total_seg_targeted(input_string):
    """Sum the counts in a ``"segment:count|segment:count|..."`` string.

    Parameters
    ----------
    input_string : str or missing value
        Pipe-separated ``segment:count`` pairs, e.g. ``"6:9|8:886"``.
        A bare ``":"`` pair and empty fragments are ignored.

    Returns
    -------
    int
        Sum of all counts; 0 when ``input_string`` is missing (NaN, None, pd.NA).

    Raises
    ------
    ValueError
        If a non-empty pair does not contain exactly one ":" or its count
        cannot be parsed as an integer.
    """
    if pd.isna(input_string):
        return 0
    total = 0
    for pair in input_string.split("|"):
        # ":" carries no count; "" comes from an empty string or a stray "|".
        if pair in ("", ":"):
            continue
        _segment, count = pair.split(":")
        total += int(count)
    return total

# DataFrame.insert raises ValueError when the column already exists, so drop any
# copy left by a previous run of this cell to keep the cell re-runnable.
if "total_count_targeted_segment" in df_guides.columns:
    df_guides.drop(columns="total_count_targeted_segment", inplace=True)

# Place the per-guide total at position 3, right after "segment:targeted_variant_count".
df_guides.insert(3, "total_count_targeted_segment", 
                df_guides["segment:targeted_variant_count"].apply(compute_total_seg_targeted))



In [6]:
# Display the guides table (pandas truncates the rendering to the first and last rows).
df_guides

Unnamed: 0,segment (from sequence desc.),guide,segment:targeted_variant_count,total_count_targeted_segment,subtype_targeted_count,strain_targeted_count,sequenceID (bvbrc),start,target,spacer,strand,GC_content,A_content
0,6,aacagcttcgaacagataac,6:9|8:886,895,23,287,CY164524,760,aacagcttcgaacagataac,guuaucuguucgaagcuguu,+,0.73,0.15
1,6,acagaaaacagcttcgaaca,6:9|8:786,795,7,144,CY146648,725,acagaaaacagcttcgaaca,uguucgaagcuguuuucugu,+,0.80,0.10
2,6,ttcgaacagataacgtttat,6:9|8:771,780,25,311,CY164524,766,ttcgaacagataacgtttat,auaaacguuaucuguucgaa,+,0.46,0.35
3,6,tcgaacagataacgtttatg,6:9|8:771,780,25,310,CY164524,767,tcgaacagataacgtttatg,cauaaacguuaucuguucga,+,0.54,0.30
4,6,cgaacagataacgtttatgc,6:9|8:771,780,25,310,CY164524,768,cgaacagataacgtttatgc,gcauaaacguuaucuguucg,+,0.62,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14249231,,tgacaataatgtattgtccc,,0,0,1,CY155150,540,tgacaataatgtattgtccc,gggacaauacauuauuguca,+,0.50,0.35
14249232,,gacaataatgtattgtccct,,0,0,1,CY155150,541,gacaataatgtattgtccct,agggacaauacauuauuguc,+,0.50,0.35
14249233,,acaataatgtattgtccctg,,0,0,1,CY155150,542,acaataatgtattgtccctg,cagggacaauacauuauugu,+,0.50,0.35
14249234,,caataatgtattgtccctga,,0,0,1,CY155150,543,caataatgtattgtccctga,ucagggacaauacauuauug,+,0.50,0.35


In [7]:
# Save the table as a tab-separated file in the working directory, without the row index.
df_guides.to_csv(Path.cwd().joinpath("guides_table_parsed.tsv"), sep="\t", index=False)

In [8]:
# Order guides from most to fewest targeted strains (the table is sorted in place).
df_guides.sort_values("strain_targeted_count", ascending=False, inplace=True)

In [9]:
# Display the guides table again, now ordered by strain_targeted_count (descending).
df_guides

Unnamed: 0,segment (from sequence desc.),guide,segment:targeted_variant_count,total_count_targeted_segment,subtype_targeted_count,strain_targeted_count,sequenceID (bvbrc),start,target,spacer,strand,GC_content,A_content
2420529,6,tcaggccccctcaaagccga,4:1|6:13|7:32075,32089,72,8932,CY129919,63,tcaggccccctcaaagccga,ucggcuuugagggggccuga,+,0.87,0.10
10379293,7,tcaggccccctcaaagccga,7:103096|4:4|6:13|1:1,103114,72,8932,CY129919,63,tcaggccccctcaaagccga,ucggcuuugagggggccuga,+,0.87,0.10
12786183,,tcaggccccctcaaagccga,7:4527808|4:372|6:1090|1:51,4529321,72,8932,CY129919,63,tcaggccccctcaaagccga,ucggcuuugagggggccuga,+,0.87,0.10
4568103,1,tcaggccccctcaaagccga,1:1|7:10490,10491,72,8932,CY129919,63,tcaggccccctcaaagccga,ucggcuuugagggggccuga,+,0.87,0.10
5946158,4,tcaggccccctcaaagccga,4:4|7:18644|6:5,18653,72,8932,CY129919,63,tcaggccccctcaaagccga,ucggcuuugagggggccuga,+,0.87,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6269992,4,agtgtggtagaaactagaat,4:32,32,0,1,KP766653,1225,agtgtggtagaaactagaat,auucuaguuucuaccacacu,+,0.58,0.25
6269993,4,tgtggtagaaactagaatta,4:32,32,0,1,KP766653,1227,tgtggtagaaactagaatta,uaauucuaguuucuaccaca,+,0.50,0.30
6270031,4,tggccaaaatgtacaagcag,4:32,32,0,1,KP766655,1181,tggccaaaatgtacaagcag,cugcuuguacauuuuggcca,+,0.75,0.15
6270032,4,ggccaaaatgtacaagcagt,4:32,32,0,1,KP766655,1182,ggccaaaatgtacaagcagt,acugcuuguacauuuuggcc,+,0.75,0.15
