In [1]:
import pandas as pd

In [2]:
## Load the original tab-separated sample sheet
samples_file = "samples.demo.txt"
samples = pd.read_csv(filepath_or_buffer=samples_file, sep="\t")
# Leave the frame as the last expression so the notebook renders it
samples

Unnamed: 0,Name,r1
0,Treated_1,Treated_1.fastq.gz
1,Treated_2,Treated_2.fastq.gz
2,Treated_3,Treated_3.fastq.gz
3,Control_1,Control_1.fastq.gz
4,Control_2,Control_2.fastq.gz
5,Control_3,Control_3.fastq.gz


In [3]:
## First, let's add groups to the samples
# The group is the part of each sample name before the first underscore
# (e.g. "Treated_1" -> "Treated").
name_parts = samples["Name"].str.split("_")
samples["group"] = name_parts.str[0]
samples


Unnamed: 0,Name,r1,group
0,Treated_1,Treated_1.fastq.gz,Treated
1,Treated_2,Treated_2.fastq.gz,Treated
2,Treated_3,Treated_3.fastq.gz,Treated
3,Control_1,Control_1.fastq.gz,Control
4,Control_2,Control_2.fastq.gz,Control
5,Control_3,Control_3.fastq.gz,Control


In [4]:
# Save the sample sheet (now including the "group" column) as a
# tab-separated file, leaving out the row index
samples.to_csv(
    "04_replot.ipynb.samples.txt",
    index=False,
    sep="\t",
)

In [5]:
# Next, load the tab-separated aggregated stats table from the CRISPRessoSea output directory
agg_stats_file = "CRISPRessoSea_output_on_samples.demo.txt/aggregated_stats_all.txt"
agg_stats = pd.read_csv(filepath_or_buffer=agg_stats_file, sep="\t")
# Leave the frame as the last expression so the notebook renders it
agg_stats

Unnamed: 0,target_id,target_name,sort_index,target_chr,target_pos,target_seq_no_gaps_with_pam,target_seq_no_gaps,target_seq_with_gaps,target_pam,ontarget_name,...,Control_2_pooled_result_name,Control_2_highest_a_g_pct,Control_2_highest_c_t_pct,Control_2_highest_indel_pct,Control_2_tot_reads,Control_3_pooled_result_name,Control_3_highest_a_g_pct,Control_3_highest_c_t_pct,Control_3_highest_indel_pct,Control_3_tot_reads
0,0_CTLA4_site9_ON,CTLA4_site9_ON_CTLA4_site0_500,0,CTLA4_site0,500,GGACTGAGGGCCATGGACACGGG,GGACTGAGGGCCATGGACAC,GGACTGAGGGCCATGGACAC,GGG,CTLA4_site9,...,CTLA4_site0_425_675_CTLA4_site9_ON_CTLA4_site0...,0.0,0.0,0.0,117.0,CTLA4_site0_425_675_CTLA4_site9_ON_CTLA4_site0...,0.0,0.833333,0.0,120.0
1,1_CTLA4_site9_OB2,CTLA4_site9_OB2_CTLA4_site1_500,1,CTLA4_site1,500,GGACaGAGGGCCcTGGACACAGG,GGACaGAGGGCCcTGGACAC,GGACaGAGGGCCcTGGACAC,AGG,CTLA4_site9,...,CTLA4_site1_358_599_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,34.0,CTLA4_site1_358_599_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,32.0
2,2_CTLA4_site9_OB2,CTLA4_site9_OB2_CTLA4_site2_500,2,CTLA4_site2,500,GGAaTGAGGcCCATGGACACTGG,GGAaTGAGGcCCATGGACAC,GGAaTGAGGcCCATGGACAC,TGG,CTLA4_site9,...,CTLA4_site2_374_614_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,149.0,CTLA4_site2_374_614_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,145.0
3,3_CTLA4_site9_OB2,CTLA4_site9_OB2_CTLA4_site3_500,3,CTLA4_site3,500,GGACTGgGGGCCtTGGACACAGG,GGACTGgGGGCCtTGGACAC,GGACTGgGGGCCtTGGACAC,AGG,CTLA4_site9,...,CTLA4_site3_425_675_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,81.0,CTLA4_site3_425_675_CTLA4_site9_OB2_CTLA4_site...,1.098901,0.0,0.0,91.0


In [6]:
# site1 (index label 1, i.e. the second row) has the least editing, so move it
# to the bottom of the table.
# Select by label with .loc instead of reindex: reindex would silently insert an
# all-NaN row for any label missing from the index, whereas .loc raises KeyError,
# so an unexpected stats file fails here rather than producing a corrupt table.
target_row_order = [0, 2, 3, 1]
agg_stats = agg_stats.loc[target_row_order]
agg_stats

Unnamed: 0,target_id,target_name,sort_index,target_chr,target_pos,target_seq_no_gaps_with_pam,target_seq_no_gaps,target_seq_with_gaps,target_pam,ontarget_name,...,Control_2_pooled_result_name,Control_2_highest_a_g_pct,Control_2_highest_c_t_pct,Control_2_highest_indel_pct,Control_2_tot_reads,Control_3_pooled_result_name,Control_3_highest_a_g_pct,Control_3_highest_c_t_pct,Control_3_highest_indel_pct,Control_3_tot_reads
0,0_CTLA4_site9_ON,CTLA4_site9_ON_CTLA4_site0_500,0,CTLA4_site0,500,GGACTGAGGGCCATGGACACGGG,GGACTGAGGGCCATGGACAC,GGACTGAGGGCCATGGACAC,GGG,CTLA4_site9,...,CTLA4_site0_425_675_CTLA4_site9_ON_CTLA4_site0...,0.0,0.0,0.0,117.0,CTLA4_site0_425_675_CTLA4_site9_ON_CTLA4_site0...,0.0,0.833333,0.0,120.0
2,2_CTLA4_site9_OB2,CTLA4_site9_OB2_CTLA4_site2_500,2,CTLA4_site2,500,GGAaTGAGGcCCATGGACACTGG,GGAaTGAGGcCCATGGACAC,GGAaTGAGGcCCATGGACAC,TGG,CTLA4_site9,...,CTLA4_site2_374_614_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,149.0,CTLA4_site2_374_614_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,145.0
3,3_CTLA4_site9_OB2,CTLA4_site9_OB2_CTLA4_site3_500,3,CTLA4_site3,500,GGACTGgGGGCCtTGGACACAGG,GGACTGgGGGCCtTGGACAC,GGACTGgGGGCCtTGGACAC,AGG,CTLA4_site9,...,CTLA4_site3_425_675_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,81.0,CTLA4_site3_425_675_CTLA4_site9_OB2_CTLA4_site...,1.098901,0.0,0.0,91.0
1,1_CTLA4_site9_OB2,CTLA4_site9_OB2_CTLA4_site1_500,1,CTLA4_site1,500,GGACaGAGGGCCcTGGACACAGG,GGACaGAGGGCCcTGGACAC,GGACaGAGGGCCcTGGACAC,AGG,CTLA4_site9,...,CTLA4_site1_358_599_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,34.0,CTLA4_site1_358_599_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,32.0


In [7]:
# Now, set custom names for the sites.
# The labels are keyed on target_id rather than assigned by row position, so
# each site receives its intended name regardless of the current row order.
custom_target_names = {
    "0_CTLA4_site9_ON": "chr2:203MB ON",
    "2_CTLA4_site9_OB2": "chr5:101MB OB2",
    "3_CTLA4_site9_OB2": "chr9:121MB OB2",
    "1_CTLA4_site9_OB2": "chr9:135MB OB2",
}
# Fail loudly if the table contains a target with no custom name; otherwise
# .map would leave a NaN target_name that ends up in the written stats file.
unnamed_targets = set(agg_stats["target_id"]) - set(custom_target_names)
assert not unnamed_targets, f"No custom name defined for: {sorted(unnamed_targets)}"
agg_stats["target_name"] = agg_stats["target_id"].map(custom_target_names)
agg_stats

Unnamed: 0,target_id,target_name,sort_index,target_chr,target_pos,target_seq_no_gaps_with_pam,target_seq_no_gaps,target_seq_with_gaps,target_pam,ontarget_name,...,Control_2_pooled_result_name,Control_2_highest_a_g_pct,Control_2_highest_c_t_pct,Control_2_highest_indel_pct,Control_2_tot_reads,Control_3_pooled_result_name,Control_3_highest_a_g_pct,Control_3_highest_c_t_pct,Control_3_highest_indel_pct,Control_3_tot_reads
0,0_CTLA4_site9_ON,chr2:203MB ON,0,CTLA4_site0,500,GGACTGAGGGCCATGGACACGGG,GGACTGAGGGCCATGGACAC,GGACTGAGGGCCATGGACAC,GGG,CTLA4_site9,...,CTLA4_site0_425_675_CTLA4_site9_ON_CTLA4_site0...,0.0,0.0,0.0,117.0,CTLA4_site0_425_675_CTLA4_site9_ON_CTLA4_site0...,0.0,0.833333,0.0,120.0
2,2_CTLA4_site9_OB2,chr5:101MB OB2,2,CTLA4_site2,500,GGAaTGAGGcCCATGGACACTGG,GGAaTGAGGcCCATGGACAC,GGAaTGAGGcCCATGGACAC,TGG,CTLA4_site9,...,CTLA4_site2_374_614_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,149.0,CTLA4_site2_374_614_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,145.0
3,3_CTLA4_site9_OB2,chr9:121MB OB2,3,CTLA4_site3,500,GGACTGgGGGCCtTGGACACAGG,GGACTGgGGGCCtTGGACAC,GGACTGgGGGCCtTGGACAC,AGG,CTLA4_site9,...,CTLA4_site3_425_675_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,81.0,CTLA4_site3_425_675_CTLA4_site9_OB2_CTLA4_site...,1.098901,0.0,0.0,91.0
1,1_CTLA4_site9_OB2,chr9:135MB OB2,1,CTLA4_site1,500,GGACaGAGGGCCcTGGACACAGG,GGACaGAGGGCCcTGGACAC,GGACaGAGGGCCcTGGACAC,AGG,CTLA4_site9,...,CTLA4_site1_358_599_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,34.0,CTLA4_site1_358_599_CTLA4_site9_OB2_CTLA4_site...,0.0,0.0,0.0,32.0


In [8]:
# Save the reordered, renamed stats table as a tab-separated file
# (row index omitted) for the processing step that follows
agg_stats.to_csv(
    "04_replot.ipynb.agg_stats.txt",
    index=False,
    sep="\t",
)

In [None]:
# Call Replot and add statistical tests.
# Inputs are the two files written earlier in this notebook:
#   --reordered_stats_file   the stats table with reordered rows and custom target names
#   --reordered_sample_file  the sample sheet that includes the "group" column
# --sig_method_parameters t_test,Control,Treated,0.05 requests a t-test comparing the
# Control and Treated groups; 0.05 is presumably the significance cutoff
# (TODO confirm against the CRISPRessoSea documentation).
# Per the run log, p-value tables and heatmaps are saved under the -o directory
# (04_replot.ipynb.output).
!CRISPRessoSea Replot -o 04_replot.ipynb.output --reordered_stats_file 04_replot.ipynb.agg_stats.txt --reordered_sample_file 04_replot.ipynb.samples.txt --sig_method_parameters t_test,Control,Treated,0.05 

INFO  @ Mon, 30 Jun 2025 17:30:45 (20.0% done):
	 Plotting for 4 targets 

INFO  @ Mon, 30 Jun 2025 17:30:46 (20.0% done):
	 Saved p-values to 04_replot.ipynb.output/CRISPRessoSea_all_highest_a_g_pct.p_values.txt 

INFO  @ Mon, 30 Jun 2025 17:30:46 (20.0% done):
	 Plotted heatmap to 04_replot.ipynb.output/CRISPRessoSea_all_highest_a_g_pct 

INFO  @ Mon, 30 Jun 2025 17:30:49 (20.0% done):
	 Saved p-values to 04_replot.ipynb.output/CRISPRessoSea_all_highest_c_t_pct.p_values.txt 

INFO  @ Mon, 30 Jun 2025 17:30:49 (20.0% done):
	 Plotted heatmap to 04_replot.ipynb.output/CRISPRessoSea_all_highest_c_t_pct 

INFO  @ Mon, 30 Jun 2025 17:30:50 (20.0% done):
	 Saved p-values to 04_replot.ipynb.output/CRISPRessoSea_all_highest_indel_pct.p_values.txt 

INFO  @ Mon, 30 Jun 2025 17:30:51 (20.0% done):
	 Plotted heatmap to 04_replot.ipynb.output/CRISPRessoSea_all_highest_indel_pct 

INFO  @ Mon, 30 Jun 2025 17:30:52 (20.0% done):
	 Saved p-values to 04_replot.ipynb.output/CRISPRessoSea_all_tot_read

In [10]:
# The p-values are available in the output tables; load the indel one here
p_vals_file = "04_replot.ipynb.output/CRISPRessoSea_all_highest_indel_pct.p_values.txt"
p_vals = pd.read_csv(filepath_or_buffer=p_vals_file, sep="\t")
p_vals

Unnamed: 0,target_name,Control_mean,Treated_mean,t_stat,p_val,test,bh_adj_p_val,significant,Treated_1,Treated_2,Treated_3,Control_1,Control_2,Control_3
0,chr2:203MB ON,0.0,71.862506,-17.980458,0.003079,Welch T-test,0.012315,True,65.625,79.310345,70.652174,0.0,0.0,0.0
1,chr5:101MB OB2,0.0,17.634946,-12.019909,0.00685,Welch T-test,0.013701,True,14.864865,18.181818,19.858156,0.0,0.0,0.0
2,chr9:121MB OB2,0.0,1.611688,-4.060365,0.055641,Welch T-test,0.074188,False,1.298701,1.136364,2.4,0.0,0.0,0.0
3,chr9:135MB OB2,0.0,0.793651,-1.0,0.42265,Welch T-test,0.42265,False,0.0,0.0,2.380952,0.0,0.0,0.0
