In [6]:
import os
import random
import pandas as pd
import numpy as np

In [7]:
input_folder = 'data/fig1_saturated'
output_folder = 'data/output'
poolQ_folder = 'data/poolQ'
ref_name = 'sgRNA_ref.csv'
condition_name = 'conditions.csv'

In [8]:
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

In [9]:
def run_poolQ(input_folder, poolQ_folder, ref_name, condition_name):
    
    # Use all fastq.gz files in the folder.
    listed_files = os.listdir(input_folder)
    fastq_file_names = [i for i in listed_files if 'fastq.gz' in i]
    template_seq = 'caccgNNNNNNNNNNNNNNNNNNNNgtttt'
    
    condition_path = os.path.join(input_folder, condition_name)
    ref_path = os.path.join(input_folder, ref_name)
    
    combined_res = pd.read_csv(ref_path, header=None)
    combined_res.columns = ['Row Barcode', 'Row Barcode IDs']

    for i in range(len(fastq_file_names)):
        file_name_1 = fastq_file_names[i]
        file_name_2 = file_name_1.replace("-", "_")
        name_split = file_name_2.split('_')
        ID = name_split[1]
        end_dir = name_split[2]
        short_name = '_'.join([end_dir, ID])
        
        # This is to run the command line
        fastq_path = os.path.join(input_folder, file_name_1)
        cmd = f'''{poolQ_folder}/poolq3.sh --reads {fastq_path} --col-reference {condition_path} --row-reference {ref_path} --row-barcode-policy TEMPLATE:{template_seq}@0 --col-barcode-policy FIXED:0'''
        !{cmd}

        temp_res = pd.read_table('counts.txt', header=0)
        temp_res = temp_res[['Row Barcode', 'exp']]
        temp_res.columns = ['Row Barcode', short_name]
        combined_res = combined_res.merge(temp_res, on='Row Barcode', how='left')
    
    return combined_res

In [10]:
combined_res = run_poolQ(input_folder, poolQ_folder, ref_name, condition_name)

[34m[INFO][0;39m PoolQ runtime configuration summary
Input files:
	Conditions file:	data/fig2_global_CYS063/conditions.csv
	Reference file:	data/fig2_global_CYS063/CYS063_ref.csv
	Reads file:	data/fig2_global_CYS063/global_file49_R1_001.fastq.gz
Row barcode match function:	mismatch
Column barcode match function:	exact
Row barcode policy:	TEMPLATE:caccgNNNNNNNNNNNNNNNNNNNNgtttt@0
Col barcode policy:	FIXED:0

[34m[INFO][0;39m Reading row reference data
[34m[INFO][0;39m Reading column reference data
[34m[INFO][0;39m Reading global reference data
[34m[INFO][0;39m Building row reference
[34m[INFO][0;39m Building column reference
[34m[INFO][0;39m Writing unexpected sequence cache files to /var/folders/py/mpy80_vj2lgg0bk277c880t80000gn/T/unexpected-sequence-cache7419292846104611403
[34m[INFO][0;39m Beginning task processing.
[34m[INFO][0;39m Processed 5000000 reads in 8757 ms (570.9718 reads/ms). Match percent: 96.42354; queue size: 100
[34m[INFO][0;39m Processed 10000000 

[34m[INFO][0;39m Shutting down.
[34m[INFO][0;39m Processed 36372864 reads in 64610 ms (562.96027 reads/ms). Match percent: 95.88931; queue size: 0
[34m[INFO][0;39m Writing counts file counts.txt
[34m[INFO][0;39m Writing quality file quality.txt
[34m[INFO][0;39m Writing log-normalized counts file lognormalized-counts.txt
[34m[INFO][0;39m Writing barcode counts file barcode-counts.txt
[34m[INFO][0;39m Writing correlation file correlation.txt
[31m[WARN][0;39m Skipping correlation file for trivial dataset (1 columns and 23995 rows)
[34m[INFO][0;39m Writing unexpected sequence report unexpected-sequences.txt
[34m[INFO][0;39m Writing run info unexpected-sequences.txt
[34m[INFO][0;39m PoolQ complete
[34m[INFO][0;39m PoolQ runtime configuration summary
Input files:
	Conditions file:	data/fig2_global_CYS063/conditions.csv
	Reference file:	data/fig2_global_CYS063/CYS063_ref.csv
	Reads file:	data/fig2_global_CYS063/global_file18_R1_001.fastq.gz
Row barcode match function:	

[34m[INFO][0;39m Writing quality file quality.txt
[34m[INFO][0;39m Writing log-normalized counts file lognormalized-counts.txt
[34m[INFO][0;39m Writing barcode counts file barcode-counts.txt
[34m[INFO][0;39m Writing correlation file correlation.txt
[31m[WARN][0;39m Skipping correlation file for trivial dataset (1 columns and 23995 rows)
[34m[INFO][0;39m Writing unexpected sequence report unexpected-sequences.txt
[34m[INFO][0;39m Writing run info unexpected-sequences.txt
[34m[INFO][0;39m PoolQ complete
[34m[INFO][0;39m PoolQ runtime configuration summary
Input files:
	Conditions file:	data/fig2_global_CYS063/conditions.csv
	Reference file:	data/fig2_global_CYS063/CYS063_ref.csv
	Reads file:	data/fig2_global_CYS063/global_file40_R1_001.fastq.gz
Row barcode match function:	mismatch
Column barcode match function:	exact
Row barcode policy:	TEMPLATE:caccgNNNNNNNNNNNNNNNNNNNNgtttt@0
Col barcode policy:	FIXED:0

[34m[INFO][0;39m Reading row reference data
[34m[INFO][0;39m

[34m[INFO][0;39m Writing correlation file correlation.txt
[31m[WARN][0;39m Skipping correlation file for trivial dataset (1 columns and 23995 rows)
[34m[INFO][0;39m Writing unexpected sequence report unexpected-sequences.txt
[34m[INFO][0;39m Writing run info unexpected-sequences.txt
[34m[INFO][0;39m PoolQ complete
[34m[INFO][0;39m PoolQ runtime configuration summary
Input files:
	Conditions file:	data/fig2_global_CYS063/conditions.csv
	Reference file:	data/fig2_global_CYS063/CYS063_ref.csv
	Reads file:	data/fig2_global_CYS063/global_file24_R1_001.fastq.gz
Row barcode match function:	mismatch
Column barcode match function:	exact
Row barcode policy:	TEMPLATE:caccgNNNNNNNNNNNNNNNNNNNNgtttt@0
Col barcode policy:	FIXED:0

[34m[INFO][0;39m Reading row reference data
[34m[INFO][0;39m Reading column reference data
[34m[INFO][0;39m Reading global reference data
[34m[INFO][0;39m Building row reference
[34m[INFO][0;39m Building column reference
[34m[INFO][0;39m Writing une

[34m[INFO][0;39m Writing quality file quality.txt
[34m[INFO][0;39m Writing log-normalized counts file lognormalized-counts.txt
[34m[INFO][0;39m Writing barcode counts file barcode-counts.txt
[34m[INFO][0;39m Writing correlation file correlation.txt
[31m[WARN][0;39m Skipping correlation file for trivial dataset (1 columns and 23995 rows)
[34m[INFO][0;39m Writing unexpected sequence report unexpected-sequences.txt
[34m[INFO][0;39m Writing run info unexpected-sequences.txt
[34m[INFO][0;39m PoolQ complete
[34m[INFO][0;39m PoolQ runtime configuration summary
Input files:
	Conditions file:	data/fig2_global_CYS063/conditions.csv
	Reference file:	data/fig2_global_CYS063/CYS063_ref.csv
	Reads file:	data/fig2_global_CYS063/global_file37_R1_001.fastq.gz
Row barcode match function:	mismatch
Column barcode match function:	exact
Row barcode policy:	TEMPLATE:caccgNNNNNNNNNNNNNNNNNNNNgtttt@0
Col barcode policy:	FIXED:0

[34m[INFO][0;39m Reading row reference data
[34m[INFO][0;39m

[34m[INFO][0;39m Writing counts file counts.txt
[34m[INFO][0;39m Writing quality file quality.txt
[34m[INFO][0;39m Writing log-normalized counts file lognormalized-counts.txt
[34m[INFO][0;39m Writing barcode counts file barcode-counts.txt
[34m[INFO][0;39m Writing correlation file correlation.txt
[31m[WARN][0;39m Skipping correlation file for trivial dataset (1 columns and 23995 rows)
[34m[INFO][0;39m Writing unexpected sequence report unexpected-sequences.txt
[34m[INFO][0;39m Writing run info unexpected-sequences.txt
[34m[INFO][0;39m PoolQ complete
[34m[INFO][0;39m PoolQ runtime configuration summary
Input files:
	Conditions file:	data/fig2_global_CYS063/conditions.csv
	Reference file:	data/fig2_global_CYS063/CYS063_ref.csv
	Reads file:	data/fig2_global_CYS063/global_file28_R1_001.fastq.gz
Row barcode match function:	mismatch
Column barcode match function:	exact
Row barcode policy:	TEMPLATE:caccgNNNNNNNNNNNNNNNNNNNNgtttt@0
Col barcode policy:	FIXED:0

[34m[INFO][0

[34m[INFO][0;39m Shutting down.
[34m[INFO][0;39m Processed 18525303 reads in 34262 ms (540.6954 reads/ms). Match percent: 94.796844; queue size: 0
[34m[INFO][0;39m Writing counts file counts.txt
[34m[INFO][0;39m Writing quality file quality.txt
[34m[INFO][0;39m Writing log-normalized counts file lognormalized-counts.txt
[34m[INFO][0;39m Writing barcode counts file barcode-counts.txt
[34m[INFO][0;39m Writing correlation file correlation.txt
[31m[WARN][0;39m Skipping correlation file for trivial dataset (1 columns and 23995 rows)
[34m[INFO][0;39m Writing unexpected sequence report unexpected-sequences.txt
[34m[INFO][0;39m Writing run info unexpected-sequences.txt
[34m[INFO][0;39m PoolQ complete
[34m[INFO][0;39m PoolQ runtime configuration summary
Input files:
	Conditions file:	data/fig2_global_CYS063/conditions.csv
	Reference file:	data/fig2_global_CYS063/CYS063_ref.csv
	Reads file:	data/fig2_global_CYS063/global_file43_R1_001.fastq.gz
Row barcode match function:	

In [11]:
combined_res

Unnamed: 0,Row Barcode,Row Barcode IDs,R1_file49,R1_file33,R1_file17,R1_file50,R1_file18,R1_file46,R1_file41,R1_file34,...,R1_file37,R1_file13,R1_file14,R1_file45,R1_file28,R1_file44,R1_file27,R1_file39,R1_file43,R1_file36
0,TACGTCATTAAGAGTTCAAC,NO_CURRENT_2,1342,151,480,889,735,837,620,340,...,883,552,750,1538,484,112,396,253,207,210
1,GATTCATACTAAACACTCTA,NO_CURRENT_3,1325,197,594,801,650,121,559,248,...,1336,394,812,102,864,302,574,220,163,315
2,ATCTTCTCGACGAAAATGCG,NO_CURRENT_4,1691,374,839,1514,1313,941,367,475,...,1091,810,1315,918,1264,364,530,432,261,499
3,GAGCTGGACGGCGACGTAAA,NO_CURRENT_5,9022,3521,3075,13920,2156,2687,7891,5345,...,8256,1283,1524,4216,7311,2847,2190,3581,5700,2549
4,AATTGTCTTGTCCCTATCGA,NO_CURRENT_6,1438,926,1444,1475,1215,482,640,1078,...,1427,888,1087,593,1050,371,1548,904,568,427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,CACATTTGATTGCTTCCTCT,EIF2S1_2,1497,755,937,1457,952,408,759,423,...,1150,705,944,528,883,449,790,709,462,970
23996,CACACTCATTCCTGCCAATT,EIF2S1_3,1095,230,615,1218,785,826,1507,685,...,1056,658,913,807,1411,2452,424,523,223,633
23997,CCGGTGGCAACCCTTAGGGC,NELFE_0,1026,543,829,960,456,93,787,418,...,886,365,503,155,676,228,576,279,262,410
23998,GGCAACCCTTAGGGCTGTTC,NELFE_1,1436,661,1266,1423,998,328,671,739,...,999,595,743,516,704,198,780,623,945,995


In [13]:
# save results
combined_res.to_csv(os.path.join(output_folder, 'fig1_poolQ_result.csv'))