In [3]:
import numpy as np
import scanpy as sc
import pandas as pd
import os
from collections import defaultdict

In [3]:
# Load the DeepCycle output; the per-cell phase is read from obs['cell_cycle_theta'] below
adata = sc.read_h5ad("/shared/space2/molina/Data/mESCs_2iLIF/SRR13790993/deepcycle/deepcycle.h5ad")

# Extract the barcode from each cell name: the run of A/C/G/T characters that
# follows a ':' and is immediately followed by 'x'. Names that do not match
# the pattern yield NaN.
barcodes = adata.obs.index.str.extract(r':([ACGT]+)x')[0]
# Round theta to two decimals; its string form is used as the bin label
theta_vals = adata.obs['cell_cycle_theta'].round(2).astype(str)

# Build barcode-to-theta-bin map (.values pairs the rows by position, not by index label)
barcode_theta_map = pd.DataFrame({
    'barcode': barcodes,
    'theta_bin': theta_vals.values
})

# A NaN barcode would be written as an empty field, come back as NaN when the
# TSV is re-read, and break the later lookup build, which calls len() on every
# barcode. Drop such rows here and report how many were removed.
unparsed = barcode_theta_map['barcode'].isna()
if unparsed.any():
    print(f"Warning: dropping {int(unparsed.sum())} cells whose name has no parsable barcode")
barcode_theta_map = barcode_theta_map[~unparsed]

barcode_theta_map.to_csv("barcode_theta_map.tsv", sep='\t', index=False)

  utils.warn_names_duplicates("var")


In [None]:
# === USER INPUT ===
# Paired FASTQ files to split, the barcode -> theta-bin table, and the folder
# that receives one FASTQ pair per theta bin.
fastq_r1 = "/shared/space2/molina/Data/mESCs_2iLIF/SRR13790993/fastqs/SRR13790993_S1_R1_001.fastq"
fastq_r2 = "/shared/space2/molina/Data/mESCs_2iLIF/SRR13790993/fastqs/SRR13790993_S1_R2_001.fastq"
barcode_theta_file = "barcode_theta_map.tsv"  # two columns: barcode \t theta_bin (e.g. 0.34)
output_dir = "/shared/space2/molina/suttyg/theta_binned_fastqs"
# ===================

# Read the tab-separated table and turn it into a {barcode: theta-bin label} dict.
# Bin labels are kept as strings because they end up in output file names.
df = pd.read_csv(barcode_theta_file, sep="\t")
barcode_to_bin = {
    barcode_seq: bin_label
    for barcode_seq, bin_label in zip(df['barcode'], df['theta_bin'].astype(str))
}

# Make sure the destination folder is there before any output file is opened
os.makedirs(output_dir, exist_ok=True)

# Caches of open output handles, keyed by theta-bin label (filled lazily)
r1_writers, r2_writers = {}, {}

def get_writer(bin_label, read_type, mode="w"):
    """Open the output FASTQ for one theta bin and one read of the pair.

    The file is ``theta_<bin_label>_<read_type>.fastq`` inside ``output_dir``.

    Args:
        bin_label: Theta-bin label used in the file name (e.g. "0.34").
        read_type: Read identifier used in the file name (e.g. "R1" or "R2").
        mode: Mode passed to ``open``. Defaults to "w" so that every run
            starts from an empty file; with append mode, re-running after an
            interrupted run would stack a second copy of the reads on top of
            the partial output. Pass "a" to deliberately extend an existing
            file.

    Returns:
        The open text file handle; the caller is responsible for closing it.
    """
    file_path = os.path.join(output_dir, f"theta_{bin_label}_{read_type}.fastq")
    return open(file_path, mode)

# Helper to pull the barcode out of a read's sequence line
def extract_barcode(read_sequence_line):
    """Return the first 16 characters of the sequence with every "N" removed.

    The barcode is assumed to occupy the first 16 bp of the R1 sequence.
    Because "N" characters are dropped, the result can be shorter than 16.
    """
    leading_bases = read_sequence_line[:16]
    return "".join(base for base in leading_bases if base != "N")

# Derive three lookup tables from barcode_to_bin:
#   exact_map  - full barcode              -> bin label
#   suffix_map - last 16 chars of barcode  -> list of bin labels
#   prefix_map - first 16 chars of barcode -> list of bin labels
# Barcodes of 5 characters or fewer are only entered in exact_map.
exact_map = dict(barcode_to_bin)
suffix_map = defaultdict(list)
prefix_map = defaultdict(list)

for known_barcode, bin_label in barcode_to_bin.items():
    if len(known_barcode) <= 5:
        continue
    suffix_map[known_barcode[-16:]].append(bin_label)
    prefix_map[known_barcode[:16]].append(bin_label)

def match_theta_bin(barcode):
    """Look up the theta-bin label for a barcode.

    An exact match in ``exact_map`` wins. Otherwise ``suffix_map`` and then
    ``prefix_map`` are consulted, and a hit only counts when it maps to
    exactly one label. Returns ``None`` when nothing matches unambiguously.
    """
    if barcode in exact_map:
        return exact_map[barcode]

    for partial_index in (suffix_map, prefix_map):
        # .get() with a default avoids inserting missing keys into the defaultdict
        candidates = partial_index.get(barcode, ())
        if len(candidates) == 1:
            return candidates[0]

    return None


# Get file size in bytes (used only to estimate progress from the R1 read offset)
total_size = os.path.getsize(fastq_r1)

count = 0          # read pairs seen so far
success = 0        # read pairs whose barcode was assigned to a theta bin
last_percent = -1  # last percentage printed, to avoid repeating the same line

# Main loop: walk R1 and R2 in lockstep, one 4-line FASTQ record at a time, and
# copy each pair to the files of the theta bin its barcode maps to.
# The try/finally guarantees that every per-bin writer is flushed and closed
# even when the loop is interrupted or raises; otherwise the handles would stay
# open in the kernel with unwritten buffered data.
try:
    with open(fastq_r1, "r") as r1, open(fastq_r2, "r") as r2:
        while True:
            r1_entry = [r1.readline() for _ in range(4)]
            r2_entry = [r2.readline() for _ in range(4)]

            if not r1_entry[0] or not r2_entry[0]:
                break  # End of file

            # Progress report every 6,000,000 read pairs
            if (count % 6000000 == 0):
                print("Checking progress...")
                current_pos = r1.tell()
                percent = int((current_pos / total_size) * 100)
                if percent != last_percent:
                    print(f"Progress: {percent}% ({success}/{count} success matches)")
                    last_percent = percent

            barcode = extract_barcode(r1_entry[1].strip())  # Use sequence line, not header

            theta_bin = match_theta_bin(barcode)
            if theta_bin:
                success += 1
                if theta_bin not in r1_writers:
                    # First read for this theta bin: open its R1/R2 output files
                    r1_writers[theta_bin] = get_writer(theta_bin, "R1")
                    r2_writers[theta_bin] = get_writer(theta_bin, "R2")
                # Write the full 4-line record of each mate to its bin file
                r1_writers[theta_bin].writelines(r1_entry)
                r2_writers[theta_bin].writelines(r2_entry)
            count += 1
finally:
    # Close files
    for writer in list(r1_writers.values()) + list(r2_writers.values()):
        writer.close()

print("✅ FASTQ splitting by theta bin completed.")

Checking progress...
Progress: 0% (0/0 success matches)
Checking progress...
Checking progress...
Progress: 1% (10248037/12000000 success matches)
Checking progress...
Progress: 2% (15363078/18000000 success matches)
Checking progress...
Progress: 3% (20481812/24000000 success matches)
Checking progress...
Progress: 4% (25560230/30000000 success matches)
Checking progress...
Progress: 5% (30698922/36000000 success matches)


In [5]:
import gzip
import shutil

# Theta bin labels to compress: 95 values from 0.06 to 1.00 in steps of 0.01
theta_smooth = np.round(np.linspace(0.06, 1.00, 95), 2)

# gzip.open does not create missing parent directories, so make sure the
# destination folder exists before the first file is written
gz_dir = "/shared/space2/molina/suttyg/theta_binned_fastqs_gz"
os.makedirs(gz_dir, exist_ok=True)

for theta in theta_smooth:
    for pair_end in ["R1", "R2"]:
        # Input and output paths
        input_fastq = "/shared/space2/molina/suttyg/theta_binned_fastqs/theta_"+str(theta)+"_"+pair_end+".fastq"
        output_fastq_gz = gz_dir+"/theta_"+str(theta)+"_"+pair_end+".fastq.gz"

        # Compress FASTQ file (streamed copy, the file is never loaded whole)
        with open(input_fastq, 'rb') as f_in:
            with gzip.open(output_fastq_gz, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        # Printed once per compressed file
        print("✅ Compression complete.")

✅ Compression complete.
✅ Compression complete.
✅ Compression complete.
✅ Compression complete.
✅ Compression complete.
✅ Compression complete.
✅ Compression complete.
✅ Compression complete.
✅ Compression complete.


KeyboardInterrupt: 

In [5]:
# Rename the compressed files to the sample<N>_<R1|R2> naming scheme
theta_smooth = np.round(np.linspace(0.01, 1.00, 100), 2)  # 100 bins from 0.01 to 1.00

for theta in theta_smooth:
    # Round before converting to int: theta*100 is not exact in floating point
    # (0.29*100 == 28.999999999999996), so plain int() truncation maps 0.29 to
    # 28, which collides with 0.28 and makes os.rename overwrite that file.
    sample_index = int(round(theta * 100))
    for pair_end in ["R1", "R2"]:
        # Input and output paths
        old_name = "/shared/space2/molina/suttyg/theta_binned_fastqs_gz/theta_"+str(theta)+"_"+pair_end+".fastq.gz"
        # NOTE(review): the new name ends in ".fastq" although the source name
        # ends in ".fastq.gz" - confirm that dropping ".gz" is intended.
        new_name = "/shared/space2/molina/suttyg/theta_binned_fastqs_gz/sample"+str(sample_index)+"_"+pair_end+".fastq"

        os.rename(old_name, new_name)