Skip to content

Commit

Permalink
Merge pull request #7 from cribbslab/AC_10x
Browse files Browse the repository at this point in the history
Ac 10x
  • Loading branch information
Acribbs committed Oct 27, 2023
2 parents 165c5a0 + 3985134 commit 6da0089
Showing 1 changed file with 48 additions and 107 deletions.
155 changes: 48 additions & 107 deletions tallytrin/python/10x_identify_barcode.py
Original file line number Diff line number Diff line change
@@ -1,116 +1,57 @@

import sys
import regex
import cgatcore.iotools as iotools
import pysam
import logging
import argparse
import Bio
import Bio.pairwise2
from Bio.pairwise2 import format_alignment

# ########################################################################### #
# ###################### Set up the logging ################################# #
# ########################################################################### #
import cgatcore.iotools as iotools
import pysam

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
L = logging.getLogger("identify_perfect_nano.py")


# ########################################################################### #
# ######################## Parse the arguments ############################## #
# ########################################################################### #

parser = argparse.ArgumentParser()
parser.add_argument("--whitelist", default=None, type=str,
help='name of whitelist file')
parser.add_argument("--infile", default=None, type=str,
help='nanopore infile fastq file')
parser.add_argument("--outfile", default=None, type=str,
help='name for output fastq files')
parser.add_argument("--cmimode", default=None, type=str,
help='Run the script in cmi mode for accuracy evaluation')
parser.add_argument("--barcode", default='16', type=int,
help='Length of barcode required')
parser.add_argument("--umi", default='12', type=int,
help='Length of umi required')

args = parser.parse_args()

L.info("args:")
print(args)

# ########################################################################### #
# ######################## Code ############################## #
# ########################################################################### #

log = iotools.open_file(args.outfile + ".log","w")

# generate set of barcodes for whitelist
barcodes = []

outfile = open(args.outfile, "w")

n = 0
y = 0

with pysam.FastxFile(args.infile) as fh:

for record in fh:
n+=1

first = 0

seq = record.sequence

for a, b in zip(Bio.pairwise2.align.localms(seq,"AGATCGGAAGAGCGT", 2, -1, -0.5, -0.1), Bio.pairwise2.align.localms(seq,"AAAAAAAAA", 2, -1, -0.5, -0.1)):
first +=1
if first == 1:
al1_a, al2_a, score_a, begin_a, end_a = a
al1_a, al2_b, score_b, begin_b, end_b = b
else:
first = 0
break
length_umibarcode = len(record.sequence[end_b:begin_a])

if length_umibarcode >= 20:
y+=1
if args.cmimode == '1':
umi = record.sequence[begin_a:end_a]
umi = umi[:15]
else:
umi_start = int(16 + args.umi)
print(umi_start)
umi = record.sequence[begin_a-umi_start:begin_a-16]
print(umi)
if len(umi) == 15 and args.cmimode == '1':
barcode = record.sequence[begin_a-16:begin_a][:int(args.barcode)]
barcodes.append(barcode)
seq_new = record.sequence[:begin_b]
quality_new = record.quality[:begin_b]

outfile.write("@%s\n%s\n+\n%s\n" % (record.name + "_" + barcode + "_" + umi, seq_new, quality_new))
else:
barcode = record.sequence[begin_a-16:begin_a][:int(args.barcode)]
L = logging.getLogger("10x_identify_barcode.py")

def parse_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("--whitelist", default=None, type=str, help='Name of whitelist file')
parser.add_argument("--cmimode", default=0, type=str, help='Run CMI for accuracy evaluation')
parser.add_argument("--infile", default=None, type=str, help='Nanopore infile FASTQ file')
parser.add_argument("--outfile", default=None, type=str, help='Name for output FASTQ files')
parser.add_argument("--barcode", default=16, type=int, help='Length of barcode required')
parser.add_argument("--umi", default=12, type=int, help='Length of UMI required')
return parser.parse_args()

def main():
args = parse_arguments()
L.info("Arguments:")
L.info(args)

log = iotools.open_file(f"{args.outfile}.log", "w")
barcodes = []
n, y = 0, 0

with pysam.FastxFile(args.infile) as fh, open(args.outfile, "w") as outfile:
for record in fh:
n += 1
seq = record.sequence
position = seq.find("AGATCGGAAGAGCGT")

if position != -1:
umi_start = position - (args.barcode + args.umi)
umi_end = position - args.barcode
umi = seq[umi_start:umi_end]
barcode = seq[position - args.barcode:position]

if len(umi) == args.umi and len(barcode) == args.barcode:
y += 1
barcodes.append(barcode)
seq_new = record.sequence[:begin_b]
quality_new = record.quality[:begin_b]

outfile.write("@%s\n%s\n+\n%s\n" % (record.name + "_" + barcode + "_" + umi, seq_new, quality_new))

outfile.close()


# Write out a list of whitelist barcodes
out_barcodes = open(args.whitelist,"w")
new_seq = seq[:position - (args.barcode + args.umi)]
new_quality = record.quality[:position - (args.barcode + args.umi)]
outfile.write(f"@{record.name}_{barcode}_{umi}\n{new_seq}\n+\n{new_quality}\n")

for i in set(barcodes):
out_barcodes.write("%s\n" % (i))
out_barcodes.close()

with open(args.whitelist, "w") as out_barcodes:
for barcode in set(barcodes):
out_barcodes.write(f"{barcode}\n")

log.write("The number of total reads: %s\n" %(n))
log.write("The number of total reads that have a correct barcode and UMI: %s\n" %(y))
log.write("The number of total recovered percent is: %s\n" %((y/n)*100))
log.write(f"The number of total reads: {n}\n")
log.write(f"The number of reads that have a correct barcode and UMI: {y}\n")
log.write(f"The percentage of total recovered reads is: {((y / n) * 100):.2f}\n")

log.close()
if __name__ == "__main__":
main()

0 comments on commit 6da0089

Please sign in to comment.