In [3]:
#!/usr/bin/env python

import re
import sys
import string
import argparse
import operator
import pandas as pd
import xgboost as xgb
import numpy as np

VERSION='0.1.0'

# Command-line interface. Every option below ends up as an attribute of the
# module-level ``args`` namespace, which the scanning/prediction code reads.
#
# NOTE(review): when this cell runs inside a Jupyter kernel, sys.argv usually
# carries ``-f <kernel connection file>``; this parser would read that as
# --fasta. Confirm that this is intended before relying on ``args.fasta``.
parser = argparse.ArgumentParser(description="""

DESCRIPTION

EXAMPLE:

    """, formatter_class= argparse.RawTextHelpFormatter)

# --fasta is mandatory, so a default value could never take effect; the dead
# default was dropped to keep the declaration unambiguous.
parser.add_argument('--fasta', '-f',
                   type= str,
                   help='''Input fasta file to search. Use '-' to read the file from stdin.
                   ''',
                   required= True)

parser.add_argument('--classifier',
                   required= False,
                   default= 'G4Boost_classifier.json',
                   help='''Use specified classifier model file. Default is G4Boost_classifier.json.
                   ''')
parser.add_argument('--regressor',
                   required= False,
                   default= 'G4Boost_regressor.json',
                   help='''Use specified regressor model file. Default is G4Boost_regressor.json.
                   ''')
parser.add_argument('--maxloop', '-N',
                   type= int,
                   required= False,
                   default= 12,
                   help='''Maximum length of the loop. Default is to report up to 12nt.
                   ''')
parser.add_argument('--minloop', '-n',
                   type= int,
                   required= False,
                   default= 1,
                   help='''Minimum length of the loop. Default is 1nt.
                   ''')
parser.add_argument('--maxG', '-G',
                   type= int,
                   required= False,
                   default= 7,
                   help='''Maximum number of consecutive G bases within a G-stem. Default is to report up to 7 Gs.
                   ''')
parser.add_argument('--minG', '-g',
                   type= int,
                   required= False,
                   default= 2,
                   help='''Minimum number of consecutive G bases within a G-stem. Default is 2 Gs.
                   ''')
parser.add_argument('--loops', '-l',
                   type= int,
                   required= False,
                   default= 11,
                   help='''Maximum number of flexible loops separating the G-stems. Default is to report up to 11 loops.
                   ''')

parser.add_argument('--noreverse',
                   action= 'store_true',
                   help='''Do not search the reverse complement of the input fasta.
                   ''')

parser.add_argument('--quiet', '-q',
                   action= 'store_true',
                   help='''Do not print progress report (i.e. sequence names as they are scanned).
                   ''')

parser.add_argument('--version', '-v', action='version', version='%(prog)s ' + VERSION)
args = parser.parse_args()


" ------------------------------[  Functions ]--------------------------------- "

def sort_table(table, cols):
    """Sort rows by several columns, with ``cols[0]`` as the primary key.

    The rows are sorted once per column, from the least significant column
    to the most significant one; because the sort is stable, earlier
    orderings survive as tie-breakers.

    table: iterable of indexable rows.
    cols:  sequence of column indices, most significant first.
    Returns a new sorted list, or ``table`` itself when ``cols`` is empty.
    """
    least_significant_first = list(reversed(cols))
    if not least_significant_first:
        return table
    rows = list(table)
    for column in least_significant_first:
        rows.sort(key=operator.itemgetter(column))
    return rows


def chrom_name(header):
    """Extract the sequence identifier from a FASTA header line.

    header: the raw header line, expected to start with '>'.
    Returns the first whitespace-delimited token after '>' (with any
    whitespace directly after '>' skipped), or the placeholder 'noID' when
    the line does not start with '>'. A malformed header is deliberately
    not treated as an error.
    """
    if not header.startswith('>'):
        return 'noID'
    # Raw strings keep the regex escapes (\s) from being read as invalid
    # string escapes; the patterns themselves are unchanged.
    name = re.sub(r'^>\s*', '', header)
    name = re.sub(r'\s.*', '', name)
    return name

def revcomp(seq):
    """Return the reverse complement of ``seq``.

    A/T and C/G are swapped, 'U' is complemented to 'A' and 'N' stays 'N'.
    Any other character (including lower-case letters) is copied through
    unchanged, only its position is reversed.
    """
    pairing = dict(zip('ACGTUN', 'TGCAAN'))
    flipped = []
    for nucleotide in reversed(seq):
        flipped.append(pairing.get(nucleotide, nucleotide))
    return "".join(flipped)


def findall(seq, search):
    """Count the occurrences of ``search`` in ``seq``, overlaps included.

    For example 'GGG' contains 'GG' twice. The scan resumes one character
    after the start of each hit, which is what makes overlapping hits count.

    seq:    string to scan.
    search: substring to look for.
    Returns the number of (possibly overlapping) occurrences; an empty
    ``search`` yields 0 (it previously never terminated).
    """
    if not search:
        return 0
    count = 0
    # str.find with a start offset avoids re-slicing the sequence per hit.
    position = seq.find(search)
    while position != -1:
        count += 1
        position = seq.find(search, position + 1)
    return count

def initialize_dataFrame():
    """Create an empty feature table as a dict of column name -> list.

    Each column starts with its own empty list; rows are added later by
    appending one value to every column.
    """
    columns = (
        "seq", "seq_length", "g4motif", 'maxgbase', 'maxgstem', "length",
        "maxlbase", "minlbase", "G", "C", "GG", "CC",
    )
    return {column: [] for column in columns}

def topology(reg, seq):
    """Describe the stem/loop layout of a matched motif.

    ``seq`` is split on ``reg`` (a pattern whose capture groups are kept by
    re.split). The last non-empty piece is taken as the G-stem; in every
    piece longer than the stem, the part after the first ``g`` characters
    is treated as a loop.

    Returns a list
        [motif, len(motif), number_of_loops + 1, stem_length,
         longest_loop, shortest_loop]
    where ``motif`` is rebuilt as stem + (lower-cased loop + stem) for each
    loop. Raises ValueError when no piece is longer than the stem, because
    max()/min() then receive an empty list.
    """
    pieces = re.split(reg, seq)
    # A match that reaches the end of seq leaves a trailing empty piece,
    # in which case the stem is the piece just before it.
    gstem_base = pieces[-1] if len(pieces[-1]) != 0 else pieces[-2]
    g = len(gstem_base)

    loop_lengths = []
    for piece in pieces:
        extra = len(piece) - g
        if extra > 0:
            loop_lengths.append(extra)
    longest_loop = max(loop_lengths)
    shortest_loop = min(loop_lengths)

    motif_parts = [gstem_base]
    for piece in pieces:
        if len(piece) > g:
            motif_parts.append(piece[g:].lower())
            motif_parts.append(gstem_base)
    motif = "".join(motif_parts)

    return [motif, len(motif), len(loop_lengths) + 1, g, longest_loop, shortest_loop]

def update_dataFrame(features, reg, seq, ref):
    """Append one row describing the motif ``seq`` to the feature table.

    features: dict of column name -> list; mutated in place.
    reg:      the regex that produced the match, passed on to topology().
    seq:      the matched motif.
    ref:      the reference window used for length and composition columns.

    The 'G', 'GG', 'C' and 'CC' columns hold the (overlapping) occurrence
    count of that pattern in ``ref`` as a truncated percentage of len(ref).
    The 'seq' column is not filled here. Returns the same ``features`` dict.
    """
    motif, motif_len, stem_count, stem_size, loop_max, loop_min = topology(reg, seq)

    features['g4motif'].append(motif)
    features['length'].append(motif_len)
    ref_len = len(ref)
    features['seq_length'].append(ref_len)

    for column, value in (
        ('maxgstem', stem_count),
        ('maxgbase', stem_size),
        ('maxlbase', loop_max),
        ('minlbase', loop_min),
    ):
        features[column].append(value)

    # Column names double as the patterns that are counted in ref.
    for pattern in ('G', 'GG', 'C', 'CC'):
        features[pattern].append(int(findall(ref, pattern) * 100 / ref_len))

    return features


def findmotifs(reg, seq, start, chrom=None):
    """List every match of ``reg`` in ``seq`` as a forward-strand record.

    reg:   regular expression to search for.
    seq:   sequence to scan.
    start: offset added to the match coordinates (position of ``seq``
           within the larger sequence).
    chrom: sequence name used in each record. When omitted, the
           module-level ``chrom`` variable is used, as this function
           originally did; NameError is raised if neither is available
           and at least one match needs it.

    Returns a list of
        [chrom, start, end, '<chrom>_<start>_<end>', length, '+', motif]
    rows, one per match, in scan order.
    """
    gquad_list = []
    for m in re.finditer(reg, seq):
        if chrom is None:
            # Backward-compatible fallback to the implicit global.
            try:
                chrom = globals()['chrom']
            except KeyError:
                raise NameError(
                    "name 'chrom' is not defined; pass chrom=... to findmotifs"
                ) from None
        motif = m.group(0)
        motif_start = m.start() + start
        motif_end = m.end() + start
        quad_id = chrom + '_' + str(motif_start) + '_' + str(motif_end)
        gquad_list.append([chrom, motif_start, motif_end, quad_id, len(motif), '+', motif])
    return gquad_list
# -----------------------------------------------------------------------------


# Pick the input handle and the name of the GFF-style output file:
# '-' means read from stdin and write to a fixed file name, anything else
# is opened as a file and the output is written next to it.
if args.fasta == '-':
    ref_seq_fh= sys.stdin
    output='G4Boost_quadruplexes.gff'
else:
    ref_seq_fh= open(args.fasta)
    output= args.fasta+'.gff'

# ref_seq=[]
# line= ref_seq_fh.readline()
# if chrom != 'noID': line= ref_seq_fh.readline()
# else: chrom = line.strip()




In [4]:

import re
import sys
import string
import argparse
import operator
import pandas as pd
import xgboost as xgb
import numpy as np

def _scan_strand(reg, strand_seq, longest, chrom, features):
    """Collect every match of ``reg`` on one strand and mask each hit with N.

    reg:        motif regex for the current stem size / stem count.
    strand_seq: the strand as currently masked by earlier passes.
    longest:    length threshold; for strands longer than this the motif
                itself is used as the reference window, otherwise the
                whole (currently masked) strand is.
    chrom:      sequence label stored in the 'seq' feature column.
    features:   feature table (dict of lists), extended in place.

    Returns (hits, masked_strand) where hits is a list of
    (start, end, motif) tuples in strand coordinates.
    """
    hits = []
    # finditer keeps scanning the string object it was handed; rebinding
    # strand_seq below only changes the reference window and the result.
    for m in re.finditer(reg, strand_seq):
        motif = m.group(0)
        start, end = m.start(), m.end()
        ref = motif if len(strand_seq) > longest else strand_seq
        hits.append((start, end, motif))
        # NOTE(review): 'g4motif' stores the rebuilt motif with lower-cased
        # loops while ``motif`` is the raw match, so this membership test
        # looks like it rarely filters anything -- confirm the intent.
        if motif not in features['g4motif']:
            update_dataFrame(features, reg, motif, ref)
            features['seq'].append(chrom)
        # Mask the hit so later, less stringent passes cannot reuse it.
        strand_seq = strand_seq[:start] + 'N' * (end - start) + strand_seq[end:]
    return hits, strand_seq


def predict(seqs,):
    """Flag each sequence that contains a predicted G-quadruplex.

    For every sequence, motifs are searched from the largest stem size and
    stem count down to the smallest (bounds come from the module-level
    ``args``), on the forward strand and, unless ``args.noreverse`` is set,
    on the reverse complement. Every hit is appended as a tab-separated
    line to the module-level ``output`` file. The classifier loaded from
    ``args.classifier`` then scores the motif features.

    seqs: sequence of nucleotide strings; 'U' is treated as 'T'.
    Returns a list with one int per input sequence: 1 when the highest
    classifier probability over its motifs is at least 0.5, else 0
    (also 0 when no motif was found).
    """
    sys.stderr.write('Starting stability prediction!\n\n')
    regressor = xgb.XGBRegressor()
    classifier = xgb.XGBClassifier()
    # The regressor is still loaded so that a missing or invalid
    # --regressor file fails up front as before; its prediction was never
    # used for the returned flags, so it is no longer computed.
    regressor.load_model(args.regressor)
    classifier.load_model(args.classifier)

    # Search bounds do not depend on the sequence, so compute them once.
    stem_sizes = range(args.minG, args.maxG+1)[::-1]
    stem_counts = range(3, args.loops+1)[::-1]
    longest = (args.maxG + args.maxloop) * args.loops + args.maxG

    selected = ["seq_length", "length", "maxgstem" ,"maxgbase", "maxlbase", "minlbase", "G", "C", "GG", "CC"]
    # Column names the models are fed with, in the same order as ``selected``.
    model_columns = ["length", "len", "maxgstem", "maxgbase", "maxlbase", "minlbase", "G", "C", "GG", "CC"]

    preds = []
    for index, raw_seq in enumerate(seqs):
        chrom = str(index)
        features = initialize_dataFrame()
        ref_seq = raw_seq.upper().replace('U', 'T')
        rev_ref_seq = revcomp(ref_seq)
        seqlen = len(ref_seq)

        for g in stem_sizes:
            for s in stem_counts:
                # s repeats of "stem + loop" followed by a closing stem.
                reg = (r'([gG]{%d}\w{%d,%d})' % (g , args.minloop, args.maxloop)) * s
                reg += '([gG]{%d})' % (g)

                forward_hits, ref_seq = _scan_strand(reg, ref_seq, longest, chrom, features)
                gquad_list = [
                    [chrom, start, end, chrom + '_' + str(start) + '_' + str(end), len(motif), '+', motif]
                    for start, end, motif in forward_hits
                ]

                if args.noreverse is False:
                    reverse_hits, rev_ref_seq = _scan_strand(reg, rev_ref_seq, longest, chrom, features)
                    # Coordinates are flipped back onto the forward strand;
                    # the id keeps the reverse-strand coordinates as before.
                    gquad_list.extend(
                        [chrom, seqlen-end, seqlen-start, chrom + '_' + str(start) + '_' + str(end), len(motif), '-', motif]
                        for start, end, motif in reverse_hits
                    )

                gquad_sorted = sort_table(gquad_list, (1,2,3))
                if gquad_sorted:
                    # One append-mode open per batch instead of per line.
                    with open(output, 'a') as out:
                        out.writelines(
                            '\t'.join([str(x) for x in xline]) + '\n'
                            for xline in gquad_sorted
                        )

        if not features['g4motif']:
            # No motif found: nothing to score, and the model is never
            # called on an empty table.
            preds.append(0)
            continue

        feature_table = pd.DataFrame.from_dict(features)
        X_test = feature_table[selected].copy()
        X_test.columns = model_columns

        g4_pred_proba = classifier.predict_proba(X_test)[:, 1]
        preds.append(0 if np.max(g4_pred_proba) < 0.5 else 1)

    return preds

In [5]:
from Bio import SeqIO
import pandas as pd
import sys

def _read_fasta_sequences(path):
    """Return every record of the FASTA file at ``path`` as a plain string."""
    return [str(record.seq) for record in SeqIO.parse(path, 'fasta')]


# read generated
lines = _read_fasta_sequences('/data4/sina/UTR/MEME/1024_10000_init.fasta')

# first 50 characters of each initial sequence
mut_inits = [sequence.replace('\n','')[:50] for sequence in lines]


# read optimus
optimus = _read_fasta_sequences('/data4/sina/UTR/MEME/optimus_fasta.fasta')


# read generated
# Same file as ``lines``; reuse the parsed records instead of reading the
# file a second time (a separate list, so the two can diverge later).
inits = list(lines)

opts = _read_fasta_sequences('/data4/sina/UTR/MEME/1024_10000_opt.fasta')


# read natural:
utr_table = pd.read_csv('./../UTRGAN/data/utrdb2.csv')
df = utr_table['seq'].to_numpy()
# keep sequences of 65..128 characters and rewrite T as U
nats = [sequence.replace('T','U') for sequence in df if 64 < len(sequence) < 129]

In [6]:
# Run the scanner on every sequence set. Each result is a list holding one
# 0/1 flag per input sequence, in input order.
# Each call reloads both models and appends its hits to the ``output`` file.
nat_pred = predict(nats)
init_pred = predict(inits)
opt_pred = predict(opts)
optimus_pred = predict(optimus)
mut_init_pred = predict(mut_inits)

Starting stability prediction!

Starting stability prediction!

Starting stability prediction!

Starting stability prediction!

Starting stability prediction!



In [7]:
# Number of sequences flagged as containing a predicted G-quadruplex.
# The natural set is rescaled to "hits per 1024 sequences"; the other four
# sets are printed as raw counts.
# NOTE(review): presumably 1024 is the size of the generated sets, which
# would make the numbers comparable -- confirm, in particular for optimus.
print(np.sum(nat_pred)/len(nat_pred)*1024.)
print(np.sum(init_pred))
print(np.sum(opt_pred))
print(np.sum(optimus_pred))
print(np.sum(mut_init_pred))

236.76727819548873
209
199
16
162
