init version 2 from Wits

h3abionet · Jan 31, 2018 · 88fdcea · 88fdcea
commit 88fdcea
Show file tree

Hide file tree

Showing 92 changed files with 11,169 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,8 @@
+.git/
+.nextflow*
+.idea*
+work/
+output/*
+input/*
+/config-gen/nbproject/private/
+/config-gen/build/
diff --git a/README.md b/README.md
diff --git a/aux/check.py b/aux/check.py
@@ -0,0 +1,15 @@
+
+
+
+import pandas as pd
+
+
+m = pd.read_csv("/dataB/AWIGenGWAS/aux/H3Africa_2017_20021485_A3.csv",skiprows=7,usecols=["Name","IlmnStrand","RefStrand"],delimiter=",",dtype={"Chr":str})
+#s = pd.read_csv("/dataB/AWIGenGWAS/aux/H3Africa_2017_20021485_A3_StrandReport_FT.txt",usecols=["SNP_Name","Forward_Allele1","Top_AlleleA"],delim_whitespace=True,comment="#",dtype={"Chr":str})
+
+xx = (m[(m['RefStrand']=="+")&(m['IlmnStrand']=="TOP")])["Name"]
+for snp in xx.values:
+    print(snp)
+
+
+
diff --git a/aux/fake.sh b/aux/fake.sh
@@ -0,0 +1,4 @@
+head -n 8 $manifest > m.csv
+grep $1 $manifest >> m.csv
+head -n 6 $strand   > s.csv
+grep $1 $strand >> s.csv
diff --git a/aux/make_ref.py b/aux/make_ref.py
@@ -0,0 +1,178 @@
+
+from __future__ import print_function
+
+import pandas as pd
+from   Bio import SeqIO
+import glob
+import re
+import os
+import sys
+import gzip
+import distance
+import argparse
+
+
+TAB=chr(9)
+EOL=chr(10)
+
+def parseArguments():
+    parser=argparse.ArgumentParser()
+    parser.add_argument('reference_genome_dir', type=str, metavar='referencegenomedir'),
+    parser.add_argument('strand_report', type=str, metavar='strand_report'),
+    parser.add_argument('chip_manifest', type=str, metavar='chip_manifest'),
+    parser.add_argument('output', type=str, metavar='output',help="output report"),
+    parser.add_argument('--chrom-only',type=str,default=False,dest="chrom_only")
+    parser.add_argument('--seg-len', type=int, dest='seg_len', metavar='seg_len',\
+                        default = 25, help="how much matches"),
+    args = parser.parse_args()
+    return args
+
+
+def getRef(path,build):
+    match   = os.path.join(path,str(build),"genomic","*fa.gz")
+    all_files = glob.glob(match)
+    genome = {}
+    for fname in all_files:
+        m = re.search(".*.chromosome\.(\w+)\.fa.gz",fname)
+        if not m:
+            sys.exit("Illegal file "+fname)
+        chrom = m.group(1)
+        if chrom not in chroms: continue
+        print(chrom,end="\t")
+        seq = SeqIO.read(gzip.open(fname,"rt"),"fasta")
+        genome[chrom]=seq
+    print()
+    return genome
+
+
+def getData(directory, strand_fn, manifest_fn):
+    genome = [None]*40
+
+    for build in [36,37]:
+        print("Reading in build ",build)
+        genome[build]  = getRef(directory,build)
+
+    mf     = pd.read_csv(manifest_fn,delimiter=",",dtype={"Chr":str}, skiprows=7)
+    strand = \
+      pd.read_csv(strand_fn,delim_whitespace=True,dtype={"Chr":str}, comment="#")
+    return (genome, mf, strand)
+
+
+comp = {'A':'T','C':'G','G':'C','T':'A'}
+
+
+def rc(seq):
+    m = ""
+    for x in seq:
+        m = comp.get(x,x)+m
+    return m
+
+
+def match(ref_left,probe_left,ref_right,probe_right,RC,dist):
+    if RC:
+        tmp = probe_left
+        probe_left  = rc(probe_right)
+        probe_right = rc(tmp)
+    pl_len = len(probe_left)
+    pl_rgt = len(probe_right)
+    pleft  = probe_left[-min(seg_len,pl_len):]
+    pright = probe_right[:min(seg_len,pl_rgt)]
+    rleft  = ref_left[-min(seg_len,pl_len):].seq
+    rright = ref_right[:min(seg_len,pl_rgt)].seq
+    try:
+       d1 = dist(rleft,pleft)
+       d2 = dist(rright,pright)
+    except ValueError:
+       print("\nString length mismatch error,i,",snp["SNP_Name"])
+       print("LEFT:  ",rleft,len(rleft),pleft,len(pleft),"RC=",RC)
+       print("RIGHT: ",rright,len(rright),pright,len(pright))
+       return 50
+    return d1+d2
+
+
+def  warn(warnf,chrom_num,coord,snp,direc,score):
+    if score>4:
+       warnf.write(TAB.join(map(str,["WARN:",chrom_num,coord,snp,direc,score]))+EOL)
+
+def alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst,dist=distance.hamming):
+    seq       = chromosome[coord-seg_len:coord+seg_len+1]
+    base      = seq.seq[seg_len]
+    ref_pre   = seq[0:seg_len]
+    ref_post  = seq[seg_len+1:]
+    fwd       = match(ref_pre,probe_pre,ref_post,probe_pst,False,dist)
+    rev       = 2000
+    score     = 1000
+    align     = "-"
+    if fwd<10:
+       align="fwd"
+       score=fwd
+       warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"+",fwd)
+    else:
+       rev=match(ref_pre,probe_pre,ref_post,probe_pst,True,dist)
+       if rev<10:
+          align="rev"
+          score=rev
+          warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"-",rev)
+    if the_snp in  [ "[D/I]", "[I/D]"]:
+       align="fwd"
+       base ="I"
+    return align, base, score, fwd, rev
+
+
+args = parseArguments()
+if args.chrom_only:
+    chroms = [ args.chrom_only ]
+else:
+    chroms = list(map(str, range(1,23)))+['X','Y','MT']
+
+(genome,mf,strand) = getData(args.reference_genome_dir,args.strand_report,args.chip_manifest)
+
+
+
+
+g       = open(args.output+".ref","w")
+warnf   = open(args.output+".wrn","w")
+errf    = open(args.output+".err","w")
+seg_len = args.seg_len
+for i,snp in strand.iterrows():
+    align      = "fwd"
+    base       = snp["Top_AlleleA"] # if can't do better
+    score      = -1
+    coord      = snp["Coord"]-1
+    build      = int(snp['Build'])
+    snp_name   = snp["SNP_Name"]
+    the_snp    = mf.loc[i]["SNP"]
+    chrom_num  = snp["Chr"]
+    if genome[build] == None:
+        output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"build err"]))+EOL
+        errf.write(output)
+        output = TAB.join(map(str, [snp["SNP_Name"],chrom_num,coord+1,base,align]))+EOL
+        g.write(output)
+        continue
+    if chrom_num not in genome[build]:
+        output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"chrom_num_err"]))+EOL
+        errf.write(output)
+        output = TAB.join(map(str,[snp_name,chrom_num,coord+1,base,align]))+EOL
+        g.write(output)
+        continue
+    top_seq = snp["Top_Seq"].upper()
+    top_pre = top_seq.index("[")
+    top_pst = top_seq.index("]")+1
+    alleles = top_seq[top_pre+1:top_pst-1].split("/")
+    probe_pre = top_seq[:top_pre]
+    probe_pst = top_seq[top_pst:]
+    chromosome = genome[build]["X" if chrom_num == "XY" else chrom_num]
+    align, base, score, fwd, rev = alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst)
+    if score==1000: 
+        align, base, score, fwd,rev = alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst,distance.levenshtein)
+        if score<10:
+           warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"BM",min(fwd,rev))
+        else:
+           output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"non_align",fwd,rev]))+EOL         
+           errf.write(output)
+    output = TAB.join(map(str,[snp_name,chrom_num,coord+1,base,align]))+EOL
+    g.write(output)
+g.close()
+errf.close()
+warnf.close()
+
diff --git a/aux/updateFam.py b/aux/updateFam.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import sys
+import re
+
+def parseArguments():
+    parser=argparse.ArgumentParser()
+    parser.add_argument('samplesheet', type=str, metavar='samplesheet'),
+    parser.add_argument('updatesheet', type=str, metavar='updatesheet'),
+    parser.add_argument('oldfam', type=str, metavar='oldfam'),
+    parser.add_argument('output', type=str, metavar='output'),    
+    args = parser.parse_args()
+    if "%s.fam"%args.output == args.oldfam:
+        sys.exit("New and old fam cannot be the same")
+    return args
+
+EOL=chr(10)
+PIDS = ['FID','IID']
+
+def getFam(famf):
+   fam =  pd.read_csv(famf,delim_whitespace=True,header=None,\
+                             names=["FID","IID","FAT","MAT","SEX","PHE"])
+   old = ['OFID','OIID']
+   fam['OFID']=fam['FID']
+   fam['OIID']=fam['IID']
+   fam.set_index(old, inplace=True)
+   oldfam = fam.copy(deep=True)
+   return oldfam,fam
+
+def getSmplLbl(data):
+    m=re.search(".*_(\w+)",data["Institute Sample Label"])
+    if not m: sys.exit("Failed parsing "+data)
+    return m.group(1)
+
+
+
+args = parseArguments()
+oldfam,fam = getFam(args.oldfam)
+
+orig = pd.read_excel(args.samplesheet)
+orig.set_index(["Institute Plate Label","Well"],inplace=True)
+orig['PID']=orig.apply(getSmplLbl,axis=1)
+update = pd.read_excel(args.updatesheet,skiprows=3)
+count=0
+g=open("%s.errs"%args.output,"w")
+h=open("%s.switch"%args.output,"w")
+
+for i, row in update.iterrows():
+    if "IlluminaControl" in row["Institute Sample Label"]: continue
+    new_lbl=getSmplLbl(row)
+    pos      = tuple(row[['Institute Plate Label','Well']].values)
+    oldid    = getSmplLbl(orig.loc[pos])
+    if oldid != new_lbl:
+        h.write("%s  -> %s \n"%(oldid,new_lbl))
+        count=count+1
+        if fam.index.contains((new_lbl,new_lbl)):
+            fam.loc[(oldid,oldid),["FID","IID"]]   = new_lbl
+            fam.loc[(oldid,oldid),["FAT","MAT","SEX","PHE"]]   = oldfam.loc[(new_lbl,new_lbl),["FAT","MAT","SEX", "PHE"]]
+        else:
+            g.write(new_lbl+EOL)
+            continue
+    if "unknown" in row["Institute Sample Label"]: 
+        print("Pos=<%s>; Old =<%s>; New=<%s>"%(pos,oldid,new_lbl))
+        print(fam.loc[(oldid,oldid)]['IID'])
+
+
+g.close()    
+h.close()
+all_ids = fam[PIDS].to_records(index=False).tolist()
+uniq =set([])
+orig.set_index('PID',inplace=True)
+for x in all_ids:
+    if x in uniq:
+        print("Duplicated element ",x[0],orig.loc[x[0],"Institute Sample Label"])
+    else:
+        uniq.add(x)
+
+fam.to_csv("%s.fam"%args.output,sep="\t",header=None,index=False),
diff --git a/bin/qc1logextract.py b/bin/qc1logextract.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+from __future__ import print_function
+
+
+import sys
+import re
+
+f = open(sys.argv[1])
+rem_name=sys.argv[2]
+rem_inds = rem_snps = rem_maf = rem_hwe =  0
+
+autosome=False
+nonautosome = 0
+for line in f:
+    if "Options in effect" in line: 
+        autosome=False
+        continue
+    if "--autosome" in line:
+        autosome=True
+        continue
+    m =re.search("(\w+) out of (\w+) variants loaded from .bim file.", line)
+    if m and autosome:
+        nonautosome = int(m.group(2))-int(m.group(1))
+        continue
+    m=re.search("(\w+) (\w+) removed due to ([-A-z0-9]+ \w+)",line)
+    if m:
+        #print(m.group(1),m.group(2),m.group(3))
+        n = int(m.group(1))
+        if m.group(2)=="variants":
+            if  m.group(3)=="missing genotype":
+                rem_snps = rem_snps+n
+            elif m.group(3)=="Hardy-Weinberg exact":
+                rem_hwe = rem_hwe+n
+            else:
+                rem_maf = rem_maf + n
+        else:
+            rem_inds = rem_inds + n
+
+text = """
+*-noindent
+Using this approach, 
+*-begin{itemize}
+*-item %d SNPs that are non-autosomal were removed;
+*-item %d SNPs were removed due missing genotype threshold constraints;
+*-item %d individuals were removed due to missing genotype constraints (the list of missing individuals, if any, can be found in the file *-url{%s});
+*-item %d SNPs were removed as the MAF was too low.
+*-item %d SNPs were removed as they were out of the specified Hardy-Weinberg equilibrium. 
+*-end{itemize}
+""" %(nonautosome,rem_snps,rem_inds,rem_name,rem_maf,rem_hwe)
+
+print(text.replace("*-",chr(92)).replace("##",chr(36)))