Skip to content

Commit

Permalink
init version 2 from Wits
Browse files Browse the repository at this point in the history
  • Loading branch information
shaze committed Jan 31, 2018
0 parents commit 88fdcea
Show file tree
Hide file tree
Showing 92 changed files with 11,169 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
.git/
.nextflow*
.idea*
work/
output/*
input/*
/config-gen/nbproject/private/
/config-gen/build/
749 changes: 749 additions & 0 deletions README.md

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions aux/check.py
@@ -0,0 +1,15 @@



import pandas as pd


m = pd.read_csv("/dataB/AWIGenGWAS/aux/H3Africa_2017_20021485_A3.csv",skiprows=7,usecols=["Name","IlmnStrand","RefStrand"],delimiter=",",dtype={"Chr":str})
#s = pd.read_csv("/dataB/AWIGenGWAS/aux/H3Africa_2017_20021485_A3_StrandReport_FT.txt",usecols=["SNP_Name","Forward_Allele1","Top_AlleleA"],delim_whitespace=True,comment="#",dtype={"Chr":str})

xx = (m[(m['RefStrand']=="+")&(m['IlmnStrand']=="TOP")])["Name"]
for snp in xx.values:
print(snp)



4 changes: 4 additions & 0 deletions aux/fake.sh
@@ -0,0 +1,4 @@
head -n 8 $manifest > m.csv
grep $1 $manifest >> m.csv
head -n 6 $strand > s.csv
grep $1 $strand >> s.csv
178 changes: 178 additions & 0 deletions aux/make_ref.py
@@ -0,0 +1,178 @@

from __future__ import print_function

import pandas as pd
from Bio import SeqIO
import glob
import re
import os
import sys
import gzip
import distance
import argparse


TAB=chr(9)
EOL=chr(10)

def parseArguments():
parser=argparse.ArgumentParser()
parser.add_argument('reference_genome_dir', type=str, metavar='referencegenomedir'),
parser.add_argument('strand_report', type=str, metavar='strand_report'),
parser.add_argument('chip_manifest', type=str, metavar='chip_manifest'),
parser.add_argument('output', type=str, metavar='output',help="output report"),
parser.add_argument('--chrom-only',type=str,default=False,dest="chrom_only")
parser.add_argument('--seg-len', type=int, dest='seg_len', metavar='seg_len',\
default = 25, help="how much matches"),
args = parser.parse_args()
return args


def getRef(path,build):
match = os.path.join(path,str(build),"genomic","*fa.gz")
all_files = glob.glob(match)
genome = {}
for fname in all_files:
m = re.search(".*.chromosome\.(\w+)\.fa.gz",fname)
if not m:
sys.exit("Illegal file "+fname)
chrom = m.group(1)
if chrom not in chroms: continue
print(chrom,end="\t")
seq = SeqIO.read(gzip.open(fname,"rt"),"fasta")
genome[chrom]=seq
print()
return genome


def getData(directory, strand_fn, manifest_fn):
genome = [None]*40

for build in [36,37]:
print("Reading in build ",build)
genome[build] = getRef(directory,build)

mf = pd.read_csv(manifest_fn,delimiter=",",dtype={"Chr":str}, skiprows=7)
strand = \
pd.read_csv(strand_fn,delim_whitespace=True,dtype={"Chr":str}, comment="#")
return (genome, mf, strand)


comp = {'A':'T','C':'G','G':'C','T':'A'}


def rc(seq):
m = ""
for x in seq:
m = comp.get(x,x)+m
return m


def match(ref_left,probe_left,ref_right,probe_right,RC,dist):
if RC:
tmp = probe_left
probe_left = rc(probe_right)
probe_right = rc(tmp)
pl_len = len(probe_left)
pl_rgt = len(probe_right)
pleft = probe_left[-min(seg_len,pl_len):]
pright = probe_right[:min(seg_len,pl_rgt)]
rleft = ref_left[-min(seg_len,pl_len):].seq
rright = ref_right[:min(seg_len,pl_rgt)].seq
try:
d1 = dist(rleft,pleft)
d2 = dist(rright,pright)
except ValueError:
print("\nString length mismatch error,i,",snp["SNP_Name"])
print("LEFT: ",rleft,len(rleft),pleft,len(pleft),"RC=",RC)
print("RIGHT: ",rright,len(rright),pright,len(pright))
return 50
return d1+d2


def warn(warnf,chrom_num,coord,snp,direc,score):
if score>4:
warnf.write(TAB.join(map(str,["WARN:",chrom_num,coord,snp,direc,score]))+EOL)

def alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst,dist=distance.hamming):
seq = chromosome[coord-seg_len:coord+seg_len+1]
base = seq.seq[seg_len]
ref_pre = seq[0:seg_len]
ref_post = seq[seg_len+1:]
fwd = match(ref_pre,probe_pre,ref_post,probe_pst,False,dist)
rev = 2000
score = 1000
align = "-"
if fwd<10:
align="fwd"
score=fwd
warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"+",fwd)
else:
rev=match(ref_pre,probe_pre,ref_post,probe_pst,True,dist)
if rev<10:
align="rev"
score=rev
warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"-",rev)
if the_snp in [ "[D/I]", "[I/D]"]:
align="fwd"
base ="I"
return align, base, score, fwd, rev


args = parseArguments()
if args.chrom_only:
chroms = [ args.chrom_only ]
else:
chroms = list(map(str, range(1,23)))+['X','Y','MT']

(genome,mf,strand) = getData(args.reference_genome_dir,args.strand_report,args.chip_manifest)




g = open(args.output+".ref","w")
warnf = open(args.output+".wrn","w")
errf = open(args.output+".err","w")
seg_len = args.seg_len
for i,snp in strand.iterrows():
align = "fwd"
base = snp["Top_AlleleA"] # if can't do better
score = -1
coord = snp["Coord"]-1
build = int(snp['Build'])
snp_name = snp["SNP_Name"]
the_snp = mf.loc[i]["SNP"]
chrom_num = snp["Chr"]
if genome[build] == None:
output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"build err"]))+EOL
errf.write(output)
output = TAB.join(map(str, [snp["SNP_Name"],chrom_num,coord+1,base,align]))+EOL
g.write(output)
continue
if chrom_num not in genome[build]:
output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"chrom_num_err"]))+EOL
errf.write(output)
output = TAB.join(map(str,[snp_name,chrom_num,coord+1,base,align]))+EOL
g.write(output)
continue
top_seq = snp["Top_Seq"].upper()
top_pre = top_seq.index("[")
top_pst = top_seq.index("]")+1
alleles = top_seq[top_pre+1:top_pst-1].split("/")
probe_pre = top_seq[:top_pre]
probe_pst = top_seq[top_pst:]
chromosome = genome[build]["X" if chrom_num == "XY" else chrom_num]
align, base, score, fwd, rev = alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst)
if score==1000:
align, base, score, fwd,rev = alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst,distance.levenshtein)
if score<10:
warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"BM",min(fwd,rev))
else:
output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"non_align",fwd,rev]))+EOL
errf.write(output)
output = TAB.join(map(str,[snp_name,chrom_num,coord+1,base,align]))+EOL
g.write(output)
g.close()
errf.close()
warnf.close()

80 changes: 80 additions & 0 deletions aux/updateFam.py
@@ -0,0 +1,80 @@
#!/usr/bin/env python3

import argparse
import pandas as pd
import sys
import re

def parseArguments():
parser=argparse.ArgumentParser()
parser.add_argument('samplesheet', type=str, metavar='samplesheet'),
parser.add_argument('updatesheet', type=str, metavar='updatesheet'),
parser.add_argument('oldfam', type=str, metavar='oldfam'),
parser.add_argument('output', type=str, metavar='output'),
args = parser.parse_args()
if "%s.fam"%args.output == args.oldfam:
sys.exit("New and old fam cannot be the same")
return args

EOL=chr(10)
PIDS = ['FID','IID']

def getFam(famf):
fam = pd.read_csv(famf,delim_whitespace=True,header=None,\
names=["FID","IID","FAT","MAT","SEX","PHE"])
old = ['OFID','OIID']
fam['OFID']=fam['FID']
fam['OIID']=fam['IID']
fam.set_index(old, inplace=True)
oldfam = fam.copy(deep=True)
return oldfam,fam

def getSmplLbl(data):
m=re.search(".*_(\w+)",data["Institute Sample Label"])
if not m: sys.exit("Failed parsing "+data)
return m.group(1)



args = parseArguments()
oldfam,fam = getFam(args.oldfam)

orig = pd.read_excel(args.samplesheet)
orig.set_index(["Institute Plate Label","Well"],inplace=True)
orig['PID']=orig.apply(getSmplLbl,axis=1)
update = pd.read_excel(args.updatesheet,skiprows=3)
count=0
g=open("%s.errs"%args.output,"w")
h=open("%s.switch"%args.output,"w")

for i, row in update.iterrows():
if "IlluminaControl" in row["Institute Sample Label"]: continue
new_lbl=getSmplLbl(row)
pos = tuple(row[['Institute Plate Label','Well']].values)
oldid = getSmplLbl(orig.loc[pos])
if oldid != new_lbl:
h.write("%s -> %s \n"%(oldid,new_lbl))
count=count+1
if fam.index.contains((new_lbl,new_lbl)):
fam.loc[(oldid,oldid),["FID","IID"]] = new_lbl
fam.loc[(oldid,oldid),["FAT","MAT","SEX","PHE"]] = oldfam.loc[(new_lbl,new_lbl),["FAT","MAT","SEX", "PHE"]]
else:
g.write(new_lbl+EOL)
continue
if "unknown" in row["Institute Sample Label"]:
print("Pos=<%s>; Old =<%s>; New=<%s>"%(pos,oldid,new_lbl))
print(fam.loc[(oldid,oldid)]['IID'])


g.close()
h.close()
all_ids = fam[PIDS].to_records(index=False).tolist()
uniq =set([])
orig.set_index('PID',inplace=True)
for x in all_ids:
if x in uniq:
print("Duplicated element ",x[0],orig.loc[x[0],"Institute Sample Label"])
else:
uniq.add(x)

fam.to_csv("%s.fam"%args.output,sep="\t",header=None,index=False),
52 changes: 52 additions & 0 deletions bin/qc1logextract.py
@@ -0,0 +1,52 @@
#!/usr/bin/env python3

from __future__ import print_function


import sys
import re

f = open(sys.argv[1])
rem_name=sys.argv[2]
rem_inds = rem_snps = rem_maf = rem_hwe = 0

autosome=False
nonautosome = 0
for line in f:
if "Options in effect" in line:
autosome=False
continue
if "--autosome" in line:
autosome=True
continue
m =re.search("(\w+) out of (\w+) variants loaded from .bim file.", line)
if m and autosome:
nonautosome = int(m.group(2))-int(m.group(1))
continue
m=re.search("(\w+) (\w+) removed due to ([-A-z0-9]+ \w+)",line)
if m:
#print(m.group(1),m.group(2),m.group(3))
n = int(m.group(1))
if m.group(2)=="variants":
if m.group(3)=="missing genotype":
rem_snps = rem_snps+n
elif m.group(3)=="Hardy-Weinberg exact":
rem_hwe = rem_hwe+n
else:
rem_maf = rem_maf + n
else:
rem_inds = rem_inds + n

text = """
*-noindent
Using this approach,
*-begin{itemize}
*-item %d SNPs that are non-autosomal were removed;
*-item %d SNPs were removed due missing genotype threshold constraints;
*-item %d individuals were removed due to missing genotype constraints (the list of missing individuals, if any, can be found in the file *-url{%s});
*-item %d SNPs were removed as the MAF was too low.
*-item %d SNPs were removed as they were out of the specified Hardy-Weinberg equilibrium.
*-end{itemize}
""" %(nonautosome,rem_snps,rem_inds,rem_name,rem_maf,rem_hwe)

print(text.replace("*-",chr(92)).replace("##",chr(36)))

0 comments on commit 88fdcea

Please sign in to comment.