Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 88fdcea
Showing
92 changed files
with
11,169 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
.git/ | ||
.nextflow* | ||
.idea* | ||
work/ | ||
output/* | ||
input/* | ||
/config-gen/nbproject/private/ | ||
/config-gen/build/ |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
|
||
|
||
|
||
import pandas as pd | ||
|
||
|
||
m = pd.read_csv("/dataB/AWIGenGWAS/aux/H3Africa_2017_20021485_A3.csv",skiprows=7,usecols=["Name","IlmnStrand","RefStrand"],delimiter=",",dtype={"Chr":str}) | ||
#s = pd.read_csv("/dataB/AWIGenGWAS/aux/H3Africa_2017_20021485_A3_StrandReport_FT.txt",usecols=["SNP_Name","Forward_Allele1","Top_AlleleA"],delim_whitespace=True,comment="#",dtype={"Chr":str}) | ||
|
||
xx = (m[(m['RefStrand']=="+")&(m['IlmnStrand']=="TOP")])["Name"] | ||
for snp in xx.values: | ||
print(snp) | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
head -n 8 $manifest > m.csv | ||
grep $1 $manifest >> m.csv | ||
head -n 6 $strand > s.csv | ||
grep $1 $strand >> s.csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
|
||
from __future__ import print_function | ||
|
||
import pandas as pd | ||
from Bio import SeqIO | ||
import glob | ||
import re | ||
import os | ||
import sys | ||
import gzip | ||
import distance | ||
import argparse | ||
|
||
|
||
TAB=chr(9) | ||
EOL=chr(10) | ||
|
||
def parseArguments(): | ||
parser=argparse.ArgumentParser() | ||
parser.add_argument('reference_genome_dir', type=str, metavar='referencegenomedir'), | ||
parser.add_argument('strand_report', type=str, metavar='strand_report'), | ||
parser.add_argument('chip_manifest', type=str, metavar='chip_manifest'), | ||
parser.add_argument('output', type=str, metavar='output',help="output report"), | ||
parser.add_argument('--chrom-only',type=str,default=False,dest="chrom_only") | ||
parser.add_argument('--seg-len', type=int, dest='seg_len', metavar='seg_len',\ | ||
default = 25, help="how much matches"), | ||
args = parser.parse_args() | ||
return args | ||
|
||
|
||
def getRef(path,build): | ||
match = os.path.join(path,str(build),"genomic","*fa.gz") | ||
all_files = glob.glob(match) | ||
genome = {} | ||
for fname in all_files: | ||
m = re.search(".*.chromosome\.(\w+)\.fa.gz",fname) | ||
if not m: | ||
sys.exit("Illegal file "+fname) | ||
chrom = m.group(1) | ||
if chrom not in chroms: continue | ||
print(chrom,end="\t") | ||
seq = SeqIO.read(gzip.open(fname,"rt"),"fasta") | ||
genome[chrom]=seq | ||
print() | ||
return genome | ||
|
||
|
||
def getData(directory, strand_fn, manifest_fn): | ||
genome = [None]*40 | ||
|
||
for build in [36,37]: | ||
print("Reading in build ",build) | ||
genome[build] = getRef(directory,build) | ||
|
||
mf = pd.read_csv(manifest_fn,delimiter=",",dtype={"Chr":str}, skiprows=7) | ||
strand = \ | ||
pd.read_csv(strand_fn,delim_whitespace=True,dtype={"Chr":str}, comment="#") | ||
return (genome, mf, strand) | ||
|
||
|
||
comp = {'A':'T','C':'G','G':'C','T':'A'} | ||
|
||
|
||
def rc(seq): | ||
m = "" | ||
for x in seq: | ||
m = comp.get(x,x)+m | ||
return m | ||
|
||
|
||
def match(ref_left,probe_left,ref_right,probe_right,RC,dist): | ||
if RC: | ||
tmp = probe_left | ||
probe_left = rc(probe_right) | ||
probe_right = rc(tmp) | ||
pl_len = len(probe_left) | ||
pl_rgt = len(probe_right) | ||
pleft = probe_left[-min(seg_len,pl_len):] | ||
pright = probe_right[:min(seg_len,pl_rgt)] | ||
rleft = ref_left[-min(seg_len,pl_len):].seq | ||
rright = ref_right[:min(seg_len,pl_rgt)].seq | ||
try: | ||
d1 = dist(rleft,pleft) | ||
d2 = dist(rright,pright) | ||
except ValueError: | ||
print("\nString length mismatch error,i,",snp["SNP_Name"]) | ||
print("LEFT: ",rleft,len(rleft),pleft,len(pleft),"RC=",RC) | ||
print("RIGHT: ",rright,len(rright),pright,len(pright)) | ||
return 50 | ||
return d1+d2 | ||
|
||
|
||
def warn(warnf,chrom_num,coord,snp,direc,score): | ||
if score>4: | ||
warnf.write(TAB.join(map(str,["WARN:",chrom_num,coord,snp,direc,score]))+EOL) | ||
|
||
def alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst,dist=distance.hamming): | ||
seq = chromosome[coord-seg_len:coord+seg_len+1] | ||
base = seq.seq[seg_len] | ||
ref_pre = seq[0:seg_len] | ||
ref_post = seq[seg_len+1:] | ||
fwd = match(ref_pre,probe_pre,ref_post,probe_pst,False,dist) | ||
rev = 2000 | ||
score = 1000 | ||
align = "-" | ||
if fwd<10: | ||
align="fwd" | ||
score=fwd | ||
warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"+",fwd) | ||
else: | ||
rev=match(ref_pre,probe_pre,ref_post,probe_pst,True,dist) | ||
if rev<10: | ||
align="rev" | ||
score=rev | ||
warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"-",rev) | ||
if the_snp in [ "[D/I]", "[I/D]"]: | ||
align="fwd" | ||
base ="I" | ||
return align, base, score, fwd, rev | ||
|
||
|
||
args = parseArguments() | ||
if args.chrom_only: | ||
chroms = [ args.chrom_only ] | ||
else: | ||
chroms = list(map(str, range(1,23)))+['X','Y','MT'] | ||
|
||
(genome,mf,strand) = getData(args.reference_genome_dir,args.strand_report,args.chip_manifest) | ||
|
||
|
||
|
||
|
||
g = open(args.output+".ref","w") | ||
warnf = open(args.output+".wrn","w") | ||
errf = open(args.output+".err","w") | ||
seg_len = args.seg_len | ||
for i,snp in strand.iterrows(): | ||
align = "fwd" | ||
base = snp["Top_AlleleA"] # if can't do better | ||
score = -1 | ||
coord = snp["Coord"]-1 | ||
build = int(snp['Build']) | ||
snp_name = snp["SNP_Name"] | ||
the_snp = mf.loc[i]["SNP"] | ||
chrom_num = snp["Chr"] | ||
if genome[build] == None: | ||
output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"build err"]))+EOL | ||
errf.write(output) | ||
output = TAB.join(map(str, [snp["SNP_Name"],chrom_num,coord+1,base,align]))+EOL | ||
g.write(output) | ||
continue | ||
if chrom_num not in genome[build]: | ||
output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"chrom_num_err"]))+EOL | ||
errf.write(output) | ||
output = TAB.join(map(str,[snp_name,chrom_num,coord+1,base,align]))+EOL | ||
g.write(output) | ||
continue | ||
top_seq = snp["Top_Seq"].upper() | ||
top_pre = top_seq.index("[") | ||
top_pst = top_seq.index("]")+1 | ||
alleles = top_seq[top_pre+1:top_pst-1].split("/") | ||
probe_pre = top_seq[:top_pre] | ||
probe_pst = top_seq[top_pst:] | ||
chromosome = genome[build]["X" if chrom_num == "XY" else chrom_num] | ||
align, base, score, fwd, rev = alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst) | ||
if score==1000: | ||
align, base, score, fwd,rev = alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst,distance.levenshtein) | ||
if score<10: | ||
warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"BM",min(fwd,rev)) | ||
else: | ||
output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"non_align",fwd,rev]))+EOL | ||
errf.write(output) | ||
output = TAB.join(map(str,[snp_name,chrom_num,coord+1,base,align]))+EOL | ||
g.write(output) | ||
g.close() | ||
errf.close() | ||
warnf.close() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import argparse | ||
import pandas as pd | ||
import sys | ||
import re | ||
|
||
def parseArguments(): | ||
parser=argparse.ArgumentParser() | ||
parser.add_argument('samplesheet', type=str, metavar='samplesheet'), | ||
parser.add_argument('updatesheet', type=str, metavar='updatesheet'), | ||
parser.add_argument('oldfam', type=str, metavar='oldfam'), | ||
parser.add_argument('output', type=str, metavar='output'), | ||
args = parser.parse_args() | ||
if "%s.fam"%args.output == args.oldfam: | ||
sys.exit("New and old fam cannot be the same") | ||
return args | ||
|
||
EOL=chr(10) | ||
PIDS = ['FID','IID'] | ||
|
||
def getFam(famf): | ||
fam = pd.read_csv(famf,delim_whitespace=True,header=None,\ | ||
names=["FID","IID","FAT","MAT","SEX","PHE"]) | ||
old = ['OFID','OIID'] | ||
fam['OFID']=fam['FID'] | ||
fam['OIID']=fam['IID'] | ||
fam.set_index(old, inplace=True) | ||
oldfam = fam.copy(deep=True) | ||
return oldfam,fam | ||
|
||
def getSmplLbl(data): | ||
m=re.search(".*_(\w+)",data["Institute Sample Label"]) | ||
if not m: sys.exit("Failed parsing "+data) | ||
return m.group(1) | ||
|
||
|
||
|
||
args = parseArguments() | ||
oldfam,fam = getFam(args.oldfam) | ||
|
||
orig = pd.read_excel(args.samplesheet) | ||
orig.set_index(["Institute Plate Label","Well"],inplace=True) | ||
orig['PID']=orig.apply(getSmplLbl,axis=1) | ||
update = pd.read_excel(args.updatesheet,skiprows=3) | ||
count=0 | ||
g=open("%s.errs"%args.output,"w") | ||
h=open("%s.switch"%args.output,"w") | ||
|
||
for i, row in update.iterrows(): | ||
if "IlluminaControl" in row["Institute Sample Label"]: continue | ||
new_lbl=getSmplLbl(row) | ||
pos = tuple(row[['Institute Plate Label','Well']].values) | ||
oldid = getSmplLbl(orig.loc[pos]) | ||
if oldid != new_lbl: | ||
h.write("%s -> %s \n"%(oldid,new_lbl)) | ||
count=count+1 | ||
if fam.index.contains((new_lbl,new_lbl)): | ||
fam.loc[(oldid,oldid),["FID","IID"]] = new_lbl | ||
fam.loc[(oldid,oldid),["FAT","MAT","SEX","PHE"]] = oldfam.loc[(new_lbl,new_lbl),["FAT","MAT","SEX", "PHE"]] | ||
else: | ||
g.write(new_lbl+EOL) | ||
continue | ||
if "unknown" in row["Institute Sample Label"]: | ||
print("Pos=<%s>; Old =<%s>; New=<%s>"%(pos,oldid,new_lbl)) | ||
print(fam.loc[(oldid,oldid)]['IID']) | ||
|
||
|
||
g.close() | ||
h.close() | ||
all_ids = fam[PIDS].to_records(index=False).tolist() | ||
uniq =set([]) | ||
orig.set_index('PID',inplace=True) | ||
for x in all_ids: | ||
if x in uniq: | ||
print("Duplicated element ",x[0],orig.loc[x[0],"Institute Sample Label"]) | ||
else: | ||
uniq.add(x) | ||
|
||
fam.to_csv("%s.fam"%args.output,sep="\t",header=None,index=False), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from __future__ import print_function | ||
|
||
|
||
import sys | ||
import re | ||
|
||
f = open(sys.argv[1]) | ||
rem_name=sys.argv[2] | ||
rem_inds = rem_snps = rem_maf = rem_hwe = 0 | ||
|
||
autosome=False | ||
nonautosome = 0 | ||
for line in f: | ||
if "Options in effect" in line: | ||
autosome=False | ||
continue | ||
if "--autosome" in line: | ||
autosome=True | ||
continue | ||
m =re.search("(\w+) out of (\w+) variants loaded from .bim file.", line) | ||
if m and autosome: | ||
nonautosome = int(m.group(2))-int(m.group(1)) | ||
continue | ||
m=re.search("(\w+) (\w+) removed due to ([-A-z0-9]+ \w+)",line) | ||
if m: | ||
#print(m.group(1),m.group(2),m.group(3)) | ||
n = int(m.group(1)) | ||
if m.group(2)=="variants": | ||
if m.group(3)=="missing genotype": | ||
rem_snps = rem_snps+n | ||
elif m.group(3)=="Hardy-Weinberg exact": | ||
rem_hwe = rem_hwe+n | ||
else: | ||
rem_maf = rem_maf + n | ||
else: | ||
rem_inds = rem_inds + n | ||
|
||
text = """ | ||
*-noindent | ||
Using this approach, | ||
*-begin{itemize} | ||
*-item %d SNPs that are non-autosomal were removed; | ||
*-item %d SNPs were removed due missing genotype threshold constraints; | ||
*-item %d individuals were removed due to missing genotype constraints (the list of missing individuals, if any, can be found in the file *-url{%s}); | ||
*-item %d SNPs were removed as the MAF was too low. | ||
*-item %d SNPs were removed as they were out of the specified Hardy-Weinberg equilibrium. | ||
*-end{itemize} | ||
""" %(nonautosome,rem_snps,rem_inds,rem_name,rem_maf,rem_hwe) | ||
|
||
print(text.replace("*-",chr(92)).replace("##",chr(36))) |
Oops, something went wrong.