-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
75 lines (56 loc) · 1.65 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from biopipe import *
import os
import itertools as it
WELLDERLY_DIR = "./data/wellderly/"
OTG_DIR = "./data/1000genomes/"
def preprocess(vcf):
"""
Uses biopipe to clean VCF. Steps are as follows:
- keep only SNPs (discard complex variants)
- reset variant IDs to consistent nomenclature
- switch to using VCFtools
- quality filter
- generate new VCF
- convert VCF to plink format
- keep SNPs with call rate >90%
- keeps SNPs with maf > 5%
- create final bed file
"""
base_name = re.sub("\.vcf.*$", "", vcf)
print "Cleaning %s" % (base_name)
vcf_cleaned = VCFToolsData(vcf) \
.maf(0.05) \
.recode() \
.to_vcf() \
.snps_only() \
.reset_ids()
bed_cleaned = vcf_cleaned. \
to_plink(). \
make_bed(). \
geno(0.1). \
maf(0.05). \
make_bed(base_name + "_cleaned")
return bed_cleaned
def merge_vcfs(directory, out=None):
"""
Preprocess and merge all VCF files in a directory
"""
files = get_raw_vcf_files(directory)
cleaned = [preprocess(f) for f in files]
print "Cleaning finished\n\nBeginning merge\n"
merged = multi_outer_join(cleaned, out=out)
return merged
def get_raw_vcf_files(directory):
files = map(lambda x: directory + x, os.listdir(directory))
files = filter(lambda x: "vcf" in x, files)
files = filter(lambda x: "cleaned" not in x, files)
files = filter(lambda x: "recode" not in x, files)
return files
if __name__ == "__main__":
# clean and merge VCF files for each data set
otg = merge_vcfs(OTG_DIR, out="./data/1000genomes")
well = merge_vcfs(WELLDERLY_DIR, out="./data/wellderly")
# merge wellderly and 1000 genomes
# filter for LD
# run PCA
data = inner_join(otg, well, out="./data/merged")