Permalink
Find file
Fetching contributors…
Cannot retrieve contributors at this time
347 lines (284 sloc) 12.3 KB
#! /usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use File::Spec qw(splitpath);
use DBI;
my $db_dir = ".";
my $db_name = "eforge_1.2.db";
my $array_tag = "27k";
my $array_name = "Illumina Human 27k array (subset of 450k only)";
my $species = "Homo sapiens";
my $proxy_threshold = 1000;
my $illumina27k_bpm_file = 'ftp://webdata:webdata@ussd-ftp.illumina.com/downloads/ProductFiles/HumanMethylation27/humanmethylation27_270596_v1-2.bpm';
my $illumina450k_csv_file = 'ftp://webdata2:webdata2@ussd-ftp.illumina.com/downloads/ProductFiles/HumanMethylation450/HumanMethylation450_15017482_v1-2.csv';
my $work_dir = ".";
my $bedtools = "bedtools";
my $help;
my $desc = qq{load_27k_array.pl [options]
DESCRIPTION:
This script loads the subset of probe locations and annotations of the Illumina Human 27k
methylation array that are shared with the 450k array into the eFORGE database.
Note that you *must* load the arrays before loading the datasets. If you want to include new arrays
at a later date, you will have to reload all the datasets again (i.e. you will have to re-start from
scratch).
Optional arguments:
--db_name <name>
is the name of the SQLite file [def: $db_name]
--db_dir <path>
is the location of the SQLite file [def: $db_dir]
--work_dir <path>
is the location where temporary files will be downloaded/created [def: $work_dir]
--bedtools <name>
is the name of the bedtools executable you want to use [def: $bedtools]
};
GetOptions(
"help" => \$help,
"db_name=s" => \$db_name,
"db_dir=s" => \$db_dir,
"work_dir=s" => \$work_dir,
"bedtools=s" => \$bedtools,
);
my $dsn = "dbi:SQLite:dbname=$db_dir/$db_name";
my $dbh = DBI->connect($dsn, "", "") or die $DBI::errstr;
my ($vol, $path, $file) = File::Spec->splitpath($illumina450k_csv_file);
if (!-e "$work_dir/$file") {
download_url($illumina450k_csv_file, $work_dir);
}
my $array450k_info = parse_450k_file("$work_dir/$file");
my ($vol27k, $path27k, $file27k) = File::Spec->splitpath($illumina27k_bpm_file);
if (!-e "$work_dir/$file27k") {
download_url($illumina27k_bpm_file, $work_dir);
}
my $array27k_info = parse_27k_file("$work_dir/$file27k");
my $array_info = subset_array($array450k_info, $array27k_info);
load_array($dbh, $db_name, $species, $array_tag, $array_name, $array_info);
load_proxy_filter($dbh, $db_name, $array_tag, "GRCh37", $work_dir, $proxy_threshold);
exit();
sub download_url {
my ($url, $work_dir) = @_;
print "Downloading $url...\n";
system("wget -q -N -P $work_dir $url");
}
sub parse_450k_file {
my ($illumina450k_csv_file) = @_;
my $array;
open(CSV, $illumina450k_csv_file) or die;
my $annotation;
my @gene_annotations = ("TSS200", "TSS1500", "1stExon", "Body", "3'UTR", "5'UTR", "IGR");
my $cpg_annotations = {
'N_Shelf' => "Shelf_Shore",
'S_Shelf' => "Shelf_Shore",
'N_Shore' => "Shelf_Shore",
'S_Shore' => "Shelf_Shore",
'Island' => "Island"};
while (<CSV>) {
chomp;
my @data = split(",", $_);
my $probe_id = $data[0];
next if ($probe_id !~ /^cg/ and $probe_id !~ /^ch\./);
my $probe_chr37 = $data[11];
my $probe_loc37 = $data[12];
my $probe_chr36 = $data[14];
my $probe_loc36 = $data[15];
my $this_gene_annotation_arrayStr = $data[23];
my $this_cpg_annotation = $data[25]?$cpg_annotations->{$data[25]}:"NA";
my $this_gene_annotation;
if (!defined($this_gene_annotation_arrayStr) or $this_gene_annotation_arrayStr eq "") {
$this_gene_annotation = "IGR";
} else {
my @this_gene_annotations = split(";", $this_gene_annotation_arrayStr);
foreach my $this_a (@this_gene_annotations) {
if (grep {$_ eq $this_a} @gene_annotations) {
$this_gene_annotation = $this_a;
last;
}
}
}
$annotation->{$this_gene_annotation."-".$this_cpg_annotation}++;
$array->{$probe_id} = [$probe_chr36, $probe_loc36, $probe_chr37, $probe_loc37, $this_gene_annotation, $this_cpg_annotation];
}
close(CSV);
# foreach my $this_a (keys %$annotation) { #@gene_annotations) {
# print $annotation->{$this_a}, "\t", $this_a, "\n";
# }
return $array;
}
sub parse_27k_file {
my ($illumina27k_bpm_file) = @_;
my $array;
open(CSV, $illumina27k_bpm_file) or die;
my $block = "";
while (<CSV>) {
if (/^\[(\w+)\]/) {
$block = $1;
}
next if ($block ne "Assay");
chomp;
my @data = split(",", $_);
my $probe_id = $data[0];
my $name = $data[1];
my $probe_chr36 = $data[8];
my $probe_loc36 = $data[9];
if ($probe_id !~ /^cg/ and $probe_id !~ /^ch\./) {
# print "Skipping $probe_id\n";
next;
}
if ($probe_id ne $name) {
print "ERROR: $probe_id ne $name\n";
}
$array->{$probe_id} = [$probe_chr36, $probe_loc36];
}
close(CSV);
return $array;
}
sub subset_array {
my ($full_array, $sub_array) = @_;
my $array;
foreach my $probe_id (keys %$full_array) {
if ($sub_array->{$probe_id}) {
$array->{$probe_id} = $full_array->{$probe_id};
print "MISMATCH $probe_id\n" if (($full_array->{$probe_id}->[0] ne $sub_array->{$probe_id}->[0])
or
($full_array->{$probe_id}->[1] != $sub_array->{$probe_id}->[1]));
}
}
return $array;
}
sub load_array_1 {
my ($dbh, $code_name, $array_info) = @_;
my $table_name = $code_name;
$table_name =~ s/\W//g;
$table_name = "array_$table_name";
$dbh->do("CREATE TABLE IF NOT EXISTS $table_name (probe_id, location, gene_group, cgi_group)");
my $sth = $dbh->prepare("INSERT INTO array_$code_name VALUES (?, ?, ?, ?)");
foreach my $this_probe_id (sort keys $array_info) {
my ($chr, $loc, $gene_group, $cgi_group) = @{$array_info->{$this_probe_id}};
my $location = "$chr:$loc-$loc";
$sth->execute($this_probe_id, $location, $gene_group, $cgi_group);
}
$sth->finish();
return $table_name;
}
sub load_array {
my ($dbh, $db_name, $species_name, $array_tag, $array_name, $array_info) = @_;
my $sth;
$sth = $dbh->prepare("INSERT OR IGNORE INTO array (array_tag, array_name, species_name) VALUES (?, ?, ?)");
$sth->execute($array_tag, $array_name, $species_name);
my $array_id = $dbh->last_insert_id("", "", "", "");
$sth->finish();
if ($array_id == 0) {
$array_id = $dbh->selectrow_array("SELECT array_id FROM array WHERE array_name = '$array_name' AND species_name = 'Homo sapiens'");
}
$sth = $dbh->prepare("INSERT OR IGNORE INTO assembly (species_name, assembly_name) VALUES (?, ?)");
$sth->execute($species_name, "GRCh37");
$sth->execute($species_name, "NCBI36");
$sth->finish();
my $human36_assembly_id = $dbh->selectrow_array("SELECT assembly_id FROM assembly WHERE species_name = 'Homo sapiens' AND assembly_name = 'NCBI36'");
my $human37_assembly_id = $dbh->selectrow_array("SELECT assembly_id FROM assembly WHERE species_name = 'Homo sapiens' AND assembly_name = 'GRCh37'");
$sth = $dbh->prepare("INSERT OR IGNORE INTO probe_mapping_info (array_id, assembly_id, url) VALUES (?, ?, ?)");
$sth->execute($array_id, $human36_assembly_id, "");
my $probe_mapping_human36_id = $dbh->last_insert_id("", "", "", "");
$sth->execute($array_id, $human37_assembly_id, "");
my $probe_mapping_human37_id = $dbh->last_insert_id("", "", "", "");
$sth->finish();
$sth = $dbh->prepare("INSERT OR IGNORE INTO probe_annotation_info (array_id, gene_reference_name, cgi_reference_name, url) VALUES (?, ?, ?, ?)");
$sth->execute($array_id, "RefSeq", "UCSC CpG Islands", "");
# my $probe_annotation_id = $dbh->last_insert_id("", "", "", "");
$sth->finish();
open(CSV1, ">probe_mapping.csv") or die;
open(CSV2, ">probe_annotation.csv") or die;
# my $sth1 = $dbh->prepare("INSERT OR IGNORE INTO probe_mapping (probe_mapping_id, probe_id, chr_name, position) VALUES (?, ?, ?, ?)");
# my $sth2 = $dbh->prepare("INSERT OR IGNORE INTO probe_annotation (probe_annotation_id, probe_id, gene_group, cgi_group) VALUES (?, ?, ?, ?)");
foreach my $this_probe_id (sort keys $array_info) {
my ($chr36, $loc36, $chr37, $loc37, $gene_group, $cgi_group) = @{$array_info->{$this_probe_id}};
print CSV1 join(",", $probe_mapping_human36_id, $this_probe_id, "chr$chr36", $loc36), "\n";
print CSV1 join(",", $probe_mapping_human37_id, $this_probe_id, "chr$chr37", $loc37), "\n";
# print CSV2 join(",", $probe_annotation_id, $array_id, $this_probe_id, $gene_group, $cgi_group), "\n";
print CSV2 join(",", $array_id, $this_probe_id, $gene_group, $cgi_group), "\n";
# $sth1->execute($probe_mapping_human36_id, $this_probe_id, "chr$chr36", $loc36) if ($probe_mapping_human36_id);
# $sth1->execute($probe_mapping_human37_id, $this_probe_id, "chr$chr37", $loc37) if ($probe_mapping_human37_id);
# $sth2->execute($probe_annotation_id, $this_probe_id, $gene_group, $cgi_group) if ($probe_annotation_id);
}
close(CSV1);
close(CSV2);
system("echo '.mode csv
.import probe_mapping.csv probe_mapping
.import probe_annotation.csv probe_annotation' | sqlite3 $db_name");
# $sth1->finish();
# $sth2->finish();
}
sub load_proxy_filter {
my ($dbh, $db_name, $array_tag, $assembly_name, $work_dir, $distance_threshold) = @_;
my $array_id = $dbh->selectrow_arrayref("SELECT array_id FROM array WHERE array_tag = '$array_tag'")->[0];
my $bed_file = dump_array_bed_file($dbh, $array_id, $assembly_name, $work_dir);
my $this_output_bed_file = "$work_dir/array_${array_id}.proxy.bed";
my $runstr = "$bedtools window -w $distance_threshold -a $bed_file -b $bed_file > $this_output_bed_file";
system($runstr) == 0 or die "Error while running bedtools: $?";
my $sth = $dbh->prepare("INSERT OR IGNORE INTO proxy_filter_info (array_id, description) VALUES (?, ?)");
$sth->execute($array_id, "1kb");
open(BED, $this_output_bed_file) or die;
my $all_probes;
my $mapping_probes;
while (<BED>) {
chomp;
my ($chr1, $start1, $end1, $probe1, $chr2, $start2, $end2, $probe2) = split("\t", $_);
$all_probes->{$probe1} = 1;
if ($probe1 ne $probe2) {
$mapping_probes->{$probe1}->{$probe2} = 1;
}
}
close(BED);
open(CSV, ">proxy_filter.csv") or die;
foreach my $probe (sort keys %$all_probes) {
if (defined($mapping_probes->{$probe})) {
print CSV "$array_id,$probe,", join("|", sort keys %{$mapping_probes->{$probe}}), "\n";
} else {
print CSV "$array_id,$probe,NONE\n";
}
}
close(CSV);
system("echo '.mode csv
.import proxy_filter.csv proxy_filter' | sqlite3 $db_name");
}
sub dump_array_bed_file {
my ($dbh, $this_array_id, $assembly_name, $work_dir) = @_;
my $sql = "SELECT probe_mapping_id
FROM probe_mapping_info
JOIN assembly USING (assembly_id)
WHERE array_id = $this_array_id
AND assembly_name = '$assembly_name'";
my $probe_mapping_id = $dbh->selectrow_array($sql);
if (!$probe_mapping_id) {
die "Cannot find a mapping for array $this_array_id and assembly $assembly_name\n";
}
$sql = "SELECT probe_id, chr_name, position FROM probe_mapping WHERE probe_mapping_id = $probe_mapping_id";
my $probes = $dbh->selectall_arrayref($sql);
my $bed_file = "$work_dir/array_${this_array_id}.bed";
open(BED, ">$bed_file") or die;
foreach my $this_probe (sort _sort_probes @$probes) {
print BED join("\t", $this_probe->[1], $this_probe->[2], $this_probe->[2]+1, $this_probe->[0]), "\n";
}
close(BED);
return $bed_file;
}
sub _sort_probes {
my $chr_a = $a->[1];
my $chr_b = $b->[1];
my $loc_a = $a->[2];
my $loc_b = $b->[2];
$chr_a =~ s/chr//;
$chr_b =~ s/chr//;
if ($chr_a eq $chr_b) {
return $loc_a <=> $loc_b;
} elsif ($chr_a =~ /^\d/ and $chr_b =~ /^\d/) {
return $chr_a <=> $chr_b;
} elsif ($chr_a =~ /^\d/) {
return -1;
} elsif ($chr_b =~ /^\d/) {
return 1;
} else {
return $chr_a cmp $chr_b;
}
}
exit();