Permalink
Browse files

Include 27k array

  • Loading branch information...
1 parent 386dbe2 commit e96ba5b360135475f3d9af21e16b617fd8b08fa0 @jherrero jherrero committed May 9, 2016
Showing with 346 additions and 0 deletions.
  1. +346 −0 database/load_27k_array.pl
View
@@ -0,0 +1,346 @@
+#! /usr/bin/env perl
+use strict;
+use warnings;
+
+use Getopt::Long;
+use File::Spec qw(splitpath);
+use DBI;
+
+my $db_dir = ".";
+my $db_name = "eforge_1.2.db";
+my $array_tag = "27k";
+my $array_name = "Illumina Human 27k array (subset of 450k only)";
+my $species = "Homo sapiens";
+my $proxy_threshold = 1000;
+my $illumina27k_bpm_file = 'ftp://webdata:webdata@ussd-ftp.illumina.com/downloads/ProductFiles/HumanMethylation27/humanmethylation27_270596_v1-2.bpm';
+my $illumina450k_csv_file = 'ftp://webdata2:webdata2@ussd-ftp.illumina.com/downloads/ProductFiles/HumanMethylation450/HumanMethylation450_15017482_v1-2.csv';
+my $work_dir = ".";
+my $bedtools = "bedtools";
+
+my $help;
+
+my $desc = qq{load_27k_array.pl [options]
+
+DESCRIPTION:
+
+This script loads the subset of probe locations and annotations of the Illumina Human 27k
+methylation array that are shared with the 450k array into the eFORGE database.
+
+Note that you *must* load the arrays before loading the datasets. If you want to include new arrays
+at a later date, you will have to reload all the datasets again (i.e. you will have to re-start from
+scratch).
+
+Optional arguments:
+ --db_name <name>
+ is the name of the SQLite file [def: $db_name]
+ --db_dir <path>
+ is the location of the SQLite file [def: $db_dir]
+ --work_dir <path>
+ is the location where temporary files will be downloaded/created [def: $work_dir]
+ --bedtools <name>
+ is the name of the bedtools executable you want to use [def: $bedtools]
+};
+
+GetOptions(
+ "help" => \$help,
+ "db_name=s" => \$db_name,
+ "db_dir=s" => \$db_dir,
+ "work_dir=s" => \$work_dir,
+ "bedtools=s" => \$bedtools,
+ );
+
+my $dsn = "dbi:SQLite:dbname=$db_dir/$db_name";
+my $dbh = DBI->connect($dsn, "", "") or die $DBI::errstr;
+
+my ($vol, $path, $file) = File::Spec->splitpath($illumina450k_csv_file);
+
+if (!-e "$work_dir/$file") {
+ download_url($illumina450k_csv_file, $work_dir);
+}
+
+my $array450k_info = parse_450k_file("$work_dir/$file");
+
+my ($vol27k, $path27k, $file27k) = File::Spec->splitpath($illumina27k_bpm_file);
+
+if (!-e "$work_dir/$file27k") {
+ download_url($illumina27k_bpm_file, $work_dir);
+}
+
+my $array27k_info = parse_27k_file("$work_dir/$file27k");
+
+my $array_info = subset_array($array450k_info, $array27k_info);
+
+load_array($dbh, $db_name, $species, $array_tag, $array_name, $array_info);
+
+load_proxy_filter($dbh, $db_name, $array_tag, "GRCh37", $work_dir, $proxy_threshold);
+
+exit();
+
+
+sub download_url {
+ my ($url, $work_dir) = @_;
+
+ print "Downloading $url...\n";
+ system("wget -q -N -P $work_dir $url");
+}
+
+
+sub parse_450k_file {
+ my ($illumina450k_csv_file) = @_;
+ my $array;
+
+ open(CSV, $illumina450k_csv_file) or die;
+ my $annotation;
+ my @gene_annotations = ("TSS200", "TSS1500", "1stExon", "Body", "3'UTR", "5'UTR", "IGR");
+ my $cpg_annotations = {
+ 'N_Shelf' => "Shelf_Shore",
+ 'S_Shelf' => "Shelf_Shore",
+ 'N_Shore' => "Shelf_Shore",
+ 'S_Shore' => "Shelf_Shore",
+ 'Island' => "Island"};
+ while (<CSV>) {
+ chomp;
+ my @data = split(",", $_);
+ my $probe_id = $data[0];
+ next if ($probe_id !~ /^cg/ and $probe_id !~ /^ch\./);
+ my $probe_chr37 = $data[11];
+ my $probe_loc37 = $data[12];
+ my $probe_chr36 = $data[14];
+ my $probe_loc36 = $data[15];
+ my $this_gene_annotation_arrayStr = $data[23];
+ my $this_cpg_annotation = $data[25]?$cpg_annotations->{$data[25]}:"NA";
+
+ my $this_gene_annotation;
+ if (!defined($this_gene_annotation_arrayStr) or $this_gene_annotation_arrayStr eq "") {
+ $this_gene_annotation = "IGR";
+ } else {
+ my @this_gene_annotations = split(";", $this_gene_annotation_arrayStr);
+ foreach my $this_a (@this_gene_annotations) {
+ if (grep {$_ eq $this_a} @gene_annotations) {
+ $this_gene_annotation = $this_a;
+ last;
+ }
+ }
+ }
+
+ $annotation->{$this_gene_annotation."-".$this_cpg_annotation}++;
+
+ $array->{$probe_id} = [$probe_chr36, $probe_loc36, $probe_chr37, $probe_loc37, $this_gene_annotation, $this_cpg_annotation];
+ }
+ close(CSV);
+# foreach my $this_a (keys %$annotation) { #@gene_annotations) {
+# print $annotation->{$this_a}, "\t", $this_a, "\n";
+# }
+
+ return $array;
+}
+
+
+sub parse_27k_file {
+ my ($illumina27k_bpm_file) = @_;
+ my $array;
+
+ open(CSV, $illumina27k_bpm_file) or die;
+ my $block = "";
+ while (<CSV>) {
+ if (/^\[(\w+)\]/) {
+ $block = $1;
+ }
+ next if ($block ne "Assay");
+
+ chomp;
+ my @data = split(",", $_);
+ my $probe_id = $data[0];
+ my $name = $data[1];
+ my $probe_chr36 = $data[8];
+ my $probe_loc36 = $data[9];
+
+ if ($probe_id !~ /^cg/ and $probe_id !~ /^ch\./) {
+# print "Skipping $probe_id\n";
+ next;
+ }
+ if ($probe_id ne $name) {
+ print "ERROR: $probe_id ne $name\n";
+ }
+ $array->{$probe_id} = [$probe_chr36, $probe_loc36];
+ }
+ close(CSV);
+
+ return $array;
+}
+
+sub subset_array {
+ my ($full_array, $sub_array) = @_;
+ my $array;
+ foreach my $probe_id (keys %$full_array) {
+ if ($sub_array->{$probe_id}) {
+ $array->{$probe_id} = $full_array->{$probe_id};
+ print "MISMATCH $probe_id\n" if (($full_array->{$probe_id}->[0] ne $sub_array->{$probe_id}->[0])
+ or
+ ($full_array->{$probe_id}->[1] != $sub_array->{$probe_id}->[1]));
+ }
+ }
+ return $array;
+}
+
+
+sub load_array_1 {
+ my ($dbh, $code_name, $array_info) = @_;
+
+ my $table_name = $code_name;
+ $table_name =~ s/\W//g;
+ $table_name = "array_$table_name";
+ $dbh->do("CREATE TABLE IF NOT EXISTS $table_name (probe_id, location, gene_group, cgi_group)");
+
+ my $sth = $dbh->prepare("INSERT INTO array_$code_name VALUES (?, ?, ?, ?)");
+ foreach my $this_probe_id (sort keys $array_info) {
+ my ($chr, $loc, $gene_group, $cgi_group) = @{$array_info->{$this_probe_id}};
+ my $location = "$chr:$loc-$loc";
+ $sth->execute($this_probe_id, $location, $gene_group, $cgi_group);
+ }
+ $sth->finish();
+
+ return $table_name;
+}
+
+
+sub load_array {
+ my ($dbh, $db_name, $species_name, $array_tag, $array_name, $array_info) = @_;
+
+ my $sth;
+ $sth = $dbh->prepare("INSERT OR IGNORE INTO array (array_tag, array_name, species_name) VALUES (?, ?, ?)");
+ $sth->execute($array_tag, $array_name, $species_name);
+ my $array_id = $dbh->last_insert_id("", "", "", "");
+ $sth->finish();
+ if ($array_id == 0) {
+ $array_id = $dbh->selectrow_array("SELECT array_id FROM array WHERE array_name = '$array_name' AND species_name = 'Homo sapiens'");
+ }
+
+ $sth = $dbh->prepare("INSERT OR IGNORE INTO assembly (species_name, assembly_name) VALUES (?, ?)");
+ $sth->execute($species_name, "GRCh37");
+ $sth->execute($species_name, "NCBI36");
+ $sth->finish();
+ my $human36_assembly_id = $dbh->selectrow_array("SELECT assembly_id FROM assembly WHERE species_name = 'Homo sapiens' AND assembly_name = 'NCBI36'");
+ my $human37_assembly_id = $dbh->selectrow_array("SELECT assembly_id FROM assembly WHERE species_name = 'Homo sapiens' AND assembly_name = 'GRCh37'");
+
+ $sth = $dbh->prepare("INSERT OR IGNORE INTO probe_mapping_info (array_id, assembly_id, url) VALUES (?, ?, ?)");
+ $sth->execute($array_id, $human36_assembly_id, "");
+ my $probe_mapping_human36_id = $dbh->last_insert_id("", "", "", "");
+ $sth->execute($array_id, $human37_assembly_id, "");
+ my $probe_mapping_human37_id = $dbh->last_insert_id("", "", "", "");
+ $sth->finish();
+
+ $sth = $dbh->prepare("INSERT OR IGNORE INTO probe_annotation_info (array_id, gene_reference_name, cgi_reference_name, url) VALUES (?, ?, ?, ?)");
+ $sth->execute($array_id, "RefSeq", "UCSC CpG Islands", "");
+# my $probe_annotation_id = $dbh->last_insert_id("", "", "", "");
+ $sth->finish();
+
+ open(CSV1, ">probe_mapping.csv") or die;
+ open(CSV2, ">probe_annotation.csv") or die;
+# my $sth1 = $dbh->prepare("INSERT OR IGNORE INTO probe_mapping (probe_mapping_id, probe_id, chr_name, position) VALUES (?, ?, ?, ?)");
+# my $sth2 = $dbh->prepare("INSERT OR IGNORE INTO probe_annotation (probe_annotation_id, probe_id, gene_group, cgi_group) VALUES (?, ?, ?, ?)");
+ foreach my $this_probe_id (sort keys $array_info) {
+ my ($chr36, $loc36, $chr37, $loc37, $gene_group, $cgi_group) = @{$array_info->{$this_probe_id}};
+ print CSV1 join(",", $probe_mapping_human36_id, $this_probe_id, "chr$chr36", $loc36), "\n";
+ print CSV1 join(",", $probe_mapping_human37_id, $this_probe_id, "chr$chr37", $loc37), "\n";
+# print CSV2 join(",", $probe_annotation_id, $array_id, $this_probe_id, $gene_group, $cgi_group), "\n";
+ print CSV2 join(",", $array_id, $this_probe_id, $gene_group, $cgi_group), "\n";
+# $sth1->execute($probe_mapping_human36_id, $this_probe_id, "chr$chr36", $loc36) if ($probe_mapping_human36_id);
+# $sth1->execute($probe_mapping_human37_id, $this_probe_id, "chr$chr37", $loc37) if ($probe_mapping_human37_id);
+# $sth2->execute($probe_annotation_id, $this_probe_id, $gene_group, $cgi_group) if ($probe_annotation_id);
+ }
+ close(CSV1);
+ close(CSV2);
+ system("echo '.mode csv
+.import probe_mapping.csv probe_mapping
+.import probe_annotation.csv probe_annotation' | sqlite3 $db_name");
+# $sth1->finish();
+# $sth2->finish();
+
+}
+
+sub load_proxy_filter {
+ my ($dbh, $db_name, $array_tag, $assembly_name, $work_dir, $distance_threshold) = @_;
+
+ my $array_id = $dbh->selectrow_arrayref("SELECT array_id FROM array WHERE array_tag = '$array_tag'")->[0];
+ my $bed_file = dump_array_bed_file($dbh, $array_id, $assembly_name, $work_dir);
+ my $this_output_bed_file = "$work_dir/array_${array_id}.proxy.bed";
+ my $runstr = "$bedtools window -w $distance_threshold -a $bed_file -b $bed_file > $this_output_bed_file";
+ system($runstr) == 0 or die "Error while running bedtools: $?";
+
+ my $sth = $dbh->prepare("INSERT OR IGNORE INTO proxy_filter_info (array_id, description) VALUES (?, ?)");
+ $sth->execute($array_id, "Distance-based: 1kb");
+
+ open(BED, $this_output_bed_file) or die;
+ my $all_probes;
+ my $mapping_probes;
+ while (<BED>) {
+ chomp;
+ my ($chr1, $start1, $end1, $probe1, $chr2, $start2, $end2, $probe2) = split("\t", $_);
+ $all_probes->{$probe1} = 1;
+ if ($probe1 ne $probe2) {
+ $mapping_probes->{$probe1}->{$probe2} = 1;
+ }
+ }
+ close(BED);
+ open(CSV, ">proxy_filter.csv") or die;
+ foreach my $probe (sort keys %$all_probes) {
+ if (defined($mapping_probes->{$probe})) {
+ print CSV "$array_id,$probe,", join("|", sort keys %{$mapping_probes->{$probe}}), "\n";
+ } else {
+ print CSV "$array_id,$probe,NONE\n";
+ }
+ }
+ close(CSV);
+ system("echo '.mode csv
+.import proxy_filter.csv proxy_filter' | sqlite3 $db_name");
+
+}
+
+sub dump_array_bed_file {
+ my ($dbh, $this_array_id, $assembly_name, $work_dir) = @_;
+ my $sql = "SELECT probe_mapping_id
+ FROM probe_mapping_info
+ JOIN assembly USING (assembly_id)
+ WHERE array_id = $this_array_id
+ AND assembly_name = '$assembly_name'";
+ my $probe_mapping_id = $dbh->selectrow_array($sql);
+ if (!$probe_mapping_id) {
+ die "Cannot find a mapping for array $this_array_id and assembly $assembly_name\n";
+ }
+
+ $sql = "SELECT probe_id, chr_name, position FROM probe_mapping WHERE probe_mapping_id = $probe_mapping_id";
+ my $probes = $dbh->selectall_arrayref($sql);
+
+ my $bed_file = "$work_dir/array_${this_array_id}.bed";
+ open(BED, ">$bed_file") or die;
+ foreach my $this_probe (sort _sort_probes @$probes) {
+ print BED join("\t", $this_probe->[1], $this_probe->[2], $this_probe->[2]+1, $this_probe->[0]), "\n";
+ }
+ close(BED);
+
+ return $bed_file;
+}
+
+sub _sort_probes {
+ my $chr_a = $a->[1];
+ my $chr_b = $b->[1];
+ my $loc_a = $a->[2];
+ my $loc_b = $b->[2];
+ $chr_a =~ s/chr//;
+ $chr_b =~ s/chr//;
+ if ($chr_a eq $chr_b) {
+ return $loc_a <=> $loc_b;
+ } elsif ($chr_a =~ /^\d/ and $chr_b =~ /^\d/) {
+ return $chr_a <=> $chr_b;
+ } elsif ($chr_a =~ /^\d/) {
+ return -1;
+ } elsif ($chr_b =~ /^\d/) {
+ return 1;
+ } else {
+ return $chr_a cmp $chr_b;
+ }
+}
+
+
+exit();

0 comments on commit e96ba5b

Please sign in to comment.