# Notebook 1: Download raw genomic data

In [1]:
# conda install ipyrad -c ipyrad

In [9]:
import gzip
import os
import shutil

import ipyrad.analysis as ipa
import requests

### Organize directory

In [3]:
# Directory for storing raw data, resolved to an absolute path one level
# above the notebook's working directory.
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
rawdir = os.path.realpath(os.path.join("../", "rawdata"))
os.makedirs(rawdir, exist_ok=True)

# Subdirectory of rawdir for storing the RAD data.
raddir = os.path.realpath(os.path.join(rawdir, "radseq"))
os.makedirs(raddir, exist_ok=True)

### Download the *Quercus robur* genome

The oak genome can be downloaded from [http://www.oakgenome.fr/?page_id=587](http://www.oakgenome.fr/?page_id=587). At the time of this analysis, the most recent version of the *Q. robur* genome is "Oak genome assembly V2_2N". We use "Qrob_PM1N", the version of this assembly that has been scaffolded into chromosomes.

In [4]:
# location of the gzipped fasta file of the genome
url = "https://urgi.versailles.inra.fr/download/oak/Qrob_PM1N.fa.gz"

# the local copy keeps the remote file name (last "/"-separated piece of
# the url) and is stored inside rawdir
ref_filename = url.rsplit("/", 1)[-1]
refpath = os.path.realpath(os.path.join(rawdir, ref_filename))

# show the resolved save location
refpath

'/home/deren/Documents/virentes-reference/rawdata/Qrob_PM1N.fa.gz'

In [5]:
# Download the reference only if it is not already on disk.
if not os.path.exists(refpath):

    # Stream into a temporary ".part" file so that an interrupted download is
    # never mistaken for a complete reference by the existence check above.
    partpath = refpath + ".part"

    # The context manager releases the connection when the block exits; the
    # timeout keeps the cell from hanging forever on an unresponsive server.
    with requests.get(url, stream=True, timeout=60) as res:

        # Fail loudly on an HTTP error (404, 500, ...) instead of saving the
        # error page as if it were the genome.
        res.raise_for_status()

        # write the response body to file 1Mb at a time
        with open(partpath, 'wb') as out:
            for chunk in res.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    out.write(chunk)

    # move the file into place only after the whole body has been written
    os.replace(partpath, refpath)

In [10]:
# Make a decompressed copy of the reference file next to the gzipped one.
# Only a trailing ".gz" is stripped, so a ".gz" occurring elsewhere in the
# path (e.g. in a directory name) does not truncate the output name.
rname = refpath[:-len(".gz")] if refpath.endswith(".gz") else refpath
if not os.path.exists(rname):

    # Write to a temporary ".part" file first so a failed decompression is
    # not picked up as a finished copy by the existence check above.
    partname = rname + ".part"

    # Copy in binary mode through a bounded buffer: the genome is never held
    # in memory as a whole, no decode/encode round-trip can alter the bytes,
    # and both file handles are closed when the block exits.
    with gzip.open(refpath, 'rb') as src, open(partname, 'wb') as out:
        shutil.copyfileobj(src, out)

    os.replace(partname, rname)

### Download demultiplexed RAD-seq data

Sequence data are from Eaton et al. 2015. The dataset includes *Quercus* ser. *Virentes* sequence reads, as well as outgroup samples. The accession number used for downloading the SRA files is 'SRP055977'.

In [9]:
# indices of the runinfo columns to show; the table printed below lists the
# columns they select (Run, spots, Experiment, Sample, BioSample, TaxID,
# ScientificName, SampleName)
runinfo_fields = (1, 4, 11, 25, 26, 28, 29, 30)

# sratools object for the project accession, working inside raddir
sra = ipa.sratools(accession="SRP055977", workdir=raddir)

# fetch the run metadata for the selected columns and display the table
df = sra.fetch_runinfo(fields=runinfo_fields)
df

Fetching project data...

Unnamed: 0,Run,spots,Experiment,Sample,BioSample,TaxID,ScientificName,SampleName
0,SRR1915524,4046890,SRX956496,SRS868426,SAMN03394519,168164,Quercus arizonica,AR_re
1,SRR1915538,931926,SRX956510,SRS874291,SAMN03394533,1628352,Quercus brandegeei,BJSB3_v
2,SRR1915525,5352627,SRX956497,SRS874263,SAMN03394520,1628352,Quercus brandegeei,BJSL25_re
3,SRR1915539,969575,SRX956511,SRS874290,SAMN03394534,1628352,Quercus brandegeei,BJSL25_v
4,SRR1915526,4715624,SRX956498,SRS874262,SAMN03394521,1628352,Quercus brandegeei,BJVL19_re
5,SRR1915540,817443,SRX956512,SRS874289,SAMN03394535,1628352,Quercus brandegeei,BJVL19_v
6,SRR1915541,849191,SRX956513,SRS868439,SAMN03394536,167433,Quercus oleoides,BZBB1_v
7,SRR1915527,4539385,SRX956499,SRS874261,SAMN03394522,97695,Quercus chrysolepis,CH_re
8,SRR1915528,3742953,SRX956500,SRS874260,SAMN03394523,167433,Quercus oleoides,CRL0001_re
9,SRR1915542,1012884,SRX956514,SRS874288,SAMN03394537,167433,Quercus oleoides,CRL0001_v


In [12]:
# connect to parallel client started with 'ipcluster start'
# (the cluster has to be started before running this cell; the resulting
# client object is passed to sra.run() for the download step)
import ipyparallel as ipp
ipyclient = ipp.Client()

In [13]:
# download the raw data files as fastq, using the connected parallel client.
# NOTE(review): name_fields=(30,1) joined with name_separator="_" presumably
# names each file "<SampleName>_<Run>" (runinfo fields 30 and 1 as listed in
# the table above) — confirm against the ipyrad sratools documentation.
sra.run(name_fields=(30,1), name_separator="_", ipyclient=ipyclient)

[####################] 100% 0:05:39 | Downloading fastq files |  |
43 fastq files downloaded to /home/deren/Documents/virentes-reference/rawdata/radseq
