In [1]:
import ipyrad as ip
import numpy as np
import pandas as pd

**Step 1 (assembly):** load the fastq data prepared in another notebook.

In [2]:
# Locations on moto: the input fastq files and the folder that receives results.
FASTQ_GLOB = "/moto/eaton/users/hnl2109/hipp-rad-files-2/*.fastq.gz"
PROJECT_DIR = "/moto/eaton/users/hnl2109/analysis-ipyrad"

# Create the assembly and point it at the pre-sorted fastq files.
hipp1 = ip.Assembly("hipp1")
hipp1.set_params("sorted_fastq_path", FASTQ_GLOB)

# Write all results into the project folder, then run step 1 (load reads).
hipp1.set_params("project_dir", PROJECT_DIR)
hipp1.run("1", force=True, auto=True)

New Assembly: hipp1
Parallel connection | t052: 24 cores
[####################] 100% 0:02:04 | loading reads        | s1 |
Parallel connection closed.


**Step 2 (filtering):** trim adapters and remove poor reads.

In [3]:
# Step 2 settings: quality-score offset and adapter-filtering level.
step2_params = {
    "phred_Qscore_offset": 43,
    "filter_adapters": 3,
}
for param_name, param_value in step2_params.items():
    setattr(hipp1.params, param_name, param_value)

# Run step 2 to remove adapters and low-quality sequence from the reads.
hipp1.run("2", force=True, auto=True)

Parallel connection | t052: 24 cores
[####################] 100% 0:37:55 | processing reads     | s2 |
Parallel connection closed.


In [12]:
# Number of rows (one per sample) in the assembly's stats table.
hipp1.stats.shape[0]

260

**Steps 3-7:** branching and assembling based on a reference sequence.

In [5]:
# Quercus robur genome, stored in the genome-file folder; used as the reference.
QROB_GENOME = "/moto/eaton/users/hnl2109/genome-file/Qrob_PM1N.fa"

# Branch hipp1 into a reference-based assembly. The branch is named "basic"
# because further branches with more parameter tweaking may be added later.
reference = hipp1.branch("robur_ref_basic")
reference.params.assembly_method = "reference"
reference.params.reference_sequence = QROB_GENOME

# Remaining parameter changes for this branch.
reference.params.max_SNPs_locus = 0.25
reference.params.mindepth_majrule = 1

# Display the full parameter set for review.
reference.params

0   assembly_name               robur_ref_basic                              
1   project_dir                 /moto/eaton/users/hnl2109/analysis-ipyrad    
2   raw_fastq_path                                                           
3   barcodes_path                                                            
4   sorted_fastq_path           /moto/eaton/users/hnl2109/hipp-rad-files-2/*.fastq.gz
5   assembly_method             reference                                    
6   reference_sequence          /moto/eaton/users/hnl2109/genome-file/Qrob_PM1N.fa
7   datatype                    rad                                          
8   restriction_overhang        ('TGCAG', '')                                
9   max_low_qual_bases          5                                            
10  phred_Qscore_offset         43                                           
11  mindepth_statistical        6                                            
12  mindepth_majrule            1                  

In [6]:
# Perform the assembly (steps 3 through 7), then display the per-sample stats.
assembly_steps = "34567"
reference.run(assembly_steps, force=True, auto=True)
reference.stats

Parallel connection | t052: 24 cores
[####################] 100% 0:00:06 | indexing reference   | s3 |
[####################] 100% 0:01:51 | dereplicating        | s3 |
[####################] 100% 0:18:45 | mapping reads        | s3 |
[####################] 100% 0:12:34 | building clusters    | s3 |
[####################] 100% 0:00:30 | calc cluster stats   | s3 |
[####################] 100% 0:03:46 | inferring [H, E]     | s4 |
[####################] 100% 0:00:28 | calculating depths   | s5 |
[####################] 100% 0:00:38 | chunking clusters    | s5 |
[####################] 100% 1:25:15 | consens calling      | s5 |
[####################] 100% 0:02:48 | indexing alleles     | s5 |
[####################] 100% 0:01:28 | concatenating bams   | s6 |
[####################] 100% 0:00:28 | fetching regions     | s6 |
[####################] 100% 0:02:02 | building database    | s6 |
Encountered an Error.
Message: ValueError: cannot copy sequence with size 150 to array axis with dimensio

Unnamed: 0,state,reads_raw,reads_passed_filter,refseq_mapped_reads,refseq_unmapped_reads,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
SRR1915525,5,5352627,5152665,4792028,360637,64721,64721,0.006416,0.000623,62796
SRR1915528,5,3742953,3599891,3192825,407066,65692,65692,0.008958,0.000809,63350
SRR1915533,5,3219505,3058655,1889915,1168740,52946,52946,0.006185,0.000676,51679
SRR1915534,5,2826213,2738381,2518537,219844,58846,58846,0.006469,0.00069,57293
SRR1915539,5,969575,941958,879955,62003,51733,51733,0.005982,0.000532,50757
SRR1915542,5,1012884,980710,857401,123309,49028,49028,0.005274,0.000496,48120
SRR1915547,5,1130911,1098457,1018469,79988,53063,53063,0.007035,0.000557,51913
SRR1915548,5,347949,337591,240913,96678,37598,37598,0.006283,0.000581,37230
SRR1915549,5,975393,948652,877906,70746,50991,50991,0.006512,0.000571,50002
SRR1915552,5,905151,879511,827704,51807,55595,55595,0.008612,0.000658,54347


In [3]:
# Reload the saved "robur_ref_basic" assembly from its JSON file so this cell
# works on a fresh kernel; otherwise `load` is only defined by a cell that
# appears further down the notebook and this cell raises a NameError.
load = ip.load_json("/moto/eaton/users/hnl2109/analysis-ipyrad/robur_ref_basic.json")

# Re-run steps 6 and 7 on the reloaded assembly, then display per-sample stats.
load.run("67", auto=True, force=True)
load.stats

Parallel connection | t103: 24 cores
[####################] 100% 0:01:31 | concatenating bams   | s6 |
[####################] 100% 0:00:32 | fetching regions     | s6 |
[####################] 100% 0:01:19 | building database    | s6 |
[####################] 100% 0:01:14 | applying filters     | s7 |
[####################] 100% 0:23:12 | building arrays      | s7 |
[####################] 100% 0:10:38 | writing conversions  | s7 |
Parallel connection closed.


Unnamed: 0,state,reads_raw,reads_passed_filter,refseq_mapped_reads,refseq_unmapped_reads,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
SRR1915525,6,5352627,5152665,4792028,360637,64721,64721,0.006416,0.000623,62796
SRR1915528,6,3742953,3599891,3192825,407066,65692,65692,0.008958,0.000809,63350
SRR1915533,6,3219505,3058655,1889915,1168740,52946,52946,0.006185,0.000676,51679
SRR1915534,6,2826213,2738381,2518537,219844,58846,58846,0.006469,0.00069,57293
SRR1915539,6,969575,941958,879955,62003,51733,51733,0.005982,0.000532,50757
SRR1915542,6,1012884,980710,857401,123309,49028,49028,0.005274,0.000496,48120
SRR1915547,6,1130911,1098457,1018469,79988,53063,53063,0.007035,0.000557,51913
SRR1915548,6,347949,337591,240913,96678,37598,37598,0.006283,0.000581,37230
SRR1915549,6,975393,948652,877906,70746,50991,50991,0.006512,0.000571,50002
SRR1915552,6,905151,879511,827704,51807,55595,55595,0.008612,0.000658,54347


In [2]:
# Reload the saved "robur_ref_basic" assembly from its JSON file.
assembly_json = "/moto/eaton/users/hnl2109/analysis-ipyrad/robur_ref_basic.json"
load = ip.load_json(assembly_json)

loading Assembly: robur_ref_basic
from saved path: /moto/eaton/users/hnl2109/analysis-ipyrad/robur_ref_basic.json


In [4]:
# Display the per-sample stats table of the reloaded assembly.
load.stats

Unnamed: 0,state,reads_raw,reads_passed_filter,refseq_mapped_reads,refseq_unmapped_reads,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
SRR1915525,6,5352627,5152665,4792028,360637,64721,64721,0.006416,0.000623,62796
SRR1915528,6,3742953,3599891,3192825,407066,65692,65692,0.008958,0.000809,63350
SRR1915533,6,3219505,3058655,1889915,1168740,52946,52946,0.006185,0.000676,51679
SRR1915534,6,2826213,2738381,2518537,219844,58846,58846,0.006469,0.00069,57293
SRR1915539,6,969575,941958,879955,62003,51733,51733,0.005982,0.000532,50757
SRR1915542,6,1012884,980710,857401,123309,49028,49028,0.005274,0.000496,48120
SRR1915547,6,1130911,1098457,1018469,79988,53063,53063,0.007035,0.000557,51913
SRR1915548,6,347949,337591,240913,96678,37598,37598,0.006283,0.000581,37230
SRR1915549,6,975393,948652,877906,70746,50991,50991,0.006512,0.000571,50002
SRR1915552,6,905151,879511,827704,51807,55595,55595,0.008612,0.000658,54347


In [2]:
# Load the saved "robur_ref_basic" assembly again (same JSON file as before).
assembly_json = "/moto/eaton/users/hnl2109/analysis-ipyrad/robur_ref_basic.json"
load = ip.load_json(assembly_json)

loading Assembly: robur_ref_basic
from saved path: /moto/eaton/users/hnl2109/analysis-ipyrad/robur_ref_basic.json


In [4]:
# Extend the output formats so that a nexus file is also written, for potential
# use in mrbayes.
output_formats = ['p', 's', 'l', 'n']
load.params.output_formats = output_formats

# Display the parameter set for review.
load.params

0   assembly_name               robur_ref_basic                              
1   project_dir                 /moto/eaton/users/hnl2109/analysis-ipyrad    
2   raw_fastq_path                                                           
3   barcodes_path                                                            
4   sorted_fastq_path           /moto/eaton/users/hnl2109/hipp-rad-files-2/*.fastq.gz
5   assembly_method             reference                                    
6   reference_sequence          /moto/eaton/users/hnl2109/genome-file/Qrob_PM1N.fa
7   datatype                    rad                                          
8   restriction_overhang        ('TGCAG', '')                                
9   max_low_qual_bases          5                                            
10  phred_Qscore_offset         43                                           
11  mindepth_statistical        6                                            
12  mindepth_majrule            1                  

In [5]:
# Re-run step 7 on the reloaded assembly, then display the per-sample stats.
final_step = "7"
load.run(final_step, force=True, auto=True)
load.stats

Parallel connection | t108: 24 cores
[####################] 100% 0:00:43 | applying filters     | s7 |
[####################] 100% 0:23:02 | building arrays      | s7 |
[####################] 100% 0:10:57 | writing conversions  | s7 |
Parallel connection closed.


Unnamed: 0,state,reads_raw,reads_passed_filter,refseq_mapped_reads,refseq_unmapped_reads,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
SRR1915525,6,5352627,5152665,4792028,360637,64721,64721,0.006416,0.000623,62796
SRR1915528,6,3742953,3599891,3192825,407066,65692,65692,0.008958,0.000809,63350
SRR1915533,6,3219505,3058655,1889915,1168740,52946,52946,0.006185,0.000676,51679
SRR1915534,6,2826213,2738381,2518537,219844,58846,58846,0.006469,0.00069,57293
SRR1915539,6,969575,941958,879955,62003,51733,51733,0.005982,0.000532,50757
SRR1915542,6,1012884,980710,857401,123309,49028,49028,0.005274,0.000496,48120
SRR1915547,6,1130911,1098457,1018469,79988,53063,53063,0.007035,0.000557,51913
SRR1915548,6,347949,337591,240913,96678,37598,37598,0.006283,0.000581,37230
SRR1915549,6,975393,948652,877906,70746,50991,50991,0.006512,0.000571,50002
SRR1915552,6,905151,879511,827704,51807,55595,55595,0.008612,0.000658,54347
