## NCBI SRA data for *M. bovis* (Dec 2021)

<div class="alert alert-block alert-info"><b>Run this once to get the data.</b> </div>

In [2]:
# Load the NCBI SRA run table and keep only the rows wanted downstream.
# The raw table stays in `sra_raw`, so this cell can be re-run on its own
# and always produces the same `sra`.
MAX_BYTES = 7e8  # rows whose 'Bytes' value is at or above this are dropped

sra_raw = pd.read_csv('mbovis_sra.csv')

#apply filters
# na=False: a missing Organism/strain/Instrument counts as "no match" instead
# of putting NaN into the mask (which cannot be negated with ~).
is_bcg = (sra_raw.Organism.str.contains('BCG', na=False)
          | sra_raw.strain.str.contains('BCG', na=False))
is_long_read = ((sra_raw.Instrument == 'MinION')
                | sra_raw.Instrument.str.contains('PacBio', na=False))
keep = (
    ~is_bcg
    & (sra_raw['Assay Type'] == 'WGS')
    & (sra_raw.geo_loc_name_country != 'Ireland')
    & ~is_long_read
    & (sra_raw.LibraryLayout == 'PAIRED')
    & (sra_raw.Bytes < MAX_BYTES)
)
# .copy() makes `sra` an independent frame, so later in-place assignments
# (sra.loc[...] = ...) do not act on a slice of the raw table.
sra = sra_raw[keep].copy()
len(sra)

7492

In [3]:
# Set the country for the BioProjects below (BioProject accession -> country).
# Every row of a listed project is assigned the value, whatever it held before.
data = {
    'PRJEB32192': 'Ethiopia',
    'PRJEB19799': 'United Kingdom',
    'PRJEB5830': 'New Zealand',
    'PRJEB9025': 'United Kingdom',
}
for project, country in data.items():
    sra.loc[sra.BioProject == project, 'geo_loc_name_country'] = country


In [12]:
# Columns kept when the tables are written to CSV.
cols = [
    'Run',
    'BioProject',
    'geo_loc_name_country',
    'Center Name',
    'Instrument',
    'LibraryLayout',
    'ReleaseDate',
    'SRA Study',
    'Bytes',
    'Bases',
    'Collection_Date',
]

In [14]:
# Write the selected columns of the filtered table to disk (row index omitted).
sra.loc[:, cols].to_csv('mbovis_sra_filtered.csv', index=False)

In [None]:
# Count how many rows carry each 'Center Name' value, most frequent first.
sra['Center Name'].value_counts()

In [None]:
# Show all rows that belong to BioProject PRJEB9680.
sra[sra.BioProject=='PRJEB9680']

In [None]:
# Show every field of the first row as a quick look at the available columns.
sra.iloc[0]

In [None]:
# Count rows per 'Instrument' value, most frequent first.
sra.Instrument.value_counts()

In [15]:
# Which BioProjects do the rows with country 'uncalculated' come from?
x = sra.loc[sra['geo_loc_name_country'] == 'uncalculated']
x['BioProject'].value_counts()

PRJNA251692    551
PRJNA270004      1
PRJNA557054      1
Name: BioProject, dtype: int64

In [None]:
# Count rows per country value, most frequent first (NaN is not counted).
sra.geo_loc_name_country.value_counts()

In [517]:
# Distinct country values in order of first appearance, as a plain list.
countries = sra['geo_loc_name_country'].unique().tolist()

## Sub-sample countries with many entries (10 rows from each of the 13 most frequent countries)

In [40]:
# Build a geographically balanced subset of the table:
#   * countries outside the N_COMMON most frequent ones keep all their rows;
#   * each of the N_COMMON most frequent countries is cut down to a random
#     sample of N_PER_COUNTRY rows (fixed seed, so the draw is repeatable).
N_COMMON = 13        # number of top countries that are sub-sampled
N_PER_COUNTRY = 10   # rows drawn from each of those countries

# Computed once; value_counts() sorts by descending count and skips NaN.
country_counts = sra.geo_loc_name_country.value_counts()

# NOTE(review): the -1 end bound leaves out the last entry of the counts, so
# that country's rows end up in neither selection - confirm this is intended.
sub1 = country_counts[N_COMMON:-1]
selected1 = sra[sra.geo_loc_name_country.isin(sub1.index)]

common = country_counts[:N_COMMON]
s2 = sra[sra.geo_loc_name_country.isin(common.index)]
selected2 = s2.groupby('geo_loc_name_country').sample(N_PER_COUNTRY, random_state=2)

# Runs removed by hand from the final table.
omit = ['SRR7693877','SRR7693912','SRR9850830','SRR5486090','SRR5486092',
        'SRR7131025','SRR5817706','SRR6865435','SRR7240482',
        'ERR1815548','SRR5430748','SRR7983754','ERR4769552','SRR7983756',
        'ERR564455','SRR13986557','ERR4769489','ERR4769457','ERR4769538','ERR4769446',
        'ERR5311090','ERR4769520','SRR13986560','SRR13199734','SRR10993951',
        'SRR13986561','ERR4769458','SRR1657056','SRR13986562','SRR13986556','SRR13986559',
        'SRR13986563','SRR5486083','SRR13986568','SRR13986565','SRR5486080']

sampled = pd.concat([selected1, selected2])[cols]
sampled = sampled[~sampled.Run.isin(omit)]
sampled.to_csv('mbovis_sra_subsampled.csv', index=False)

# Row counts: uncommon-country rows, sub-sampled rows, final table after omissions.
len(selected1), len(selected2), len(sampled)


(133, 130, 233)

In [41]:
# Display the sub-sampled table.
sampled

Unnamed: 0,Run,BioProject,geo_loc_name_country,Center Name,Instrument,LibraryLayout,ReleaseDate,SRA Study,Bytes,Bases,Collection_Date
109,SRR14049334,PRJNA625091,Argentina,USDA-NVSL-DBPL,Illumina MiSeq,PAIRED,2021-03-23T00:00:00Z,SRP256140,98419521.0,182605631.0,Not collected
110,SRR14049335,PRJNA625091,Argentina,USDA-NVSL-DBPL,Illumina MiSeq,PAIRED,2021-03-23T00:00:00Z,SRP256140,124169237.0,207748938.0,Not collected
111,SRR14049336,PRJNA625091,Argentina,USDA-NVSL-DBPL,Illumina MiSeq,PAIRED,2021-03-23T00:00:00Z,SRP256140,186676713.0,308165161.0,Not collected
112,SRR14049337,PRJNA625091,Argentina,USDA-NVSL-DBPL,Illumina MiSeq,PAIRED,2021-03-23T00:00:00Z,SRP256140,179402310.0,288307814.0,Not collected
113,SRR14049338,PRJNA625091,Argentina,USDA-NVSL-DBPL,Illumina MiSeq,PAIRED,2021-03-23T00:00:00Z,SRP256140,168628075.0,255189442.0,Not collected
...,...,...,...,...,...,...,...,...,...,...,...
3774,SRR7240195,PRJNA251692,uncalculated,USDA-NVSL-DBL,Illumina MiSeq,PAIRED,2018-05-31T00:00:00Z,SRP053287,266300277.0,411048310.0,not collected
3737,SRR7240158,PRJNA251692,uncalculated,USDA-NVSL-DBL,Illumina MiSeq,PAIRED,2018-05-31T00:00:00Z,SRP053287,255987455.0,385225457.0,not collected
3764,SRR7240185,PRJNA251692,uncalculated,USDA-NVSL-DBL,Illumina MiSeq,PAIRED,2018-05-31T00:00:00Z,SRP053287,281343555.0,474307908.0,not collected
3991,SRR7240412,PRJNA251692,uncalculated,USDA-NVSL-DBL,Illumina MiSeq,PAIRED,2018-05-31T00:00:00Z,SRP053287,252710836.0,413287270.0,not collected


In [43]:
#remove samples not in table from folder
# Dry run: only prints the FASTQ files whose run accession is absent from
# `sampled`; uncomment the shutil.move line to actually relocate them.
keep_runs = set(sampled.Run)  # built once, so each file is a set lookup
files = glob.glob('/storage/btbgenie/mbovis_sra/*.fastq*')
for f in files:
    fname = os.path.basename(f)
    # The accession is everything before the first '.' or '_', e.g.
    # 'SRR5486080_1.fastq.gz' -> 'SRR5486080' and 'SRR5486080.fastq.gz' -> 'SRR5486080'.
    name = fname.split('.')[0].split('_')[0]
    if name not in keep_runs:
        print (name,fname)
        #shutil.move(f,os.path.join('/storage/btbgenie/mbovis_sra_other',fname))

SRR5486080 SRR5486080_1.fastq.gz
SRR13986565 SRR13986565_2.fastq.gz
SRR5486080 SRR5486080_2.fastq.gz
SRR13986568 SRR13986568_2.fastq.gz
SRR13986568 SRR13986568_1.fastq.gz
SRR13986565 SRR13986565_1.fastq.gz


## fetch data

In [None]:
# Download every run listed in `sampled` that has no FASTQ file on disk yet,
# then compress the downloaded files with pigz.
outdir = '/storage/btbgenie/mbovis_sra'
for run in sampled['Run']:
    # Match only this run's files ('<run>_N.fastq*' or '<run>.fastq*'). A bare
    # '<run>*' pattern would also match a longer accession that starts with
    # the same characters and make this run look already downloaded.
    existing = (glob.glob('%s/%s_*.fastq*' % (outdir, run))
                + glob.glob('%s/%s.fastq*' % (outdir, run)))
    if len(existing)>0:
        continue
    # Argument list, no shell: the accession is passed to the tool verbatim.
    cmd = ['fastq-dump', '--split-files', '-O', outdir, run]
    print (run)
    print (' '.join(cmd))
    # Raises CalledProcessError if fastq-dump exits with a non-zero status.
    subprocess.check_output(cmd)
    # Expand the wildcard here, since no shell is involved any more.
    fastqs = (glob.glob('%s/%s_*.fastq' % (outdir, run))
              + glob.glob('%s/%s.fastq' % (outdir, run)))
    if fastqs:
        subprocess.check_output(['pigz'] + fastqs)