In [1]:
import os 
import pandas as pd
import sys
import glob

In [2]:
#import the gene table, which has the NLRs for all accessions 
gene_table='/global/home/users/chandlersutherland/e16/Maize_NLRome_GeneTable.txt'

In [3]:
#glob all the gff3 files across all genomes 
gff3_files=glob.glob('/global/scratch/users/chandlersutherland/e16/*/genome/*1.gff3')

In [4]:
#define a function that takes a gff3 file path as input, and outputs an all gene bed file and an NLR specific bed file. 
def bed_generator(gff_filepath, gene_table):
    #read in and cleanup gff3 file 
    gff3=pd.read_csv(gff_filepath, sep='\t', skiprows=3, header=0, lineterminator='\n')
    gff3=gff3.reset_index()
    gff3=gff3.rename(columns={gff3.columns[0]:'chr', gff3.columns[1]:'source', gff3.columns[2]:'sequence_ontology', gff3.columns[3]:'start', gff3.columns[4]:'end', gff3.columns[5]:'score', gff3.columns[6]:'strand', gff3.columns[7]:'phase', gff3.columns[8]:'attributes'})

    #get metadata
    accession=gff_filepath.split('/')[6]
    basename=gff_filepath.split('/')[8].split('.')[0]
    print(accession+ ' gff3 file loaded')
    
    #filter gff3 to just genes, and expand attributes  
    genes = gff3[gff3['sequence_ontology'] == 'gene']
    attributes=genes.iloc[:,8].str.split(';',expand=True)
    genes.loc[:,'ID']=attributes.iloc[:,0].str.split('=', expand=True)[1]
    genes.loc[:,'biotype']=attributes.iloc[:,1].str.split('=', expand=True)[1]
    genes.loc[:,'logic_name']=attributes.iloc[:,2].str.split('=', expand=True)[1]
    
    #generate a bed file for all genes 
    bed = genes.loc[:,['chr', 'start', 'end', 'ID', 'strand']]
    bed = bed.rename(columns={'chr':'chrom', 'start':'chromStart', 'end':'chromEnd', 'ID':'name', 'strand':'strand'})
    bed.to_csv('/global/scratch/users/chandlersutherland/e16/' + accession + '/genome/' + basename + '_all_gene.bed', sep='\t')
    print('all gene bed written to '+'/global/scratch/users/chandlersutherland/e16/' + accession + '/genome/' + basename + '_all_gene.bed')
    
    #Load NLR gene names
    gene=pd.read_csv(gene_table,sep = '\t')
    nlr_df=gene[gene['Ecotype'] == accession.lower()]
    nlrs=nlr_df.loc[:,'Gene'].str.split('_', expand=True).iloc[:,0].str.replace('ZM', 'Zm').str.replace('AB', 'ab').unique()
    
    #quality stats 
    match=bed['name'].isin(nlrs).sum()
    unmatch=len(nlrs)-match 
    print(str(match)+' NLRs found in gff file for '+accession+'. We are missing '+str(unmatch)+'.')
    
    #write out NLR bed file
    nlr_bed=bed[bed['name'].isin(nlrs)]
    nlr_bed.to_csv('/global/scratch/users/chandlersutherland/e16/' + accession + '/genome/' + basename + '_NLR.bed', sep='\t')
    print('NLR bed written to '+'/global/scratch/users/chandlersutherland/e16/' + accession + '/genome/' + basename + '_NLR.bed')
    
    

In [10]:
for i in gff3_files:
    bed_generator(str(i), gene_table)

B97 gff3 file loaded


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


all gene bed written to /global/scratch/users/chandlersutherland/e16/B97/genome/Zm-B97-REFERENCE-NAM-1_all_gene.bed
125 NLRs found in gff file for B97. We are missing 0.
NLR bed written to /global/scratch/users/chandlersutherland/e16/B97/genome/Zm-B97-REFERENCE-NAM-1_NLR.bed
Oh43 gff3 file loaded
all gene bed written to /global/scratch/users/chandlersutherland/e16/Oh43/genome/Zm-Oh43-REFERENCE-NAM-1_all_gene.bed
131 NLRs found in gff file for Oh43. We are missing 0.
NLR bed written to /global/scratch/users/chandlersutherland/e16/Oh43/genome/Zm-Oh43-REFERENCE-NAM-1_NLR.bed
Tx303 gff3 file loaded
all gene bed written to /global/scratch/users/chandlersutherland/e16/Tx303/genome/Zm-Tx303-REFERENCE-NAM-1_all_gene.bed
121 NLRs found in gff file for Tx303. We are missing 0.
NLR bed written to /global/scratch/users/chandlersutherland/e16/Tx303/genome/Zm-Tx303-REFERENCE-NAM-1_NLR.bed
CML69 gff3 file loaded
all gene bed written to /global/scratch/users/chandlersutherland/e16/CML69/genome/Zm-CML6