In [1]:
import glob
import os
#Versioning
import sys

import pandas as pd
from pandas import DataFrame, read_csv

In [2]:
# Report the interpreter and pandas versions this notebook was run with,
# one per line.
print('Python version ' + sys.version,
      'Pandas version ' + pd.__version__,
      sep='\n')

Python version 3.5.1 (default, Mar  3 2016, 09:29:07) 
[GCC 5.3.0]
Pandas version 0.18.0


In [3]:
# Parameters
# Annotation labels of interest. Each label is lower-cased and cut down to
# its first whitespace-separated token (e.g. '5\' UTR' becomes "5'").
search_words = [
    label.lower().split()[0]
    for label in (
        'non-coding',
        'intergenic',
        'intron',
        'exon',
        'promoter-TSS',
        'TTS',
        '5\' UTR',
        '3\' UTR',
        'p53',
        'rpr',
        'corp',
    )
]

# Gene name looked up in the 'Gene Name' column
search_gene = 'CG40228'

# True: silently drop duplicated rows; False: raise if any are found
drop_dups = False

In [4]:
def load_annotation(path):
    """Read one tab-separated annotation file and label its first column 'PeakID'.

    The first header cell of these files carries extra 'junk' text, so the
    column is renamed. ``rename`` builds a fresh column index instead of
    writing into ``columns.values``: an Index is meant to be immutable, and
    mutating its backing array can leave label lookups stale or fail outright
    when that array is read-only.

    Parameters
    ----------
    path : str
        Path of the file to read.

    Returns
    -------
    DataFrame
        The file contents with the first column named 'PeakID'.
    """
    table = read_csv(path, sep='\t')
    return table.rename(columns={table.columns[0]: 'PeakID'})


# Grab all files from data folder.
# glob returns paths in arbitrary order, so sort them to keep the row order
# of the combined data (and of the written results) reproducible.
files = sorted(glob.glob('data/*.anno'))
genes = [load_annotation(f) for f in files]

In [5]:
# Drop the directory part (e.g. 'data/') from the filenames.
# os.path.basename handles the platform's path separator and nested folders,
# and leaves an already-bare filename unchanged, so re-running this cell is safe.
files = [os.path.basename(path) for path in files]

In [6]:
# Drop all data beyond 'Gene Type': keep every row and the columns from the
# first one up to and including 'Gene Type' (label slices are inclusive).
# .loc replaces the deprecated .ix indexer, which newer pandas no longer has.
genes = [gene.loc[:, :'Gene Type'] for gene in genes]

In [7]:
# Main DataFrame of the script: every file's table stacked into one frame.
# The outer index level is the filename (taken from `files`), the inner
# level is the row's index within its own file.
genes_data = pd.concat(objs=genes, keys=files)

In [8]:
# Duplicate handling, controlled by drop_dups.
# Rows are compared across the whole combined frame, not per file.
if not drop_dups:
    # Group the per-row duplicate flags by file so the error can report
    # which files contain duplicated rows.
    dup_mask = genes_data.duplicated().groupby(level=0)
    if dup_mask.any().any():
        raise AssertionError('Duplicated rows!', dup_mask.any())
else:
    # Keep only the first occurrence of each duplicated row
    genes_data = genes_data.drop_duplicates()

In [9]:
# Count, per file, how often each distinct 'Gene Name' occurs
gene_names_results = genes_data['Gene Name'].groupby(level=0).value_counts()

# Label the counts and both index levels for easier processing
gene_names_results.name = 'Counts'
gene_names_results.index = gene_names_results.index.set_names(['File Name', 'Gene Name'])

In [10]:
# Write the first five gene-name counts of every file to gene_name_counts.tsv
# (tab-separated, with a header row).
(gene_names_results
 .groupby(level=0)
 .head(5)
 .to_csv('gene_name_counts.tsv', sep='\t', header=True))

In [11]:
# Keep only the first whitespace-separated token of each 'Annotation' value.
# Values go through str() first, so a missing annotation becomes 'nan'.
genes_data['Annotation'] = genes_data['Annotation'].map(
    lambda annotation: str(annotation).split()[0]
)

In [12]:
# Tally how often each annotation occurs within every file
gene_names = genes_data['Annotation'].groupby(level=0).value_counts()

# One row per annotation, one column per file. Combinations that never occur
# come out of unstack as NaN, so fill them with zero and cast the counts back
# to int before writing the table.
(gene_names
 .unstack(level=0)
 .fillna(0)
 .astype(int)
 .to_csv('annotation_results.tsv', sep='\t'))

In [13]:
# All rows whose 'Gene Name' contains search_gene; rows with a missing
# gene name never match (na=False).
gene_search_results = genes_data.loc[
    genes_data['Gene Name'].str.contains(search_gene, na=False)
]

# Label the two index levels for easier processing
gene_search_results.index = gene_search_results.index.set_names(['File Name', 'Row Number'])

In [14]:
# Write the matching rows to a tab-separated file named after the searched gene
gene_search_results.to_csv(
    path_or_buf='search_{}_results.tsv'.format(search_gene),
    sep='\t',
)

In [15]:
# Count all unique values in annotation, per file
search_data = genes_data.groupby(level=0)['Annotation'].value_counts()

# Name the index levels and the counts BEFORE flattening. Otherwise, on
# pandas versions that name the counts Series after the counted column,
# reset_index fails because the 'Annotation' level collides with the
# 'Annotation' values column.
search_data.index = search_data.index.rename(['Filename', 'Search Word'])
search_data.name = 'Hits'
search_data = search_data.reset_index()

# Change table to wanted output: one row per annotation, one column per file
search_data = search_data.pivot(index="Search Word", columns="Filename", values="Hits")

# Remove the row produced by missing annotations (stringified as 'nan').
# errors='ignore' keeps this from raising when no annotation is missing.
search_data = search_data.drop('nan', errors='ignore')

# Annotations absent from a file have no count there; treat them as zero
search_data = search_data.fillna(0)

In [16]:
# Write the per-file hit counts as whole numbers to search_results.tsv
search_data.astype(int).to_csv(
    path_or_buf='search_results.tsv',
    sep='\t',
)