In [1]:
%matplotlib inline
import glob
import os
import sys  # Versioning

import pandas as pd
from pandas import DataFrame, read_csv

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 26 days



In [2]:
# Report the interpreter and pandas versions this notebook was run with
for banner in ('Python version ' + sys.version, 'Pandas version ' + pd.__version__):
    print(banner)

Python version 3.5.1 |Anaconda 2.5.0 (64-bit)| (default, Dec  7 2015, 11:16:01) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Pandas version 0.18.0


In [3]:
# Parameters
# Keywords to look for; each one is lower-cased here so it can be compared
# against lower-cased text later on.
search_words = [
    keyword.lower()
    for keyword in (
        'non-coding',
        'intergenic',
        'intron',
        'exon',
        'promoter-TSS',
        'TTS',
        "5' UTR",
        "3' UTR",
        'p53',
        'rpr',
        'corp',
    )
]

# Gene name to search for
search_gene = 'CDKN1A'

In [4]:
# Grab all files from data folder.
# glob returns matches in no guaranteed order, so sort them to keep the order
# of the loaded tables (and of everything derived from it) stable between runs.
files = sorted(glob.glob('data/*.anno'))

genes = []
for path in files:
    table = read_csv(path, sep='\t')
    # Fix the PeakID label b/c it had 'junk' in it.
    # Assign a fresh list of labels instead of writing into columns.values:
    # that mutates the Index's backing array behind pandas' back.
    table.columns = ['PeakID'] + list(table.columns[1:])
    genes.append(table)

In [5]:
# Drop the directory part (e.g. 'data/') from the filenames.
# os.path.basename handles the platform's path separator; splitting on '/' and
# taking element 1 raised IndexError for paths that are not '/'-separated.
files = [os.path.basename(path) for path in files]

In [6]:
# Drop all data beyond 'Gene Type'
# A .loc label slice includes its end label, so the 'Gene Type' column itself is kept.
# (.ix, used here previously, is deprecated and no longer available in later pandas releases.)
genes = [gene.loc[:, :'Gene Type'] for gene in genes]

In [7]:
# Main DataFrame for the script: every per-file table stacked into one frame.
# The outer index level comes from `files` (via keys=), the inner level is
# each table's existing row index.
genes_data = pd.concat(objs=genes, keys=files)

In [8]:
# Count, file by file, how often each distinct 'Gene Name' value occurs
gene_names_results = (
    genes_data
    .groupby(level=0)['Gene Name']
    .value_counts()
)

# Label the Series and both index levels so the written file gets readable headers
gene_names_results.name = 'Counts'
gene_names_results.index = gene_names_results.index.set_names(['File Name', 'Gene Name'])

In [9]:
# Keep the first five count rows of every file's group and write them,
# tab-separated and with a header row, to gene_name_counts.tsv
top_gene_counts = gene_names_results.groupby(level=0).head(5)
top_gene_counts.to_csv('gene_name_counts.tsv', sep='\t', header=True)

In [10]:
def _first_token(value):
    """Return the first whitespace-separated token of str(value).

    Returns '' when str(value) is empty or only whitespace; indexing the
    empty split result directly would raise IndexError in that case.
    """
    tokens = str(value).split()
    return tokens[0] if tokens else ''


# Drop extra data in 'Annotation' column (keep only the first token).
# Values are passed through str() first, so non-string cells are stringified as before.
genes_data['Annotation'] = genes_data['Annotation'].apply(_first_token)

In [11]:
# Tally every distinct 'Annotation' value separately for each file
gene_names = genes_data.groupby(level=0)['Annotation'].value_counts()

# Pivot the file level into columns. An annotation missing from a file comes out
# as NaN, so replace those with 0 and cast back to int before writing the table.
annotation_table = gene_names.unstack(level=0).fillna(0).astype(int)
annotation_table.to_csv('annotation_results.tsv', sep='\t')

In [12]:
# Table contains all of the rows whose 'Gene Name' contains search_gene (substring match).
# regex=False makes the match literal: by default str.contains treats its argument as a
# regular expression, so a gene symbol containing characters such as '(', ')' or '.'
# would be misread as a pattern (or raise) instead of being matched as written.
# na=False counts rows with a missing 'Gene Name' as non-matching.
gene_search_results = genes_data[genes_data['Gene Name'].str.contains(search_gene, na=False, regex=False)]

# Rename header for easier processing
gene_search_results.index = gene_search_results.index.rename(['File Name', 'Row Number'])

In [None]:
# Write the matching rows to a tab-separated file named after the searched gene
search_results_path = 'search_{}_results.tsv'.format(search_gene)
gene_search_results.to_csv(search_results_path, sep='\t')

In [None]:
# Takes the list of search_words and gives a table of the hits and files
# *** This takes the longest ***

# Stringify and lower-case every cell once up front. The result is the same for
# every search word, so doing it inside the per-word pass repeated the most
# expensive part of the work len(search_words) times.
# NOTE(review): recent pandas releases name this method DataFrame.map - confirm
# against the pandas version in use.
lowered_cells = genes_data.applymap(lambda cell: str(cell).lower())

# Create a boolean mask based on whether the cell contains the keyword then 'collapse' them to a single column
# (a row is True when any of its cells contains the word)
hits_mask = [lowered_cells.applymap(lambda text, word=word: word in text).any(axis='columns') for word in search_words]

# Apply the mask on the data table and get the size of the results as the count
# .count().iloc[:, 0] is a hack for .size()
# .size() causes weird issues
# NOTE(review): count() only counts non-null values, so this equals the group size
# only if the first column has no missing values - confirm.
hits_results = [genes_data[mask].groupby(level=0).count().iloc[:, 0] for mask in hits_mask]

# Group the data by search_words and fill in NaN as 0
# (one row per search word; a word with no hits in a file is NaN before the fill)
search_data = pd.DataFrame(hits_results, index=search_words).fillna(0)

# Rename header for easier processing
search_data.index = search_data.index.rename('Search Word')

In [None]:
# Cast the hit counts to int and write them, tab-separated, to search_results.tsv
search_counts = search_data.astype(int)
search_counts.to_csv('search_results.tsv', sep='\t')