In [257]:
import sys
import itertools

import pandas 
from scipy.stats import kendalltau


def correlate_between_cell_types(A, B):
    tau, pvalue = kendalltau(A.df[A.df.columns[0]], 
                                  B.df[B.df.columns[0]])
    return tau, pvalue


class Microassay():
    def __str__(self):
        retstr = "Cell Line: {}".format(self.cell_line)
        ndelim = len(retstr)
        retstr = "="*ndelim + "\n" + retstr + "\n" + "="*ndelim
        retstr += "\nGene Accessions include:\n"
        retstr += "\t{0}...{1}\n".format(self.uniq_gene_accession[:2], self.uniq_gene_accession[-2:])
        retstr += "A. How many distinct Gene Accession Numbers are represented in the data set?\n"
        retstr += "\t{}\n".format(self.uniq_gene_accession_cnt)
        retstr += "Available Times: \n\t{}\n".format(self.target_columns)
        retstr += "B. Which two time points are the most highly correlated?\n"
        retstr += "\t{} ".format(self.max_correlation_pair)
        retstr += "\tKendall's Tau: {}; p-value: {}\n".format(self.max_correlation_tau,
                                                        self.max_correlation_pvalue)
        retstr += "\tAbs Sum of differences: {}\n".format(self.min_difsum)
        retstr += "="*ndelim
        retstr += "\n"
        return retstr

    
    def __repr__(self):
        print "...repr goes here..."

        
    def __init__(self, 
                 infile="data_set_HL60_U937_NB4_Jurkat.txt",
                 cell_line="HL60"):
        
        self.infile = infile
        self.cell_line = cell_line 
        
        # peek at the first 10 lines of the file to find the columns of interest
        col_idx_names = self.__interrogate_file_for_cols()
        colnames = [name for idx, name in col_idx_names]
        colindexes = [idx for idx, name in col_idx_names]
        
        # read the dataframe
        self.df = pandas.read_csv(self.infile,
                                  index_col=[0,1],
                                  usecols=[0,1]+colindexes,
                                  #names=colnames,
                                  sep='\t')
        self.correlate_times()


    def correlate_times(self):
        target_columns = [aname for aname in self.df.columns if "call" not in aname]
        self.target_columns = target_columns
        combos = itertools.combinations(target_columns, 2)
        self.corr_est = {}
        for combo in combos:
            # ToDo: filter on calls
            tau, pvalue = kendalltau(list(self.df[combo[0]].values), list(self.df[combo[1]].values)) # scipy FTW!
            diffsum = sum(abs(self.df[combo[0]] - self.df[combo[1]])) # n differences
            self.corr_est[combo] = {"tau": tau,
                                    "pvalue": pvalue,
                                    "diffsum": diffsum}
            if not hasattr(self, "max_correlation") or self.max_correlation < kt[0].correlation:
                self.max_correlation_tau = tau
                self.max_correlation_pvalue = pvalue
                self.max_correlation_pair = combo
                self.min_difsum = diffsum # FixMe: memoize and check?
            
            
        
    def __interrogate_file_for_cols(self, col_time_delim="_"):
        # peek at the data to get Gene Accession counts    
        # A. How many distinct Gene Accession Numbers are represented in the data set?
        self.df = pandas.read_csv(self.infile,
                                  usecols=[0,1],
                                  sep='\t')
        self.uniq_gene_accession = sorted(list(set(self.df["Gene Accession Number"])))
        self.uniq_gene_accession_cnt = len(self.uniq_gene_accession)
        
        # peek at the data to get column info
        self.df = pandas.read_csv(self.infile,
                                 index_col=[0,1],
                                 nrows=10,
                                 sep='\t')
        #print df.columns
        col_idx_name = []

        # find columns that include the cell line
        cell_name_in_col = [self.cell_line in acol for acol in list(self.df.columns)]
        assert len(self.df.columns) == len(cell_name_in_col)
        
        # get indexes for the found columns
        data_idxs = [i for i,_ in enumerate(cell_name_in_col) if _]
        
        col_name_idx = []
        data_names = []
        time_units = [] # sanity check
        for idxi in data_idxs:
            try:
                assert self.cell_line in self.df.columns[idxi].split(col_time_delim)[0]
            except:
                print "Encountered unexpected column delimiting"
                
            # extract names, remove superfluous cell_line and units from string
            extracted_name = self.df.columns[idxi].split(col_time_delim)[1]
            col_idx_name.append( (idxi, extracted_name) )     
            
            # next add "call" columns that follow an included column                         
            if len(self.df.columns) >= idxi+1 and "call" in self.df.columns[idxi+1]:
                col_idx_name.append((idxi+1, extracted_name + "_call"))
            # sanity check units     
            time_units.append(self.df.columns[idxi].split(col_time_delim)[2])
        try:
            assert len(set(time_units)) == 1 # sanity check
            self.time_unit = time_units[0] # add the units to the Microassay instance
        except:
            print "Encountered multiple time units in columns"

        return col_idx_name

if __name__ == "__main__":
    ma = {}
    cell_types = ["HL60", "Jurkat", "U937", "NB4"]
    for cell_type in cell_types:
        ma[cell_type] = Microassay("data_set_HL60_U937_NB4_Jurkat.txt", cell_type)
        print ma[cell_type]
        
    combos = itertools.combinations(cell_types, 2)
    cell_compare = {}
    for combo in combos:
        tau, pvalue = correlate_between_cell_types(ma[combo[0]], ma[combo[1]])
        if not hasattr(cell_compare, "max_tau") or tau > cell_compare["max_tau"]:
            cell_compare["max_tau"] = tau
            cell_compare["pvalue"] = pvalue
            cell_compare["combo"] = combo
    print cell_compare

Cell Line: HL60
Gene Accessions include:
	['BioB', 'BioC']...['Z50788_f', 'cre']
A. How many distinct Gene Accession Numbers are represented in the data set?
	7229
Available Times: 
	['HL60_0_hrs', 'HL60_0.5_hrs', 'HL60_4_hrs']
B. Which two time points are the most highly correlated?
	('HL60_0.5_hrs', 'HL60_4_hrs') 	Kendall's Tau: 0.663967741532; p-value: 0.0
	Abs Sum of differences: 280502

Cell Line: Jurkat
Gene Accessions include:
	['BioB', 'BioC']...['Z50788_f', 'cre']
A. How many distinct Gene Accession Numbers are represented in the data set?
	7229
Available Times: 
	['NB4_72_hrs', 'Jurkat_0_hrs', 'Jurkat_0.5_hrs', 'Jurkat_4_hrs']
B. Which two time points are the most highly correlated?
	('Jurkat_0.5_hrs', 'Jurkat_4_hrs') 	Kendall's Tau: 0.725199056207; p-value: 0.0
	Abs Sum of differences: 155756

Cell Line: U937
Gene Accessions include:
	['BioB', 'BioC']...['Z50788_f', 'cre']
A. How many distinct Gene Accession Numbers are represented in the data set?
	7229
Available Times: 
	[