In [22]:
import datetime
import itertools
import os
import time
#import multiprocessing
from multiprocessing import Process, current_process, cpu_count
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
from scipy import spatial
from scipy.stats.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import cosine_similarity

def chunks(l, n):
    """Lazily cut ``l`` into consecutive slices holding at most ``n`` items.

    Every slice except possibly the last one has exactly ``n`` items; the
    last slice carries whatever remains.
    """
    stop = len(l)
    for offset in range(0, stop, n):
        upper = offset + n
        yield l[offset:upper]
 
def pert_info(pert_split):
    """Extract the experimental conditions from a split perturbation label.

    Parameters
    ----------
    pert_split : sequence of str
        The parts of a column label after splitting on ":". Index 0 is the
        environment, written as ``<plate>_<cell line>_<hour>``; index 1 is
        the perturbation's general information (structure id, salt code,
        batch id, checksum); index 2 is the dose.

    Returns
    -------
    list of str
        ``[pert_plate, pert_cell_line, pert_hour, pert_dose]``.

    Raises
    ------
    IndexError
        If the environment has fewer than three "_"-separated fields or
        ``pert_split`` has fewer than three parts.
    """
    # Environment of the perturbation: plate, cell line and time point.
    pert_env_split = pert_split[0].split("_")

    pert_plate = pert_env_split[0]      # plate used for the perturbation
    pert_cell_line = pert_env_split[1]  # cell line the perturbation was applied to
    pert_hour = pert_env_split[2]       # time point of the measurement

    # pert_split[1] (the perturbation's id information) is intentionally not
    # parsed: it is not part of the conditions returned to the caller.

    # Dose of the perturbation.
    pert_dose = pert_split[2]

    return [pert_plate, pert_cell_line, pert_hour, pert_dose]

def _condition_columns(pert_frame):
    """Return ``(column label, conditions)`` for every perturbation column.

    Only labels that split into exactly three ":"-separated parts are kept;
    the conditions are whatever ``pert_info`` returns for those parts.
    """
    columns = []
    for label in pert_frame:
        pert_split = label.split(":")
        if len(pert_split) != 3:
            continue
        columns.append((label, pert_info(pert_split)))
    return columns


def _mean_and_max(values):
    """Return ``(mean, max)`` of ``values``, or ``(0.0, 0.0)`` when empty."""
    if len(values) == 0:
        return 0.0, 0.0
    return np.mean(values), np.max(values)


def pert_combination(process_index, pert_files_chunk):
    """Compare every file pair of one chunk and write the summary to disk.

    For each ``(origin, target)`` file pair, every perturbation column of the
    origin file is compared with every perturbation column of the target
    file. Columns whose conditions (as returned by ``pert_info``) are equal
    form a "condition pair"; for those the cosine similarity, Pearson r and
    Spearman rho between the two columns are computed.

    One row per file pair is written, tab-separated, to
    ``../results/pert_df_<process_index>.csv`` with the columns:
    origin file, target file, number of condition pairs, number of compared
    column pairs, their ratio, and the mean and max of each similarity.
    File pairs without any condition pair report 0 for all similarity columns.

    Parameters
    ----------
    process_index : int
        Index of this worker; used in the log lines and the output file name.
    pert_files_chunk : sequence of (str, str)
        File-name pairs to process. Must be non-empty, because the first and
        last entries are looked up for logging.

    Notes
    -----
    Reads the module-level names ``outdir`` (directory holding the input
    files) and ``pert_files_combinations`` (full list of pairs, used only to
    log the position of this chunk).
    """
    # list.index scans the whole combination list, so look the span up once
    # and reuse it for both the start and the end banner.
    chunk_span = "from %s to %s" % (pert_files_combinations.index(pert_files_chunk[0]),
                                    pert_files_combinations.index(pert_files_chunk[-1]))

    # Single-argument print calls produce the same text on Python 2 and 3.
    print("\n\nCurrent Process Index: %s" % (process_index,))
    print("Current Process Chunk Length: %s" % (len(pert_files_chunk),))
    print(chunk_span)
    print("Current Process Name: %s" % (current_process().name,))

    print("Time Started at: %s" % (str(datetime.datetime.now()),))
    time_start = time.time()

    pert_list = []
    pert_df_columns = ['file_origin', 'file_target', 'condition pair count', 'total pair count', '%',
                       'cosine(avg)', 'pearsonr(avg)', 'spearman(avg)', 'cosine(max)', 'pearsonr(max)', 'spearman(max)']

    for pert_file in pert_files_chunk:
        pert_file_origin = pert_file[0]
        pert_file_target = pert_file[1]

        pert_frame_origin = pd.read_csv(os.path.join(outdir, pert_file_origin))
        pert_frame_target = pd.read_csv(os.path.join(outdir, pert_file_target))

        # Parse the labels of each file once instead of re-parsing every
        # target label for every origin column.
        origin_columns = _condition_columns(pert_frame_origin)
        target_columns = _condition_columns(pert_frame_target)

        # Every valid origin column is compared with every valid target column.
        total_pair_count = len(origin_columns) * len(target_columns)
        condition_pair_count = 0

        # Start empty: a seeded 0 would be averaged into the mean and would
        # hide an all-negative maximum.
        cosine_value_list = []
        pearsonr_value_list = []
        spearmanr_value_list = []

        ####################
        # pairwise comparison between origin and target
        ####################
        for pert_origin, pert_conditions_origin in origin_columns:
            for pert_target, pert_conditions_target in target_columns:
                if pert_conditions_origin != pert_conditions_target:
                    continue

                condition_pair_count += 1

                X = pert_frame_origin[pert_origin].values
                Y = pert_frame_target[pert_target].values

                cosine_value_list.append(1 - spatial.distance.cosine(X, Y))
                pearsonr_value_list.append(pearsonr(X, Y)[0])
                spearmanr_value_list.append(spearmanr(X, Y)[0])

        # Guard the ratio: a file without any three-part label yields no pairs.
        if total_pair_count:
            pair_ratio = condition_pair_count / float(total_pair_count)
        else:
            pair_ratio = 0.0

        cosine_mean, cosine_max = _mean_and_max(cosine_value_list)
        pearsonr_mean, pearsonr_max = _mean_and_max(pearsonr_value_list)
        spearmanr_mean, spearmanr_max = _mean_and_max(spearmanr_value_list)

        pert_list.append([pert_file_origin, pert_file_target, condition_pair_count, total_pair_count, pair_ratio,
                          cosine_mean, pearsonr_mean, spearmanr_mean,
                          cosine_max, pearsonr_max, spearmanr_max])

    pert_df = pd.DataFrame(pert_list, columns=pert_df_columns)
    pert_df.to_csv(('../results/pert_df_%s.csv' % process_index), sep='\t', encoding='utf-8')

    print("\n\nCurrent Process Index: %s" % (process_index,))
    print("Current Process Chunk Length: %s" % (len(pert_files_chunk),))
    print(chunk_span)
    print("Current Process Name: %s" % (current_process().name,))
    print("Time Ended at: %s" % (str(datetime.datetime.now()),))
    print("Time elapsed: {} seconds".format(time.time() - time_start))

In [24]:
####
#
# all combinations, all pairs
#
####

import time

if __name__ == '__main__':
    # Driver: build every unordered pair of input files, split the pairs into
    # chunks and hand each chunk to its own worker process.
    procs = []

    outdir = '../data/pert_1000/'
    #outdir = '../data/pert_EGFR/'
    pert_files = [f for f in listdir(outdir) if isfile(join(outdir, f))]
    pert_files_combinations = list(itertools.combinations(pert_files, 2))

    # Single-argument print calls produce the same text on Python 2 and 3.
    print("Number of total files: %s" % (len(pert_files),))
    print("Number of total combinations: %s" % (len(pert_files_combinations),))

    # How many cores: leave one free, but never drop to zero on a
    # single-core machine (that would divide by zero below).
    cores = max(1, cpu_count() - 1)
    #cores = 1

    # Chunk size. Floor division is kept on purpose so the number of chunks
    # (and therefore of result files) is the same as before; the lower bound
    # of 1 avoids a zero step in chunks() when there are fewer pairs than cores.
    num_chunk = max(1, len(pert_files_combinations) // cores)

    # NOTE(review): the workers read `outdir` and `pert_files_combinations`
    # as module-level names; this presumably relies on the fork start method
    # so children inherit them - confirm before running on another platform.
    for process_index, pert_files_chunk in enumerate(chunks(pert_files_combinations, num_chunk)):
        proc = Process(target=pert_combination, args=(process_index, pert_files_chunk,))
        proc.start()
        procs.append(proc)
        # Stagger the start-up of the workers.
        time.sleep(2)

    # Wait for every worker so that later cells only see complete result files.
    for proc in procs:
        proc.join()

Number of total files: 1000
Number of total combinations: 499500


Current Process Index: 0
Current Process Chunk Length: 26289
from 0 to 26288
Current Process Name: Process-29
Time Started at: 2018-01-08 21:16:34.911339


Current Process Index: 1
Current Process Chunk Length: 26289
from 26289 to 52577
Current Process Name: Process-30
Time Started at: 2018-01-08 21:16:36.921214


Current Process Index: 2
Current Process Chunk Length: 26289
from 52578 to 78866
Current Process Name: Process-31
Time Started at: 2018-01-08 21:16:38.935028


Current Process Index: 3
Current Process Chunk Length: 26289
from 78867 to 105155
Current Process Name: Process-32
Time Started at: 2018-01-08 21:16:40.949670


Current Process Index: 4
Current Process Chunk Length: 26289
from 105156 to 131444
Current Process Name: Process-33
Time Started at: 2018-01-08 21:16:42.958353


Current Process Index: 5
Current Process Chunk Length: 26289
from 131445 to 157733
Current Process Name: Process-34
Time Started at: 2

In [26]:
# Merge the per-process result files into one frame.
#
# The driver cell cuts `pert_files_combinations` into slices of `num_chunk`
# pairs, so the number of result files is ceil(len / num_chunk). That is not
# always `cores + 1`: it is `cores` when the length divides evenly and can be
# larger when the remainder exceeds one chunk.
num_result_files = -(-len(pert_files_combinations) // num_chunk)

# Read everything first and concatenate once; appending frame by frame
# copies the accumulated data on every iteration.
result_frames = [
    pd.read_csv(('../results/pert_df_%s.csv' % index), sep='\t', encoding='utf-8')
    for index in range(num_result_files)
]
all_df = pd.concat(result_frames, ignore_index=True)


# The workers wrote their index as an unnamed first column; drop it.
all_df = all_df.drop('Unnamed: 0', axis=1)
# Keep only file pairs that share at least one identical condition.
paired_all_df = all_df.loc[all_df['condition pair count'] > 0]

print(len(all_df))
print(len(paired_all_df))

# Fraction of file pairs with at least one condition pair.
print(len(paired_all_df)/float(len(all_df)))

499500
5404
0.0108188188188


In [27]:
# Persist the file pairs that have at least one condition pair (tab-separated).
paired_all_df.to_csv(
    path_or_buf='../results/pert_df_pair.csv',
    sep='\t',
    encoding='utf-8',
)