In [89]:
####
#
# csv
#
####
import os
import pandas as pd
from os import listdir
from os.path import isfile, join
import numpy as np

from multiprocessing import Process, current_process, cpu_count

from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats.stats import pearsonr, spearmanr
from scipy import spatial

import datetime

def chunks(l, n):
    """Generate consecutive slices of ``l`` holding at most ``n`` items each.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``;
    an empty ``l`` produces no slices at all.
    """
    total = len(l)
    for start in range(0, total, n):
        stop = start + n
        yield l[start:stop]

def pert_info(pert_split):
    """Extract plate, cell line, hour and dose from a split perturbation name.

    Parameters
    ----------
    pert_split : sequence of str
        At least three fields:

        * ``pert_split[0]`` -- environment, ``"<plate>_<cell line>_<hour>"``
        * ``pert_split[1]`` -- perturbation identifier (per the original notes:
          structure id, salt code, batch id and checksum joined by ``-``)
        * ``pert_split[2]`` -- dose

    Returns
    -------
    list
        ``[pert_plate, pert_cell_line, pert_hour, pert_dose]``

    Raises
    ------
    IndexError
        If ``pert_split`` has fewer than three fields, or the environment
        field has fewer than three ``_``-separated parts.
    """
    # Environment of the perturbation: [1] plate, [2] cell line, [3] hour.
    pert_env_split = pert_split[0].split("_")
    pert_plate = pert_env_split[0]
    pert_cell_line = pert_env_split[1]
    pert_hour = pert_env_split[2]

    # The identifier field (pert_split[1]) is deliberately not parsed here:
    # none of its parts are returned, and the earlier parsing was wrong (it
    # read index 2 for salt code, batch id and checksum alike, and
    # "-".join() on a single string inserts dashes between its characters).

    # [8] dose of the perturbation
    pert_dose = pert_split[2]

    return [pert_plate, pert_cell_line, pert_hour, pert_dose]

def caculate_smiliarity(pert_file_origin, pert_file_target, pert_dict):
    """Score how alike the vectors stored for two perturbation files are.

    ``pert_dict`` maps a file name to its vector. The vectors registered under
    ``pert_file_origin`` and ``pert_file_target`` are compared and the result
    is returned as ``[cosine similarity, Pearson r, Spearman rho]``.
    """
    origin_vec, target_vec = (
        np.array(pert_dict[key]) for key in (pert_file_origin, pert_file_target)
    )

    # scipy reports the cosine *distance*; subtract from 1 for the similarity.
    similarity_scores = [1 - spatial.distance.cosine(origin_vec, target_vec)]

    # Each correlation helper returns (statistic, p-value); keep the statistic.
    for correlate in (pearsonr, spearmanr):
        similarity_scores.append(correlate(origin_vec, target_vec)[0])

    return similarity_scores
    

def _log_chunk_status(process_index, pert_files_chunk):
    """Print which worker is running and which slice of the pair list it owns."""
    print("\n\nCurrent Process Index: %s" % (process_index,))
    print("Current Process Chunk Length: %s" % len(pert_files_chunk))
    # Positions are looked up in the notebook-level `pert_files_combinations` list.
    print("from %s to %s" % (pert_files_combinations.index(pert_files_chunk[0]),
                             pert_files_combinations.index(pert_files_chunk[-1])))
    print("Current Process Name: %s" % current_process().name)


def pert_clustering(process_index, pert_files, pert_files_chunk):
    """Compute centroid-based cosine similarities for one chunk of file pairs.

    For every file in ``pert_files`` the CSV is read from the notebook-level
    ``outdir`` directory and each of its first 10 rows is reduced to three
    scalars: mean, median, and mean divided by standard deviation (stored
    under the ``znorm`` name). The per-file vectors built from those scalars
    are then compared for every pair in ``pert_files_chunk``, and the cosine
    similarities are written to
    ``../results/pert_df_centroid_<process_index>.csv`` (tab-separated).

    Parameters
    ----------
    process_index : int
        Index of this worker; used in the log output and the result file name.
    pert_files : list of str
        All perturbation file names (relative to ``outdir``).
    pert_files_chunk : list of (str, str)
        The (origin, target) file-name pairs this worker has to score.

    Notes
    -----
    Relies on the notebook globals ``outdir`` and ``pert_files_combinations``
    being defined before the function is called.
    """
    # Imported locally: the cell defining this function does not import `time`
    # (only a later cell does), so this keeps the function self-contained.
    import time

    _log_chunk_status(process_index, pert_files_chunk)
    print("Time Started at: %s" % datetime.datetime.now())
    time_start = time.time()

    pert_df_columns = ['file_origin', 'file_target', 'cosine_avg', 'cosine_median', 'cosine_znorm']

    # Representative vector per file, one entry per row that was read.
    pert_dict_avg = {}
    pert_dict_median = {}
    pert_dict_znorm = {}

    for pert_file in pert_files:
        pert_frame = pd.read_csv(os.path.join(outdir, pert_file))

        pert_vect_avg = []
        pert_vect_median = []
        pert_vect_znorm = []

        # Only the first 10 rows of each file contribute to the vectors.
        for pert_row in pert_frame.values[:10]:
            X = np.array(pert_row)
            X_avg = np.mean(X)
            X_median = np.median(X)
            X_std = np.std(X)

            # NOTE(review): not guarded against X_std == 0 (constant row).
            z_normalized = X_avg / X_std

            pert_vect_avg.append(X_avg)
            pert_vect_median.append(X_median)
            pert_vect_znorm.append(z_normalized)

        pert_dict_avg[pert_file] = pert_vect_avg
        pert_dict_median[pert_file] = pert_vect_median
        pert_dict_znorm[pert_file] = pert_vect_znorm

    pert_list = []
    for pert_pair in pert_files_chunk:
        pert_file_origin = pert_pair[0]
        pert_file_target = pert_pair[1]

        # Element [0] of the helper's result is the cosine similarity.
        pert_list.append([pert_file_origin, pert_file_target,
                          caculate_smiliarity(pert_file_origin, pert_file_target, pert_dict_avg)[0],
                          caculate_smiliarity(pert_file_origin, pert_file_target, pert_dict_median)[0],
                          caculate_smiliarity(pert_file_origin, pert_file_target, pert_dict_znorm)[0]])

    pert_df = pd.DataFrame(pert_list, columns=pert_df_columns)
    pert_df.to_csv(('../results/pert_df_centroid_%s.csv' % process_index), sep='\t', encoding='utf-8')

    _log_chunk_status(process_index, pert_files_chunk)
    print("Time Ended at: %s" % datetime.datetime.now())
    print("Time elapsed: {} seconds".format(time.time() - time_start))

In [90]:
####
#
# all combinations, all pairs
#
####

import time
import itertools
if __name__ == '__main__':
    outdir = '../data/pert_100/'
    #outdir = '../data/pert_EGFR/'
    pert_files = [f for f in listdir(outdir) if isfile(join(outdir, f))]
    pert_files_combinations = list(itertools.combinations(pert_files, 2))

    print("Number of total files: %s" % len(pert_files))
    print("Number of total combinations: %s" % len(pert_files_combinations))

    # How many cores
    #cores = cpu_count()-1
    cores = 5

    # Ceiling division: with floor division a pair count that is not a
    # multiple of `cores` produced an extra remainder chunk (index == cores)
    # whose result file the merge step, which reads 0 .. cores-1, never picked
    # up. max(1, ...) also avoids a zero step when there are fewer pairs than
    # cores.
    total_pairs = len(pert_files_combinations)
    num_chunk = max(1, (total_pairs + cores - 1) // cores)
    pert_files_chunks = list(chunks(pert_files_combinations, num_chunk))

    # Keep `cores` equal to the number of workers actually started, so that
    # result files 0 .. cores-1 are exactly the ones written.
    cores = len(pert_files_chunks)

    workers = []
    for process_index, pert_files_chunk in enumerate(pert_files_chunks):
        proc = Process(target=pert_clustering, args=(process_index, pert_files, pert_files_chunk,))
        proc.start()
        workers.append(proc)
        # Staggered start kept from the original; presumably it keeps the
        # workers' start-up log lines from interleaving.
        time.sleep(2)

    # Wait for every worker so all result files are complete before any
    # later cell reads them.
    for proc in workers:
        proc.join()
    
    

Number of total files: 100
Number of total combinations: 4950


Current Process Index: 0
Current Process Chunk Length: 990
from 0 to 989
Current Process Name: Process-36
Time Started at: 2018-01-09 23:34:18.505827


Current Process Index: 1
Current Process Chunk Length: 990
from 990 to 1979
Current Process Name: Process-37
Time Started at: 2018-01-09 23:34:20.513976


Current Process Index: 0
Current Process Chunk Length: 990
from 0 to 989
Current Process Name: Process-36
Time Ended at: 2018-01-09 23:34:20.876054
Time elapsed: 2.36984801292 seconds


Current Process Index: 2
Current Process Chunk Length: 990
from 1980 to 2969
Current Process Name: Process-38
Time Started at: 2018-01-09 23:34:22.516163


Current Process Index: 1
Current Process Chunk Length: 990
from 990 to 1979
Current Process Name: Process-37
Time Ended at: 2018-01-09 23:34:23.038122
Time elapsed: 2.52367401123 seconds


Current Process Index: 3
Current Process Chunk Length: 990
from 2970 to 3959
Current Process Name:

In [93]:
# Merge the per-worker result files (indices 0 .. cores-1) into a single table.
# index_col=0 consumes the row index that to_csv stored in each part file, so
# it does not come back as a stray "Unnamed: 0" column in the merged output.
part_frames = [
    pd.read_csv(('../results/pert_df_centroid_%s.csv' % index), sep='\t', encoding='utf-8', index_col=0)
    for index in range(cores)
]
# One concat instead of repeated DataFrame.append: append copied the growing
# frame on every iteration and no longer exists in pandas >= 2.0.
all_df = pd.concat(part_frames, ignore_index=True)
all_df.to_csv('../results/pert_df_centroid_pair.csv', sep='\t', encoding='utf-8')