In [1]:
import pickle
import random
import pandas

### Associate proteins to gene reference ids

In [6]:
%%time
# Build a lookup from gene symbol to RefSeq id from the tab-separated
# kgXref file: column 4 is read as the gene symbol, column 5 as the RefSeq id.
# If a symbol appears on several rows, the last qualifying row wins.
protein_name_to_ref_id = {}
with open("../files/kgXref23Jan2013") as f:
    for line in f:
        # Split once; drop the newline so it can never end up inside a field.
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 6:
            # Blank or truncated row: there is no RefSeq column to read.
            continue
        geneSymbol, refSeq = fields[4], fields[5]
        # strip() is falsy for both "" and whitespace-only symbols, whereas
        # isspace() returns False for "" and would let an empty key through.
        # Only ids starting with "NM" are kept.
        if geneSymbol.strip() and refSeq.startswith("NM"):
            protein_name_to_ref_id[geneSymbol] = refSeq

CPU times: user 133 ms, sys: 4.41 ms, total: 137 ms
Wall time: 136 ms


### Associate reference ids to their gene sequence

In [7]:
%%time
# Parse the FASTA-style 3'UTR file into {ref_seq_id: sequence}.
# Header lines start with ">" and carry the id after the "refGene_" prefix of
# their first space-separated token; all following lines up to the next
# header are concatenated into that record's sequence.
ref_id_to_sequence = dict()
with open("../files/3UTR_RefSeq One_record_per_gene") as f:
    ref_seq_id = None
    sequence_parts = []
    for line in f:
        if line.startswith(">"):
            # A new header closes the previous record (if there was one).
            if ref_seq_id is not None:
                ref_id_to_sequence[ref_seq_id] = "".join(sequence_parts)
            ref_seq_id = line.split(" ")[0].split("refGene_")[1]
            sequence_parts = []
        else:
            # rstrip only removes an actual newline, so a final line without
            # one does not lose its last base.
            sequence_parts.append(line.rstrip("\n"))
    # Flush the final record: no further header follows it, so without this
    # step the last sequence in the file would be silently dropped.
    if ref_seq_id is not None:
        ref_id_to_sequence[ref_seq_id] = "".join(sequence_parts)

CPU times: user 851 ms, sys: 44.7 ms, total: 895 ms
Wall time: 964 ms


### Associate reference ids to their randomly shuffled gene sequences

In [8]:
%%time
# For every sequence, shuffle the characters inside each consecutive
# window of `window_size` characters (the last window may be shorter) and
# store the re-joined result under the same reference id.
# NOTE(review): the global `random` state is not seeded here, so the shuffled
# sequences differ between runs; seed it beforehand if reproducibility matters.
ref_id_to_shuffled_sequence = dict()
window_size = 10  # also read by the inspection cell further down
for ref_seq_id, gene in ref_id_to_sequence.items():
    shuffled_windows = []
    for start in range(0, len(gene), window_size):
        window_chars = list(gene[start:start + window_size])
        random.shuffle(window_chars)  # in-place shuffle of this window only
        shuffled_windows.append("".join(window_chars))
    # Join once at the end instead of growing a string with repeated +=.
    ref_id_to_shuffled_sequence[ref_seq_id] = "".join(shuffled_windows)


CPU times: user 40.8 s, sys: 46 ms, total: 40.9 s
Wall time: 40.9 s


Inspect shuffling

In [11]:
def _split_into_windows(sequence, size):
    """Cut ``sequence`` into consecutive chunks of ``size`` characters (the last may be shorter)."""
    return [sequence[start:start + size] for start in range(0, len(sequence), size)]


# Compare windows 1..4 of one sequence before and after shuffling.
Example_before = ref_id_to_sequence['NM_001001740']
Example_before_windows = _split_into_windows(Example_before, window_size)
Example_after = ref_id_to_shuffled_sequence['NM_001001740']
Example_after_windows = _split_into_windows(Example_after, window_size)
for windows in (Example_before_windows, Example_after_windows):
    print(windows[1:5])

['caagtcaaat', 'tgtacttgat', 'cctgctgaaa', 'tacatctgca']
['aataacctag', 'ttaggtttca', 'gagacttcca', 'taatgcacct']


### Retrieve protein names from file

In [12]:
%%time
# Collect, in file order, every whitespace-stripped line of ProteinsNames.txt
# that is a key of protein_name_to_ref_id.
proteinsFound = []
with open("../files/ProteinsNames.txt") as f:
    proteinsFound.extend(
        name for name in map(str.strip, f) if name in protein_name_to_ref_id
    )


CPU times: user 3.32 ms, sys: 666 µs, total: 3.98 ms
Wall time: 3.49 ms


### Associate protein name to gene sequence

In [17]:
%%time
# Map each found protein name to its original and shuffled sequence,
# going through the protein's reference id. Proteins whose reference id has
# no entry in ref_id_to_sequence are skipped.
protein_to_sequence = {}
protein_to_shuffled_sequence = {}
for name in proteinsFound:
    rid = protein_name_to_ref_id[name]
    if rid not in ref_id_to_sequence:
        continue
    # Look both sequences up before storing either one.
    original_seq, shuffled_seq = ref_id_to_sequence[rid], ref_id_to_shuffled_sequence[rid]
    protein_to_sequence[name] = original_seq
    protein_to_shuffled_sequence[name] = shuffled_seq

CPU times: user 5.25 ms, sys: 809 µs, total: 6.05 ms
Wall time: 5.4 ms


### Creating proteins_to_sequences.pkl and proteins_to_shuffled_sequences.pkl files

In [18]:
%%time
# Persist both protein -> sequence mappings as pickle files.
# The `with` blocks close each file as soon as it is written, so the data is
# flushed to disk and no file handle is left open in the kernel.
with open("../files/intermediates/proteins_to_sequences.pkl", "wb") as sequences_file:
    pickle.dump(protein_to_sequence, sequences_file)
with open("../files/intermediates/proteins_to_shuffled_sequences.pkl", "wb") as shuffled_file:
    pickle.dump(protein_to_shuffled_sequence, shuffled_file)

CPU times: user 6.08 ms, sys: 21.1 ms, total: 27.2 ms
Wall time: 51.5 ms


### Generating Distance Matrix 

In [4]:
%%time
# Read correlation.tsv (tab-separated: first name, second name, value) into a
# nested dict keyed as distanceMatrix[first][second] = value, then wrap the
# nested dict in a DataFrame.
distanceMatrix = {}
with open("../files/correlation.tsv") as f:
    for line in f:
        columns = line.split("\t")
        protein_i, protein_j = columns[0], columns[1]
        correlation = float(columns[2])
        # setdefault creates the inner dict the first time protein_i is seen.
        distanceMatrix.setdefault(protein_i, {})[protein_j] = correlation
df = pandas.DataFrame(distanceMatrix)


CPU times: user 29.2 s, sys: 1.05 s, total: 30.2 s
Wall time: 30.4 s


In [26]:
# Show the DataFrame as this cell's output (bare last expression).
df

Unnamed: 0,56160512,56160531,56160856,AAAS,AAK1,AAR2,AARS2,AASDH,AASS,AATF,...,ZNF91,ZNHIT2,ZRANB2,ZSCAN18,ZSCAN21,ZW10,ZWILCH,ZWINT,ZYX,ZZZ3
56160512,1.000000,-0.010408,-0.015655,-0.007176,-0.020762,-0.008966,-0.022708,-0.017792,-0.031831,0.095075,...,-0.016221,-0.012532,0.241857,-0.017769,0.117697,-0.059914,-0.019007,-0.021280,0.003775,-0.026859
56160531,-0.010408,1.000000,-0.011067,0.298369,-0.086447,-0.032424,-0.019275,-0.054314,-0.004689,-0.029570,...,-0.049517,-0.038256,-0.095938,-0.054244,-0.078840,0.092254,0.014814,0.100703,-0.083606,0.027269
56160856,-0.015655,-0.011067,1.000000,-0.024937,0.110500,-0.012350,-0.017685,-0.013856,-0.024790,-0.027054,...,-0.012632,-0.009759,-0.036588,-0.013838,-0.032217,0.031446,-0.014803,-0.016573,0.699519,-0.020918
AAAS,-0.007176,0.298369,-0.024937,1.000000,-0.087532,-0.017074,-0.050126,-0.028758,0.007796,-0.071573,...,-0.044331,-0.022629,-0.100434,-0.045524,-0.078610,0.495952,0.035077,-0.040125,-0.101689,-0.054793
AAK1,-0.020762,-0.086447,0.110500,-0.087532,1.000000,-0.023553,-0.039770,-0.031161,-0.054362,-0.060841,...,-0.028409,-0.021948,-0.071958,-0.031120,-0.072453,-0.061124,-0.022679,-0.027056,0.199705,-0.038515
AAR2,-0.008966,-0.032424,-0.012350,-0.017074,-0.023553,1.000000,-0.016624,-0.008970,-0.008013,-0.027169,...,-0.012796,0.994777,-0.022822,0.015483,-0.023432,-0.034979,0.033944,-0.012263,-0.032756,0.000215
AARS2,-0.022708,-0.019275,-0.017685,-0.050126,-0.039770,-0.016624,1.000000,-0.020099,0.863452,-0.039243,...,-0.018324,-0.014157,-0.020070,-0.020073,-0.039834,-0.069844,-0.021472,-0.024040,-0.058007,-0.030342
AASDH,-0.017792,-0.054314,-0.013856,-0.028758,-0.031161,-0.008970,-0.020099,1.000000,-0.028175,0.028712,...,0.078246,-0.011092,0.109023,0.105925,0.263043,-0.036993,-0.016824,-0.018835,-0.045450,-0.023774
AASS,-0.031831,-0.004689,-0.024790,0.007796,-0.054362,-0.008013,0.863452,-0.028175,1.000000,-0.055010,...,-0.025686,-0.019845,-0.033891,0.019116,-0.042466,0.032423,0.093913,-0.033698,-0.079070,-0.015168
AATF,0.095075,-0.029570,-0.027054,-0.071573,-0.060841,-0.027169,-0.039243,0.028712,-0.055010,1.000000,...,-0.023713,-0.021657,0.476118,-0.022384,0.386135,-0.075598,-0.032848,-0.036776,-0.088739,0.039934


### Creating distanceMatrix.pkl file

In [27]:
%%time
# Save the DataFrame as a pickle. The path uses the same "../files/" prefix
# as every other file access in this notebook, so the later
# read of "../files/intermediates/distanceMatrix.pkl" finds what is written here.
# Passing the path (not an open() handle) lets pandas open and close the file.
df.to_pickle("../files/intermediates/distanceMatrix.pkl")

CPU times: user 66.2 ms, sys: 150 ms, total: 217 ms
Wall time: 795 ms


peek into distance matrix

In [23]:
# Summarise the matrix: per-column mean and standard deviation, then the
# plain average of each of those two series.
means = df.mean()
stds = df.std()
n_means, n_stds = len(means), len(stds)
meancorr = sum(means) / n_means  # average of the per-column means
stdCorr = sum(stds) / n_stds  # average of the per-column standard deviations
print(meancorr, stdCorr, sep="\n")

0.023940678911504795
0.1423947438720881


In [25]:

# Reload the pickled DataFrame. Passing the path lets pandas open and close
# the file itself instead of leaving an open() handle dangling.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
df = pandas.read_pickle("../files/intermediates/distanceMatrix.pkl")
df

Unnamed: 0,56160512,56160531,56160856,AAAS,AAK1,AAR2,AARS2,AASDH,AASS,AATF,...,ZNF91,ZNHIT2,ZRANB2,ZSCAN18,ZSCAN21,ZW10,ZWILCH,ZWINT,ZYX,ZZZ3
56160512,1.000000,-0.010408,-0.015655,-0.007176,-0.020762,-0.008966,-0.022708,-0.017792,-0.031831,0.095075,...,-0.016221,-0.012532,0.241857,-0.017769,0.117697,-0.059914,-0.019007,-0.021280,0.003775,-0.026859
56160531,-0.010408,1.000000,-0.011067,0.298369,-0.086447,-0.032424,-0.019275,-0.054314,-0.004689,-0.029570,...,-0.049517,-0.038256,-0.095938,-0.054244,-0.078840,0.092254,0.014814,0.100703,-0.083606,0.027269
56160856,-0.015655,-0.011067,1.000000,-0.024937,0.110500,-0.012350,-0.017685,-0.013856,-0.024790,-0.027054,...,-0.012632,-0.009759,-0.036588,-0.013838,-0.032217,0.031446,-0.014803,-0.016573,0.699519,-0.020918
AAAS,-0.007176,0.298369,-0.024937,1.000000,-0.087532,-0.017074,-0.050126,-0.028758,0.007796,-0.071573,...,-0.044331,-0.022629,-0.100434,-0.045524,-0.078610,0.495952,0.035077,-0.040125,-0.101689,-0.054793
AAK1,-0.020762,-0.086447,0.110500,-0.087532,1.000000,-0.023553,-0.039770,-0.031161,-0.054362,-0.060841,...,-0.028409,-0.021948,-0.071958,-0.031120,-0.072453,-0.061124,-0.022679,-0.027056,0.199705,-0.038515
AAR2,-0.008966,-0.032424,-0.012350,-0.017074,-0.023553,1.000000,-0.016624,-0.008970,-0.008013,-0.027169,...,-0.012796,0.994777,-0.022822,0.015483,-0.023432,-0.034979,0.033944,-0.012263,-0.032756,0.000215
AARS2,-0.022708,-0.019275,-0.017685,-0.050126,-0.039770,-0.016624,1.000000,-0.020099,0.863452,-0.039243,...,-0.018324,-0.014157,-0.020070,-0.020073,-0.039834,-0.069844,-0.021472,-0.024040,-0.058007,-0.030342
AASDH,-0.017792,-0.054314,-0.013856,-0.028758,-0.031161,-0.008970,-0.020099,1.000000,-0.028175,0.028712,...,0.078246,-0.011092,0.109023,0.105925,0.263043,-0.036993,-0.016824,-0.018835,-0.045450,-0.023774
AASS,-0.031831,-0.004689,-0.024790,0.007796,-0.054362,-0.008013,0.863452,-0.028175,1.000000,-0.055010,...,-0.025686,-0.019845,-0.033891,0.019116,-0.042466,0.032423,0.093913,-0.033698,-0.079070,-0.015168
AATF,0.095075,-0.029570,-0.027054,-0.071573,-0.060841,-0.027169,-0.039243,0.028712,-0.055010,1.000000,...,-0.023713,-0.021657,0.476118,-0.022384,0.386135,-0.075598,-0.032848,-0.036776,-0.088739,0.039934
