In [8]:
import pickle
import random
import pandas

### Associate proteins to gene reference ids

In [18]:
%%time
protein_name_to_ref_id = {}
with open("../files/kgXref23Jan2013") as f:
    for line in f:
        geneSymbol = line.split("\t")[4]
        refSeq     = line.split("\t")[5]  
        if (geneSymbol.isspace() != True and refSeq.startswith("NM") == True):
            protein_name_to_ref_id[geneSymbol] = refSeq

CPU times: user 232 ms, sys: 19.3 ms, total: 251 ms
Wall time: 249 ms


### Associate reference ids to their gene sequence

In [10]:
%%time
ref_id_to_sequence = dict()
with open("../files/3UTR_RefSeq One_record_per_gene") as f:
    firstLine = f.readline()
    ref_seq_id = firstLine.split(" ")[0].split("refGene_")[1]
    mRNA = ""
    for line in f:
        if(line.startswith(">")):
            ref_id_to_sequence[ref_seq_id] = mRNA
            ref_seq_id = line.split(" ")[0].split("refGene_")[1]
            mRNA = ""
        else:
            mRNA += line[0:len(line)-1]

CPU times: user 1.65 s, sys: 195 ms, total: 1.85 s
Wall time: 1.88 s


### Associate reference ids to their randomly shuffled gene sequences

In [25]:
%%time
ref_id_to_shuffled_sequence = dict()
window_size = 10
for ref_seq_id in ref_id_to_sequence:
    gene = ref_id_to_sequence[ref_seq_id]
    window_size = 10
    gene_windows = [gene[i:i+window_size] for i in range(0, len(gene), window_size)]
    shuffled_gene = ""
    for i, geneWindow in enumerate(gene_windows): #shuffling window
        l = list(geneWindow)
        random.shuffle(l)
        shuffled_gene += ''.join(l)
    ref_id_to_shuffled_sequence[ref_seq_id] = shuffled_gene


CPU times: user 1min 20s, sys: 1.48 s, total: 1min 21s
Wall time: 1min 26s


Inpect shuffling 

In [13]:
Example_before = ref_id_to_sequence['NM_001001740']
Example_before_windows = [Example_before[i:i+window_size] for i in range(0, len(Example_before), window_size)]
Example_after = ref_seq_to_shuffled_sequence['NM_001001740']
Example_after_windows = [Example_after[i:i+window_size] for i in range(0, len(Example_after), window_size)]
print(Example_before_windows[1:5])
print(Example_after_windows[1:5])

['caagtcaaat', 'tgtacttgat', 'cctgctgaaa', 'tacatctgca']
['agcataaact', 'tattggcatt', 'tgcccgaata', 'actttaacgc']


### Retreive protein names from file

In [19]:
%%time
proteinsFound = []
with open("../files/ProteinsNames.txt") as f:
    for line in f:
        if(line.strip() in protein_name_to_ref_id):
            proteinsFound.append(line.strip())


CPU times: user 7.14 ms, sys: 4.44 ms, total: 11.6 ms
Wall time: 18.4 ms


### Associate protein name to gene sequence

In [22]:
%%time
protein_to_sequence = dict()
protein_to_shuffled_sequence = dict()
for protein in proteinsFound:
    ref_id = protein_name_to_ref_id[protein]
    if(ref_id in ref_id_to_sequence):
        sequence = ref_id_to_sequence[ref_id]
        shuffled_sequence = ref_seq_to_shuffled_sequence[ref_id]
        protein_to_sequence[protein] = sequence
        protein_to_shuffled_sequence[protein] = shuffled_sequence

CPU times: user 18.2 ms, sys: 67 ms, total: 85.3 ms
Wall time: 85.5 ms


### Associate protein name to shuffled gene sequence

In [26]:
%%time
protein_to_sequence = dict()
protein_to_shuffled_sequence = dict()
for protein in proteinsFound:
    ref_id = protein_name_to_ref_id[protein]
    if(ref_id in ref_id_to_shuffled_sequence):
        sequence = ref_id_to_shuffled_sequence[ref_id]
        shuffled_sequence = ref_seq_to_shuffled_sequence[ref_id]
        protein_to_sequence[protein] = sequence
        protein_to_shuffled_sequence[protein] = shuffled_sequence

CPU times: user 16.7 ms, sys: 59.6 ms, total: 76.3 ms
Wall time: 80.8 ms


### Creating proteins_to_sequence.plk and proteins_to_shuffled_sequences.pkl files

In [28]:
%%time
pickle.dump(protein_to_sequence, open( "../files/intermediates/proteins_to_sequences.pkl", "wb" ))
pickle.dump(protein_to_shuffled_sequence, open( "../files/intermediates/proteins_to_shuffled_sequences.pkl", "wb" ))

CPU times: user 24.3 ms, sys: 135 ms, total: 160 ms
Wall time: 249 ms


### Generating Distance Matrix 

In [29]:
%%time
distanceMatrix = dict()
with open("../files/correlation.tsv") as f:
    for line in f:
        protein_i = line.split("\t")[0]
        protein_j = line.split("\t")[1]
        correlation = float(line.split("\t")[2])
        if(protein_i not in distanceMatrix):
            distanceMatrix[protein_i] = {}
        distanceMatrix[protein_i][protein_j] = correlation
df = pandas.DataFrame(distanceMatrix)


CPU times: user 17.3 s, sys: 1.79 s, total: 19.1 s
Wall time: 21.4 s


### Creating distanceMatrix.pkl file

In [27]:
%%time
df.to_pickle(open( "files/intermediates/distanceMatrix.pkl", "wb" ))

CPU times: user 66.2 ms, sys: 150 ms, total: 217 ms
Wall time: 795 ms


peek into distance matrix

In [40]:
distanceMatrix['ABCB7']

{'ABCB7': 1.0,
 'DHX30': 0.723889065649033,
 'SLC30A9': 0.61909299921141,
 'KIAA0391': 0.655535869033045,
 'AFG3L2': 0.685247051445739,
 'GLUD1': 0.662208223890299,
 'NDUFAF4': 0.677647873332862,
 'ATPAF1': 0.771016096674887,
 'ATP5B': 0.690403259940054,
 'ETFA': 0.563521799233124,
 'IARS2': 0.585490461103937,
 'MDH2': 0.566997009800723,
 'OAT': 0.551060317518727,
 'ERAL1': 0.592034444517862,
 'PPIF': 0.613465087012541,
 'MTHFD1L': 0.640292628602629,
 'NDUFV3': 0.596885777455722,
 'LETM1': 0.596595641790333,
 'NDUFAF2': 0.607338603115212,
 'ACOT1': 0.580784673989049,
 'LRPPRC': 0.58689944012214,
 'MRPS31': 0.580036006931087,
 'SHMT2': 0.588344481517721,
 'TRMT10C': 0.643225753999404,
 'EARS2': 0.630083344392611,
 'TACO1': 0.609090108807148,
 'MRM3': 0.602647294558302,
 'MTHFD2': 0.593293477274748,
 'MRPL48': 0.584861275263382,
 'QRSL1': 0.58417542524104,
 'GTPBP10': 0.578776689725141,
 'GRSF1': 0.577407405162721,
 'MRPS14': 0.573462392443653,
 'MRPL45': 0.555278422205731,
 'MRPS23': 0.

In [42]:
distanceMatrix['SLC30A9']

{'ABCB7': 0.61909299921141,
 'DHX30': 0.818845841054781,
 'SLC30A9': 1.0,
 'KIAA0391': 0.832650130813785,
 'AFG3L2': 0.827844339087838,
 'GLUD1': 0.752229546720549,
 'NDUFAF4': 0.752070044142159,
 'ATPAF1': 0.712829144298571,
 'ATP5B': 0.735225546856152,
 'ETFA': 0.670113306063479,
 'IARS2': 0.667695626949346,
 'MDH2': 0.636870710176698,
 'OAT': 0.71884198626612,
 'ERAL1': 0.722530728302722,
 'PPIF': 0.747368637973769,
 'MTHFD1L': 0.73301592606104,
 'NDUFV3': 0.770968960663764,
 'LETM1': 0.803873851492856,
 'NDUFAF2': 0.781787068798573,
 'ACOT1': 0.798116330355295,
 'LRPPRC': 0.820026975426514,
 'MRPS31': 0.810576650413257,
 'SHMT2': 0.806700388300229,
 'TRMT10C': 0.81541391712412,
 'EARS2': 0.830283600601636,
 'TACO1': 0.807265953380839,
 'MRM3': 0.797508481431935,
 'MTHFD2': 0.798613385972248,
 'MRPL48': 0.771382785392079,
 'QRSL1': 0.770374233396815,
 'GTPBP10': 0.780159417475769,
 'GRSF1': 0.789956826407026,
 'MRPS14': 0.79516329910103,
 'MRPL45': 0.769700394796376,
 'MRPS23': 0.73

In [39]:
distanceMatrix.keys()

dict_keys(['ABCB7', 'DHX30', 'SLC30A9', 'KIAA0391', 'AFG3L2', 'GLUD1', 'NDUFAF4', 'ATPAF1', 'ATP5B', 'ETFA', 'IARS2', 'MDH2', 'OAT', 'ERAL1', 'PPIF', 'MTHFD1L', 'NDUFV3', 'LETM1', 'NDUFAF2', 'ACOT1', 'LRPPRC', 'MRPS31', 'SHMT2', 'TRMT10C', 'EARS2', 'TACO1', 'MRM3', 'MTHFD2', 'MRPL48', 'QRSL1', 'GTPBP10', 'GRSF1', 'MRPS14', 'MRPL45', 'MRPS23', 'RBFA', 'NGRN', 'MRPL10', 'MRPL42', 'LOC100507855', 'OXSM', 'MTRF1L', 'HSPE1', 'BCS1L', 'C8orf82', 'MRPS17', 'NDUFA6', 'CDK5RAP1', 'TRMT61B', 'HSDL2', 'MRPS36', 'FECH', 'PDE12', 'SUCLA2', 'NME4', 'MRPL46', 'C17orf80', 'MRPS24', 'TIMM44', 'GLS', 'SDHA', 'MGME1', 'METTL15', 'MRPS28', 'NFS1', 'POLG', 'MMAB', 'TEFM', 'NDUFAF5', 'SUPV3L1', 'NIPSNAP1', 'THNSL1', 'ABHD10', 'MRPL24', 'ATPIF1', 'NDUFS6', 'FLAD1', 'PREPL', 'VWA8', 'DHTKD1', 'PNPLA8', 'PDK1', 'CARS2', 'RTN4IP1', 'PNPT1', 'ALAS1', 'SDHAF3', 'PPA2', 'GRPEL1', 'NDUFA12', 'PTCD3', 'PYCR1', 'MRPS26', 'FASTKD5', 'ACOT9', 'ACADSB', 'MRPL44', 'ATP5J2-PTCD1', 'FASTKD2', 'COQ3', 'HARS2', 'NOA1', 'GUF1

In [43]:
df = pandas.DataFrame(distanceMatrix)

df

Unnamed: 0,56160856,AAK1,AARS2,AASS,ABCB10,ABCB7,ABCF3,ABHD10,ACAD9,ACADM,...,ZNF507,ZNF598,ZNF622,ZNF638,ZNF697,ZNF787,ZNF830,ZWINT,ZYX,ZZZ3
56160512,-0.015655,-0.020762,-0.022708,-0.031831,-0.028780,-0.027491,-0.030291,-0.020125,-0.020846,-0.015889,...,0.024666,-0.054570,-0.033699,0.073136,-0.012532,0.087311,0.076583,-0.021280,0.003775,-0.026859
56160531,-0.011067,-0.086447,-0.019275,-0.004689,0.010660,0.238263,-0.001954,-0.061435,-0.047883,-0.048504,...,0.008848,-0.047359,-0.032180,-0.036339,-0.038256,0.028807,-0.067984,0.100703,-0.083606,0.027269
56160856,1.000000,0.110500,-0.017685,-0.024790,-0.022413,-0.031723,-0.023590,-0.015673,-0.016234,-0.012374,...,0.016982,-0.012354,-0.026244,-0.025988,-0.009759,-0.016277,-0.024091,-0.016573,0.699519,-0.020918
AAAS,-0.024937,-0.087532,-0.050126,0.007796,-0.002081,0.456438,-0.073135,-0.055001,-0.042159,-0.043424,...,-0.056583,-0.054813,0.033866,-0.085041,-0.022629,-0.052719,-0.074226,-0.040125,-0.101689,-0.054793
AAK1,0.110500,1.000000,-0.039770,-0.054362,-0.034099,-0.062671,0.042620,-0.035246,-0.036509,-0.027827,...,0.071567,0.419345,-0.025450,-0.053783,-0.021948,-0.036606,-0.046397,-0.027056,0.199705,-0.038515
AAR2,-0.012350,-0.023553,-0.016624,-0.008013,0.066443,0.018457,-0.016003,-0.015876,-0.014924,-0.012535,...,-0.014913,-0.049454,0.074512,-0.021084,-0.009886,-0.008996,-0.012208,-0.012263,-0.032756,0.000215
AARS2,-0.017685,-0.039770,1.000000,0.863452,0.796115,0.599567,-0.034219,0.855463,0.943497,0.889447,...,-0.044009,-0.062194,-0.028411,-0.007320,-0.014157,-0.023611,-0.034945,-0.024040,-0.058007,-0.030342
AASDH,-0.013856,-0.031161,-0.020099,-0.028175,0.039677,0.007754,-0.026811,-0.017813,-0.018451,-0.014063,...,0.034454,-0.059902,0.020199,0.113514,-0.011092,0.061050,0.073202,-0.018835,-0.045450,-0.023774
AASS,-0.024790,-0.054362,0.863452,1.000000,0.702467,0.591246,-0.047968,0.736564,0.844448,0.740291,...,-0.052519,-0.090542,0.065844,-0.003694,-0.019845,-0.022514,-0.033935,-0.033698,-0.079070,-0.015168
AATF,-0.027054,-0.060841,-0.039243,-0.055010,-0.046698,-0.047271,-0.052348,-0.034779,-0.036025,-0.027458,...,-0.064264,-0.104084,0.040697,0.445741,-0.021657,0.192122,0.097620,-0.036776,-0.088739,0.039934


Biological network mean and standard deviation

In [31]:
means = df.mean()
stds = df.std()
meancorr = sum(means)/len(means) #average corelation
stdCorr = sum(stds)/len(stds) #average corelation
print(meancorr)
print(stdCorr)

0.0072822421276182905
0.12065653955593474


In [36]:
with open("../files/correlation.tsv") as f:
    print(f.readline())
    print(f.readline()) 

ABCB7	ABCB7	1

ABCB7	DHX30	0.723889065649033	



In [44]:
%%time
pandas.read_pickle(open( "../files/intermediates/distanceMatrix.pkl", "rb" ))

UnpicklingError: pickle exhausted before end of frame