In [1]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

<h1 align="center"> Data Compilation </h1>
<h3 align="center"> Christine Zhang </h3>

---

## Mutation Data Prep

---

In [3]:
import numpy as np
import pandas as pd

In [4]:
mut_pd = pd.read_csv('../Seashore-ludlow/v22.anno.ccl_mut_features.txt', sep ="\t")
mut_pd["entrez_id"] = mut_pd["hugo_gene_symbol"]
mut_pd.head(n=5)

Unnamed: 0,mutation_call,hugo_gene_symbol,cell_line_feature,index_ccl,entrez_id
0,CCLE Hybrid Capture Mutations,AAK1,AAK1_p._ANY_CODING_MUTATION,2;3;4;6;7;8;10;12;13;15;16;18;19;20;22;23;24;2...,AAK1
1,CCLE Hybrid Capture Mutations,AAK1,AAK1_p.Q541,2;3;4;6;7;8;10;12;13;15;16;18;19;20;22;23;24;2...,AAK1
2,CCLE Hybrid Capture Mutations,AAK1,AAK1_p.Q541indel2>Q,2;3;4;6;7;8;10;12;13;15;16;18;19;20;22;23;24;2...,AAK1
3,CCLE Hybrid Capture Mutations,AATK,AATK_p._ANY_CODING_MUTATION,38;60;72;114;126;145;149;157;183;207;226;269;3...,AATK
4,CCLE Hybrid Capture Mutations,ABCA3,ABCA3_p._ANY_CODING_MUTATION,8;36;52;67;113;126;127;128;130;131;132;133;135...,ABCA3


### Hugo to Entrez Conversion

---

In [7]:
entrez_pd = pd.read_csv('../EntrezID.txt', skiprows = [0], sep ="\t", names = ["Hugo", "Symbol", "Entrez_ID"]) 
entrez_pd.head(n=10)

Unnamed: 0,Hugo,Symbol,Entrez_ID
0,HGNC:5,A1BG,1.0
1,HGNC:37133,A1BG-AS1,503538.0
2,HGNC:24086,A1CF,29974.0
3,HGNC:6,A1S9T~withdrawn,
4,HGNC:7,A2M,2.0
5,HGNC:27057,A2M-AS1,144571.0
6,HGNC:23336,A2ML1,144568.0
7,HGNC:41022,A2ML1-AS1,100874108.0
8,HGNC:41523,A2ML1-AS2,106478979.0
9,HGNC:8,A2MP1,3.0


In [45]:
for symbol in mut_pd["hugo_gene_symbol"]:
    entrez_index = entrez_pd.index[entrez_pd['Symbol'] == symbol].tolist()
    if entrez_index != []:
        entrez_ID = entrez_pd.get_value(entrez_index[0], "Entrez_ID")
        mut_pd["entrez_id"].replace(symbol, entrez_ID, inplace=True)
    
    # if no entrez_id, drop row
    else:
        mut_pd.drop(mut_pd.index[mut_pd["hugo_gene_symbol"] == symbol], inplace = True)
mut_pd.head(n=5)

Unnamed: 0,mutation_call,hugo_gene_symbol,cell_line_feature,index_ccl,entrez_id
2,CCLE Hybrid Capture Mutations,AAK1,AAK1_p.Q541indel2>Q,2;3;4;6;7;8;10;12;13;15;16;18;19;20;22;23;24;2...,22848.0
3,CCLE Hybrid Capture Mutations,AATK,AATK_p._ANY_CODING_MUTATION,38;60;72;114;126;145;149;157;183;207;226;269;3...,9625.0
4,CCLE Hybrid Capture Mutations,ABCA3,ABCA3_p._ANY_CODING_MUTATION,8;36;52;67;113;126;127;128;130;131;132;133;135...,21.0
5,CCLE Hybrid Capture Mutations,ABCA3,ABCA3_p.L654,36;113;171;209,21.0
6,CCLE Hybrid Capture Mutations,ABCA3,ABCA3_p.L654V,36;113;171;209,21.0


In [48]:
mut_filtered_pd = mut_pd[mut_pd["cell_line_feature"].str.contains("ANY_CODING_MUTATION")]
mut = mut_filtered_pd.as_matrix()
mut_filtered_pd.head(n=5)

Unnamed: 0,mutation_call,hugo_gene_symbol,cell_line_feature,index_ccl,entrez_id
3,CCLE Hybrid Capture Mutations,AATK,AATK_p._ANY_CODING_MUTATION,38;60;72;114;126;145;149;157;183;207;226;269;3...,9625.0
4,CCLE Hybrid Capture Mutations,ABCA3,ABCA3_p._ANY_CODING_MUTATION,8;36;52;67;113;126;127;128;130;131;132;133;135...,21.0
9,CCLE Hybrid Capture Mutations,ABCC3,ABCC3_p._ANY_CODING_MUTATION,17;66;128;131;133;135;157;168;182;193;198;202;...,8714.0
10,CCLE Hybrid Capture Mutations,ABCC4,ABCC4_p._ANY_CODING_MUTATION,4;13;103;126;128;132;135;138;145;182;183;198;2...,10257.0
13,CCLE Hybrid Capture Mutations,ABI1,ABI1_p._ANY_CODING_MUTATION,31;56;89;132;135;141;150;203;215;219;220;230;4...,10006.0


In [49]:
len(mut_filtered_pd["hugo_gene_symbol"].drop_duplicates())

1529

### Index CCL Conversion

---

In [50]:
index_pd = pd.read_csv('../Seashore-ludlow/v22.meta.per_cell_line.txt', sep ="\t")
correlation_pd = index_pd[["index_ccl", "master_ccl_id"]]
correlation = correlation_pd.as_matrix()
correlation_pd.head(n=10)

Unnamed: 0,index_ccl,master_ccl_id
0,1,144
1,2,475
2,3,528
3,4,571
4,5,572
5,6,678
6,7,720
7,8,852
8,9,1030
9,10,1051


In [71]:
# reformat array by ccl
mut_by_ccl = []
for row in range(0, len(mut)):
    for each_mut in mut[row][-2].split(";"):
        mut_by_ccl.append([int(each_mut), mut[row][1], mut[row][-1]])
mut_by_ccl_sorted = sorted(mut_by_ccl, key = lambda x: int(x[2]))
mut_sorted = np.array(mut_by_ccl_sorted)

# covert index ccl to master ccl id
corr_lst = list(correlation[:,0])
for mut_index, row in enumerate(mut_sorted):
    if int(row[0]) in correlation[:,0]:
        corr_index = corr_lst.index(int(row[0]))
        mut_sorted[mut_index, 0] = correlation[corr_index, 1]
print mut_sorted

[['852' 'ABCA3' '21.0']
 ['1091' 'ABCA3' '21.0']
 ['305' 'ABCA3' '21.0']
 ..., 
 ['473' 'CDK11A' '728642.0']
 ['618' 'CDK11A' '728642.0']
 ['70' 'CDK11A' '728642.0']]


## Copy number  prep

---

In [202]:
copynum_pd = pd.read_csv('../Rees/v21.data.cnv_avg_log2.txt', skiprows = [0], sep ="\t", names = ["master_ccl_id", "entrez_id", "copy_num"])
copynum = copynum_pd.as_matrix()
copynum_pd.head(n=10)

Unnamed: 0,master_ccl_id,entrez_id,copy_num
0,1,1,0.0661
1,1,503538,0.0661
2,1,29974,-0.008
3,1,2,-0.0233
4,1,144571,-0.0233
5,1,144568,-0.0233
6,1,3,-0.0233
7,1,53947,0.0229
8,1,51146,-0.0459
9,1,100506677,-0.0143


In [203]:
master_ccl = copynum_pd["entrez_id"].drop_duplicates()
len(master_ccl)

23174

## AUC prep

---

In [54]:
auc_pd = pd.read_csv('dataset1.csv', skiprows = [0,1], sep =",", 
                          names = ["Cclid", "Cellline", "Sites", "Histology", "Subhistology", "Culture", "SNP", "AUC"])
auc = auc_pd.as_matrix()
auc_pd.head(n=10)

Unnamed: 0,Cclid,Cellline,Sites,Histology,Subhistology,Culture,SNP,AUC
0,3,5637,urinary_tract,carcinoma,NS,RPMI001,SNP-matched-reference,2.473
1,7,22RV1,prostate,carcinoma,NS,RPMI001,SNP-matched-reference,2.142
2,10,42MGBA,central_nervous_system,glioma,astrocytoma_Grade_IV,RPMIMEM001,SNP-matched-reference,3.38
3,15,647V,urinary_tract,carcinoma,transitional_cell_carcinoma,DMEM011,SNP-matched-reference,3.374
4,16,769P,kidney,carcinoma,clear_cell_renal_cell_carcinoma,RPMI001,SNP-matched-reference,3.123
5,19,786O,kidney,carcinoma,clear_cell_renal_cell_carcinoma,RPMI001,SNP-matched-reference,3.779
6,21,8505C,thyroid,carcinoma,anaplastic_carcinoma,EMEM001,SNP-matched-reference,3.471
7,22,8MGBA,central_nervous_system,glioma,astrocytoma_Grade_IV,EMEM005,SNP-matched-reference,2.625
8,23,A101D,skin,malignant_melanoma,NS,DMEM001,SNP-matched-reference,4.677
9,25,A172,central_nervous_system,glioma,astrocytoma_Grade_IV,DMEM001,SNP-matched-reference,2.751


## Dataset Compilation

---

In [55]:
def intersect (first, second, combined, toggle, type_var):
    if toggle == 1:
        first = list(set(first[:,0]))
        second = list(set(second[:,0]))
    for row in first:
        if type_var == "str":
            if str(row) in second:
                combined.append(row)
        if type_var == "float":
            if float(row) in second:
                combined.append(row)
        if type_var == "int":
            if int(row) in second:
                combined.append(row)
    return combined, len(combined)

In [136]:
auc_mut, mut_copynum, auc_mut_copynum = [], [], []

auc_mut_lst, auc_mut_len = intersect (auc, mut_sorted, auc_mut, 1, "str")
mut_copynum_lst, mut_copynum_len = intersect (auc, copynum, mut_copynum, 1, "float")
all_overlap, all_len = intersect (auc_mut_lst, mut_copynum_lst, auc_mut_copynum, 0, "int")

print "Total Number of Master CCL Ids: " + str(all_len)

Total Number of Master CCL Ids: 472


In [145]:
all_sorted = np.sort(all_overlap)
compiled_pd = pd.DataFrame([], [], all_sorted)
compiled_pd['Entrez_id'] = []
compiled_pd

Unnamed: 0,3,7,10,15,16,19,21,22,23,25,...,155493,155498,155502,155505,155510,155511,155513,155518,155520,Entrez_id


### Add Mutation

---

In [153]:
entrez_list = list(set(mut_sorted[:,2]))
for id_index, each_id in enumerate(entrez_list):
    val = [int(i) for i,j,k in mut_sorted if k == each_id]
    new_row = np.sort(all_overlap)
    for index, entry in enumerate(new_row):
        if entry in val:
            new_row[index] = 1
        else: 
            new_row[index] = 0
    new_row = np.append(new_row, float(each_id))
    compiled_pd.loc[id_index] = new_row

compiled_pd.head(n=10)

Unnamed: 0,3,7,10,15,16,19,21,22,23,25,...,155493,155498,155502,155505,155510,155511,155513,155518,155520,Entrez_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,890.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4478.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2317.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6790.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,120892.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4342.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4739.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6498.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8313.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10298.0


### Insert copy number

---

In [205]:
copynum_pd.sort_values("entrez_id")

Unnamed: 0,master_ccl_id,entrez_id,copy_num
0,1,1,0.0661
9084208,635,1,-0.0375
278088,23,1,0.1655
1459962,111,1,0.6705
15920538,1195,1,0.0250
17287804,128001,1,-0.3026
11540652,930,1,-0.4521
834264,57,1,0.0015
6488720,480,1,0.2883
5816674,441,1,-0.2309


### Insert cclid

---

In [190]:
for cclid in auc_pd["Cclid"]:
    if cclid not in all_sorted:
        auc_pd.drop(auc_pd.index[auc_pd["Cclid"] == cclid], inplace = True)
auc_lst = list(auc_pd["AUC"])
auc_lst.insert(len(auc_lst), 0)

In [194]:
compiled_pd.loc[len(compiled_pd)] = auc_lst
compiled_pd.tail(n=10)

Unnamed: 0,3,7,10,15,16,19,21,22,23,25,...,155493,155498,155502,155505,155510,155511,155513,155518,155520,Entrez_id
1521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2885.0
1522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2161.0
1523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,27255.0
1524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,23072.0
1525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6725.0
1526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9149.0
1527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6241.0
1528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8301.0
2000,2.473,2.142,3.38,3.374,3.123,3.779,3.471,2.625,4.677,2.751,...,3.071,5.351,3.929,3.912,4.913,3.755,1.712,4.61,5.46,0.0
1530,2.473,2.142,3.38,3.374,3.123,3.779,3.471,2.625,4.677,2.751,...,3.071,5.351,3.929,3.912,4.913,3.755,1.712,4.61,5.46,0.0
