In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt

In [2]:
os.chdir("data/aws_predictions/")

In [4]:
info_data_path = "../model_training/raw_data/data.info"
with open(info_data_path, 'r') as f:
    info = f.read().splitlines()

info_list = [info[i].split(",") for i in range(len(info))]
info_df = pd.DataFrame(info_list[1:]) 
info_df.columns = info_list[0]

In [5]:
info_df["transcript_position"] = info_df["transcript_position"].astype(str).astype(int)
info_df["label"] = info_df["label"].astype(str).astype(int)
info_df.dtypes

gene_id                object
transcript_id          object
transcript_position     int64
label                   int64
dtype: object

In [6]:
info_label1 = info_df[info_df["label"] == 1]
info_label1.head()

Unnamed: 0,gene_id,transcript_id,transcript_position,label
17,ENSG00000004059,ENST00000000233,913,1
52,ENSG00000003056,ENST00000000412,2440,1
53,ENSG00000003056,ENST00000000412,2462,1
55,ENSG00000003056,ENST00000000412,2499,1
229,ENSG00000007520,ENST00000007390,1096,1


# A549

In [87]:
a549_files = sorted([file for file in os.listdir() if file.startswith("A549")])
a549 = [pd.read_csv(file) for file in a549_files]
print(a549_files)

['A549_rep5_run1_prediction.csv', 'A549_rep6_run1_prediction.csv']


In [88]:
## LEFT JOIN WITH INFO
a549 = [info_label1.merge(df, how = "left", left_on = ["transcript_id", "transcript_position"],
                                right_on = ["transcript", "position"])
                                for df in a549]
a549[0].head()

Unnamed: 0,gene_id,transcript_id,transcript_position,label,transcript,position,score
0,ENSG00000004059,ENST00000000233,913,1,ENST00000000233,913.0,0.064488
1,ENSG00000003056,ENST00000000412,2440,1,ENST00000000412,2440.0,0.608392
2,ENSG00000003056,ENST00000000412,2462,1,ENST00000000412,2462.0,0.077398
3,ENSG00000003056,ENST00000000412,2499,1,ENST00000000412,2499.0,0.422727
4,ENSG00000007520,ENST00000007390,1096,1,ENST00000007390,1096.0,0.677273


In [89]:
## DROP DUPLICATE COLUMNS
for df in a549:
    df.drop(["transcript", "position"], axis = 1, inplace = True)
    print(df.columns)

Index(['gene_id', 'transcript_id', 'transcript_position', 'label', 'score'], dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'label', 'score'], dtype='object')


In [90]:
## RENAME COLS
for df, ind in zip(a549, ["5", "6"]):
    df.rename(columns = {"label": f"rep{ind}_label", "score": f"rep{ind}_score"}, inplace = True)
    print(df.columns)

Index(['gene_id', 'transcript_id', 'transcript_position', 'rep5_label',
       'rep5_score'],
      dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'rep6_label',
       'rep6_score'],
      dtype='object')


In [91]:
## get similar transcript and position with label 1
a549_label1 = pd.concat(a549, axis = 1, join = "inner")

## drop duplicate columns
a549_label1 = a549_label1.loc[:, ~a549_label1.columns.duplicated()]

print(a549_label1.shape)
a549_label1.head()

(5475, 7)


Unnamed: 0,gene_id,transcript_id,transcript_position,rep5_label,rep5_score,rep6_label,rep6_score
0,ENSG00000004059,ENST00000000233,913,1,0.064488,1,0.168528
1,ENSG00000003056,ENST00000000412,2440,1,0.608392,1,0.7
2,ENSG00000003056,ENST00000000412,2462,1,0.077398,1,0.0367
3,ENSG00000003056,ENST00000000412,2499,1,0.422727,1,0.363636
4,ENSG00000007520,ENST00000007390,1096,1,0.677273,1,0.459091


# HCT116

In [92]:
hct116_files = sorted([file for file in os.listdir() if file.startswith("Hct116")])
hct116 = [pd.read_csv(file) for file in hct116_files]
print(hct116_files)

['Hct116_rep3_run1_prediction.csv', 'Hct116_rep3_run4_prediction.csv', 'Hct116_rep4_run3_prediction.csv']


In [93]:
## LEFT JOIN WITH INFO
hct116 = [info_label1.merge(df, how = "left", left_on = ["transcript_id", "transcript_position"],
                                right_on = ["transcript", "position"])
                                for df in hct116]

In [94]:
## DROP DUPLICATE COLUMNS
for df in hct116:
    df.drop(["transcript", "position"], axis = 1, inplace = True)
    print(df.columns)

Index(['gene_id', 'transcript_id', 'transcript_position', 'label', 'score'], dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'label', 'score'], dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'label', 'score'], dtype='object')


In [95]:
## RENAME COLS
for df, ind in zip(hct116, ["3_run1", "3_run4", "4_run3"]):
    df.rename(columns = {"label": f"rep{ind}_label", "score": f"rep{ind}_score"}, inplace = True)
    print(df.columns)

Index(['gene_id', 'transcript_id', 'transcript_position', 'rep3_run1_label',
       'rep3_run1_score'],
      dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'rep3_run4_label',
       'rep3_run4_score'],
      dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'rep4_run3_label',
       'rep4_run3_score'],
      dtype='object')


In [96]:
## get similar transcript and position with label 1
hct116_label1 = pd.concat(hct116, axis = 1, join = "inner")

## drop duplicate columns
hct116_label1 = hct116_label1.loc[:, ~hct116_label1.columns.duplicated()]

print(hct116_label1.shape)
hct116_label1.head()

(5475, 9)


Unnamed: 0,gene_id,transcript_id,transcript_position,rep3_run1_label,rep3_run1_score,rep3_run4_label,rep3_run4_score,rep4_run3_label,rep4_run3_score
0,ENSG00000004059,ENST00000000233,913,1,0.5,1,0.391058,1,0.345604
1,ENSG00000003056,ENST00000000412,2440,1,0.818182,1,0.668182,1,0.672727
2,ENSG00000003056,ENST00000000412,2462,1,0.383295,1,0.054696,1,0.01833
3,ENSG00000003056,ENST00000000412,2499,1,0.281818,1,0.268182,1,0.077273
4,ENSG00000007520,ENST00000007390,1096,1,0.822727,1,0.459091,1,0.495455


# HEPG2

In [97]:
hepg2_files = sorted([file for file in os.listdir() if file.startswith("hepG2")])
hepg2 = [pd.read_csv(file) for file in hepg2_files]
print(hepg2_files)

['hepG2_rep5_run2_prediction.csv', 'hepG2_rep6_run1_prediction.csv']


In [98]:
## LEFT JOIN WITH INFO
hepg2 = [info_label1.merge(df, how = "left", left_on = ["transcript_id", "transcript_position"],
                                right_on = ["transcript", "position"])
                                for df in hepg2]

In [99]:
## DROP DUPLICATE COLUMNS
for df in hepg2:
    df.drop(["transcript", "position"], axis = 1, inplace = True)
    print(df.columns)

Index(['gene_id', 'transcript_id', 'transcript_position', 'label', 'score'], dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'label', 'score'], dtype='object')


In [100]:
## RENAME COLS
for df, ind in zip(hepg2, ["5", "6"]):
    df.rename(columns = {"label": f"rep{ind}_label", "score": f"rep{ind}_score"}, inplace = True)
    print(df.columns)

Index(['gene_id', 'transcript_id', 'transcript_position', 'rep5_label',
       'rep5_score'],
      dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'rep6_label',
       'rep6_score'],
      dtype='object')


In [101]:
## get similar transcript and position with label 1
hepg2_label1 = pd.concat(hepg2, axis = 1, join = "inner")

## drop duplicate columns
hepg2_label1 = hepg2_label1.loc[:, ~hepg2_label1.columns.duplicated()]

print(hepg2_label1.shape)
hepg2_label1.head()

(5475, 7)


Unnamed: 0,gene_id,transcript_id,transcript_position,rep5_label,rep5_score,rep6_label,rep6_score
0,ENSG00000004059,ENST00000000233,913,1,0.168331,1,0.165614
1,ENSG00000003056,ENST00000000412,2440,1,0.559091,1,0.522727
2,ENSG00000003056,ENST00000000412,2462,1,0.073044,1,0.004742
3,ENSG00000003056,ENST00000000412,2499,1,0.281818,1,0.231818
4,ENSG00000007520,ENST00000007390,1096,1,0.327273,1,0.404545


# K562

In [102]:
k562_files = sorted([file for file in os.listdir() if file.startswith("k562")])
k562 = [pd.read_csv(file) for file in k562_files]
print(k562_files)

['k562_rep4_run1_prediction.csv', 'k562_rep5_run1_prediction.csv', 'k562_rep6_run1_prediction.csv']


In [103]:
## LEFT JOIN WITH INFO
k562 = [info_label1.merge(df, how = "left", left_on = ["transcript_id", "transcript_position"],
                                right_on = ["transcript", "position"])
                                for df in k562]

In [104]:
## DROP DUPLICATE COLUMNS
for df in k562:
    df.drop(["transcript", "position"], axis = 1, inplace = True)
    print(df.columns)

Index(['gene_id', 'transcript_id', 'transcript_position', 'label', 'score'], dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'label', 'score'], dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'label', 'score'], dtype='object')


In [105]:
## RENAME COLS
for df, ind in zip(k562, ["4", "5", "6"]):
    df.rename(columns = {"label": f"rep{ind}_label", "score": f"rep{ind}_score"}, inplace = True)
    print(df.columns)

Index(['gene_id', 'transcript_id', 'transcript_position', 'rep4_label',
       'rep4_score'],
      dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'rep5_label',
       'rep5_score'],
      dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'rep6_label',
       'rep6_score'],
      dtype='object')


In [106]:
## get similar transcript and position with label 1
k562_label1 = pd.concat(k562, axis = 1, join = "inner")

## drop duplicate columns
k562_label1 = k562_label1.loc[:, ~k562_label1.columns.duplicated()]

print(k562_label1.shape)
k562_label1.head()

(5475, 9)


Unnamed: 0,gene_id,transcript_id,transcript_position,rep4_label,rep4_score,rep5_label,rep5_score,rep6_label,rep6_score
0,ENSG00000004059,ENST00000000233,913,1,0.222738,1,0.164011,1,0.19568
1,ENSG00000003056,ENST00000000412,2440,1,0.495455,1,0.613636,1,0.586364
2,ENSG00000003056,ENST00000000412,2462,1,0.05,1,0.063636,1,0.050192
3,ENSG00000003056,ENST00000000412,2499,1,0.372727,1,0.336364,1,0.272727
4,ENSG00000007520,ENST00000007390,1096,1,0.418182,1,0.4,1,0.586364


# MCF7

In [107]:
mcf7_files = sorted([file for file in os.listdir() if file.startswith("mcf7")])
mcf7 = [pd.read_csv(file) for file in mcf7_files]
print(mcf7_files)

['mcf7_rep3_run1_prediction.csv', 'mcf7_rep4_run1_prediction.csv']


In [108]:
## LEFT JOIN WITH INFO
mcf7 = [info_label1.merge(df, how = "left", left_on = ["transcript_id", "transcript_position"],
                                right_on = ["transcript", "position"])
                                for df in mcf7]

In [109]:
## DROP DUPLICATE COLUMNS
for df in mcf7:
    df.drop(["transcript", "position"], axis = 1, inplace = True)
    print(df.columns)

Index(['gene_id', 'transcript_id', 'transcript_position', 'label', 'score'], dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'label', 'score'], dtype='object')


In [110]:
## RENAME COLS
for df, ind in zip(mcf7, ["3", "4"]):
    df.rename(columns = {"label": f"rep{ind}_label", "score": f"rep{ind}_score"}, inplace = True)
    print(df.columns)

Index(['gene_id', 'transcript_id', 'transcript_position', 'rep3_label',
       'rep3_score'],
      dtype='object')
Index(['gene_id', 'transcript_id', 'transcript_position', 'rep4_label',
       'rep4_score'],
      dtype='object')


In [111]:
## get similar transcript and position with label 1
mcf7_label1 = pd.concat(mcf7, axis = 1, join = "inner")

## drop duplicate columns
mcf7_label1 = mcf7_label1.loc[:, ~mcf7_label1.columns.duplicated()]

print(mcf7_label1.shape)
mcf7_label1.head()

(5475, 7)


Unnamed: 0,gene_id,transcript_id,transcript_position,rep3_label,rep3_score,rep4_label,rep4_score
0,ENSG00000004059,ENST00000000233,913,1,0.341058,1,0.341058
1,ENSG00000003056,ENST00000000412,2440,1,0.559091,1,0.559091
2,ENSG00000003056,ENST00000000412,2462,1,0.087503,1,0.087503
3,ENSG00000003056,ENST00000000412,2499,1,0.159091,1,0.159091
4,ENSG00000007520,ENST00000007390,1096,1,0.390909,1,0.390909


# Concat all cell lines to find similar

In [112]:
all_lines = [a549_label1, hct116_label1, hepg2_label1, k562_label1, mcf7_label1]
all_lines_label1 = pd.concat(all_lines, axis = 1, join = "inner")

## drop duplicate columns
all_lines_label1 = all_lines_label1.loc[:, ~all_lines_label1.columns.duplicated()]

print(all_lines_label1.shape)
all_lines_label1.head()

(5475, 17)


Unnamed: 0,gene_id,transcript_id,transcript_position,rep5_label,rep5_score,rep6_label,rep6_score,rep3_run1_label,rep3_run1_score,rep3_run4_label,rep3_run4_score,rep4_run3_label,rep4_run3_score,rep4_label,rep4_score,rep3_label,rep3_score
0,ENSG00000004059,ENST00000000233,913,1,0.064488,1,0.168528,1,0.5,1,0.391058,1,0.345604,1,0.222738,1,0.341058
1,ENSG00000003056,ENST00000000412,2440,1,0.608392,1,0.7,1,0.818182,1,0.668182,1,0.672727,1,0.495455,1,0.559091
2,ENSG00000003056,ENST00000000412,2462,1,0.077398,1,0.0367,1,0.383295,1,0.054696,1,0.01833,1,0.05,1,0.087503
3,ENSG00000003056,ENST00000000412,2499,1,0.422727,1,0.363636,1,0.281818,1,0.268182,1,0.077273,1,0.372727,1,0.159091
4,ENSG00000007520,ENST00000007390,1096,1,0.677273,1,0.459091,1,0.822727,1,0.459091,1,0.495455,1,0.418182,1,0.390909


In [115]:
a549_label1["rep6_label"].value_counts()

1    5475
Name: rep6_label, dtype: int64