# Random Forest for Prediction

In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [3]:
# Feature columns passed to the classifier. The order is kept exactly as
# originally listed, because the columns are handed to the model in this order.
# (The "rfe" prefix presumably refers to recursive feature elimination -- TODO confirm.)
rfe_features = [
    # --- columns tagged "-1" ---
    "std_-1_25", "std_-1_50", "std_-1_75", "std_-1_mean", "std_-1_min",
    "mean_-1_25", "mean_-1_50", "mean_-1_75", "mean_-1_mean", "mean_-1_min",
    # --- columns tagged "0" ---
    "dwelling_time_0_50", "dwelling_time_0_mean",
    "std_0_25", "std_0_50", "std_0_75", "std_0_mean", "std_0_min", "std_0_max",
    "mean_0_25", "mean_0_50", "mean_0_75", "mean_0_mean", "mean_0_min", "mean_0_max",
    # --- columns tagged "+1" ---
    "dwelling_time_+1_mean",
    "std_+1_25", "std_+1_50",
    "mean_+1_25", "mean_+1_50", "mean_+1_75", "mean_+1_mean", "mean_+1_min", "mean_+1_max",
    # --- remaining columns ---
    "relative_position", "position_1_G", "position_5_T",
]

In [2]:
# Restore the previously trained random forest from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
import joblib

rf_model_path = '../modelling/rf.pkl'  # relative to the notebook's working directory
pickled_model = joblib.load(rf_model_path)

# Prediction

In [6]:
def prediction(csv_fpath, model=None, features=None):
    """Score every row of a feature CSV with a fitted classifier.

    Parameters
    ----------
    csv_fpath : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``. The table must contain the
        ``transcript`` and ``position`` columns plus every feature column.
    model : object exposing ``predict_proba``, optional
        Classifier used for scoring. Defaults to the notebook-level
        ``pickled_model`` so existing ``prediction(path)`` calls keep working.
    features : sequence of str, optional
        Feature columns, in the order they are passed to the model. Defaults
        to the notebook-level ``rfe_features``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``transcript``, ``position``
        and ``score``, where ``score`` is column 1 of ``predict_proba``.

    Raises
    ------
    KeyError
        If an identifier or feature column is missing from the CSV.
    """
    # Fall back to the notebook globals only when the caller passes nothing,
    # so the function can also be used (and tested) without hidden state.
    if model is None:
        model = pickled_model
    if features is None:
        features = rfe_features
    features = list(features)

    ## read csv file
    data = pd.read_csv(csv_fpath)
    print(data.shape)

    ## fail early, naming the missing columns, instead of a bare pandas KeyError
    id_columns = ["transcript", "position"]
    missing = [col for col in id_columns + features if col not in data.columns]
    if missing:
        raise KeyError(f"{csv_fpath}: missing required column(s): {missing}")

    ## predict using rfc; keep only the second probability column
    data_pred = model.predict_proba(data[features])[:, 1]
    print(len(data_pred))

    ## attach the scores to the identifier columns; assign() places the array
    ## positionally, so the result does not depend on index alignment
    data_pred_df = data[id_columns].assign(score=data_pred)
    print(f"Prediction file is of shape: {data_pred_df.shape}")

    return data_pred_df

In [14]:
# Score the A549 rep6 / run1 feature table and preview the first rows.
A549_rep6_run1_path = "A549_rep6_run1.csv"  # resolved against the notebook's working directory
A549_rep6_run1_pred = prediction(csv_fpath=A549_rep6_run1_path)
A549_rep6_run1_pred.head(n=5)


(120346, 76)
120346
Prediction file is of shape: (120346, 3)


Unnamed: 0,transcript,position,score
0,ENST00000373020,1006,0.031818
1,ENST00000373020,1013,0.009091
2,ENST00000373020,1149,0.0
3,ENST00000373020,512,0.036364
4,ENST00000373020,689,0.027273


In [15]:
# Score the A549 rep5 / run1 feature table and preview the first rows.
A549_rep5_run1_path = "A549_rep5_run1.csv"  # resolved against the notebook's working directory
A549_rep5_run1_pred = prediction(csv_fpath=A549_rep5_run1_path)
A549_rep5_run1_pred.head(n=5)


(119517, 76)
119517
Prediction file is of shape: (119517, 3)


Unnamed: 0,transcript,position,score
0,ENST00000373020,1006,0.004545
1,ENST00000373020,1013,0.009091
2,ENST00000373020,1149,0.004545
3,ENST00000373020,512,0.013636
4,ENST00000373020,689,0.004545


In [16]:
# Write both A549 prediction tables to disk without the DataFrame index.
A549_rep6_run1_pred.to_csv(
    path_or_buf="../data/aws_predictions/A549_rep6_run1_prediction.csv",
    index=False,
)
A549_rep5_run1_pred.to_csv(
    path_or_buf="../data/aws_predictions/A549_rep5_run1_prediction.csv",
    index=False,
)

In [None]:
# Score the K562 rep4 / run1 feature table and preview the first rows.
# NOTE(review): this cell has no recorded execution, and no export of
# k562_rep4_run1_pred is visible in this notebook -- confirm whether the
# sample is still needed.
# NOTE(review): absolute, user-specific path; it will not resolve on another machine.
k562_rep4_run1_path = '/Users/claudia/Downloads/K562_rep4_run1.csv'
k562_rep4_run1_pred = prediction(csv_fpath=k562_rep4_run1_path)
k562_rep4_run1_pred.head(n=5)

In [7]:
# Score the K562 rep5 / run1 feature table and preview the first rows.
# NOTE(review): absolute, user-specific path; it will not resolve on another machine.
k562_rep5_run1_path = '/Users/claudia/Downloads/K562_rep5_run1.csv'
k562_rep5_run1_pred = prediction(csv_fpath=k562_rep5_run1_path)
k562_rep5_run1_pred.head(n=5)

(116519, 76)
116519
Prediction file is of shape: (116519, 3)


Unnamed: 0,transcript,position,score
0,ENST00000371582,1030,0.004545
1,ENST00000371582,105,0.013636
2,ENST00000371582,1123,0.018182
3,ENST00000371582,147,0.072751
4,ENST00000371582,242,0.045583


In [10]:
# Save the K562 rep5 / run1 predictions.
# index=False keeps the file to the transcript/position/score columns, matching
# the A549, Hct116 and HepG2 exports; without it pandas writes the row index as
# an extra unnamed first column.
k562_rep5_run1_pred.to_csv("../data/aws_predictions/k562_rep5_run1_prediction.csv", index = False)

In [11]:
# Score the K562 rep6 / run1 feature table and preview the first rows.
# NOTE(review): absolute, user-specific path; it will not resolve on another machine.
k562_rep6_run1_path = '/Users/claudia/Downloads/K562_rep6_run1.csv'
k562_rep6_run1_pred = prediction(csv_fpath=k562_rep6_run1_path)
k562_rep6_run1_pred.head(n=5)

(99502, 76)
99502
Prediction file is of shape: (99502, 3)


Unnamed: 0,transcript,position,score
0,ENST00000373020,1013,0.177273
1,ENST00000373020,1149,0.15
2,ENST00000373020,512,0.322727
3,ENST00000373020,689,0.131818
4,ENST00000373020,823,0.009091


In [12]:
# Save the K562 rep6 / run1 predictions.
# index=False keeps the file to the transcript/position/score columns, matching
# the A549, Hct116 and HepG2 exports; without it pandas writes the row index as
# an extra unnamed first column.
k562_rep6_run1_pred.to_csv("../data/aws_predictions/k562_rep6_run1_prediction.csv", index = False)

In [13]:
# Score the MCF7 rep3 / run1 feature table and preview the first rows.
# NOTE(review): absolute, user-specific path; it will not resolve on another machine.
mcf7_rep3_run1_path = '/Users/claudia/Downloads/MCF7_rep3_run1.csv'
mcf7_rep3_run1_pred = prediction(csv_fpath=mcf7_rep3_run1_path)
mcf7_rep3_run1_pred.head(n=5)

(119481, 76)
119481
Prediction file is of shape: (119481, 3)


Unnamed: 0,transcript,position,score
0,ENST00000373020,1006,0.009091
1,ENST00000373020,1013,0.009091
2,ENST00000373020,1149,0.0
3,ENST00000373020,512,0.018182
4,ENST00000373020,689,0.009091


In [14]:
# Save the MCF7 rep3 / run1 predictions.
# index=False keeps the file to the transcript/position/score columns, matching
# the A549, Hct116 and HepG2 exports; without it pandas writes the row index as
# an extra unnamed first column.
mcf7_rep3_run1_pred.to_csv("../data/aws_predictions/mcf7_rep3_run1_prediction.csv", index = False)

In [15]:
# Score the MCF7 rep4 / run1 feature table and preview the first rows.
# NOTE(review): the recorded output of this cell (row count and the first five
# scores) is identical to the MCF7 rep3 cell's output -- verify that
# MCF7_rep4_run1.csv is not a copy of the rep3 file.
# NOTE(review): absolute, user-specific path; it will not resolve on another machine.
mcf7_rep4_run1_path = '/Users/claudia/Downloads/MCF7_rep4_run1.csv'
mcf7_rep4_run1_pred = prediction(csv_fpath=mcf7_rep4_run1_path)
mcf7_rep4_run1_pred.head(n=5)

(119481, 76)
119481
Prediction file is of shape: (119481, 3)


Unnamed: 0,transcript,position,score
0,ENST00000373020,1006,0.009091
1,ENST00000373020,1013,0.009091
2,ENST00000373020,1149,0.0
3,ENST00000373020,512,0.018182
4,ENST00000373020,689,0.009091


In [16]:
# Save the MCF7 rep4 / run1 predictions.
# index=False keeps the file to the transcript/position/score columns, matching
# the A549, Hct116 and HepG2 exports; without it pandas writes the row index as
# an extra unnamed first column.
mcf7_rep4_run1_pred.to_csv("../data/aws_predictions/mcf7_rep4_run1_prediction.csv", index = False)

In [13]:
# Score the three Hct116 feature tables and write each result to disk.
hct116_rep3_run1_path = "Hct116_rep3_run1.csv"
hct116_rep3_run4_path = "Hct116_rep3_run4.csv"
hct116_rep4_run3_path = "Hct116_rep4_run3.csv"

# Fixed: the first result used to be bound to the mistyped name
# `hepG2_rep5_run2_predhct116_rep3_run1_pred`, so the export below only worked
# when `hct116_rep3_run1_pred` was left over from an earlier kernel session.
hct116_rep3_run1_pred = prediction(hct116_rep3_run1_path)
hct116_rep3_run4_pred = prediction(hct116_rep3_run4_path)
hct116_rep4_run3_pred = prediction(hct116_rep4_run3_path)

# NOTE(review): the third output filename starts with a lowercase "hct116",
# unlike the first two -- confirm which spelling downstream code expects.
hct116_rep3_run1_pred.to_csv("../data/aws_predictions/Hct116_rep3_run1_prediction.csv", index = False)
hct116_rep3_run4_pred.to_csv("../data/aws_predictions/Hct116_rep3_run4_prediction.csv", index = False)
hct116_rep4_run3_pred.to_csv("../data/aws_predictions/hct116_rep4_run3_prediction.csv", index = False)

(121838, 76)
121838
Prediction file is of shape: (121838, 3)
(121838, 76)
121838
Prediction file is of shape: (121838, 3)
(121838, 76)
121838
Prediction file is of shape: (121838, 3)


In [17]:
# Score the two HepG2 feature tables and write each result to disk.
hepG2_rep5_run2_path = "HepG2_rep5_run2.csv"
hepG2_rep6_run1_path = "HepG2_rep6_run1.csv"

hepG2_rep5_run2_pred = prediction(hepG2_rep5_run2_path)
hepG2_rep6_run1_pred = prediction(hepG2_rep6_run1_path)

# Fixed: the rep5 filename used to contain a stray double underscore
# ("run2__prediction.csv"); it now follows the "<sample>_prediction.csv"
# pattern used by every other export in this notebook.
# NOTE(review): rename or regenerate any existing file saved under the old name.
hepG2_rep5_run2_pred.to_csv("../data/aws_predictions/hepG2_rep5_run2_prediction.csv", index = False)
hepG2_rep6_run1_pred.to_csv("../data/aws_predictions/hepG2_rep6_run1_prediction.csv", index = False)


(119312, 76)
119312
Prediction file is of shape: (119312, 3)
(119283, 76)
119283
Prediction file is of shape: (119283, 3)
