# Random Forest for Prediction

In [3]:
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [4]:
## Training set: feature table and label table, both stored as parquet.
# NOTE(review): these are absolute paths on one machine; consider a
# configurable data directory so the notebook runs elsewhere.
X_train_path = "/Users/claudia/DSA4262-ACMXZ/data/raw_data/X_raw_enc.parquet"
y_train_path = "/Users/claudia/DSA4262-ACMXZ/data/raw_data/y_raw.parquet"

X_train = pd.read_parquet(X_train_path)

### Labels are read as a DataFrame, then flattened to a 1-D array and cast to int.
y_train = pd.read_parquet(y_train_path).values.ravel().astype(int)

In [5]:
# Feature columns used to fit the model and to score new data.
# The ORDER is significant: the same list selects columns at fit time and at
# prediction time, so do not re-sort it.
# NOTE(review): presumably the output of recursive feature elimination (RFE),
# judging by the variable name — confirm against the selection notebook.
rfe_features = [
    # columns tagged "-1"
    "std_-1_25", "std_-1_50", "std_-1_75", "std_-1_mean", "std_-1_min",
    "mean_-1_25", "mean_-1_50", "mean_-1_75", "mean_-1_mean", "mean_-1_min",
    # columns tagged "0"
    "dwelling_time_0_50", "dwelling_time_0_mean",
    "std_0_25", "std_0_50", "std_0_75", "std_0_mean", "std_0_min", "std_0_max",
    "mean_0_25", "mean_0_50", "mean_0_75", "mean_0_mean", "mean_0_min", "mean_0_max",
    # columns tagged "+1"
    "dwelling_time_+1_mean",
    "std_+1_25", "std_+1_50",
    "mean_+1_25", "mean_+1_50", "mean_+1_75", "mean_+1_mean", "mean_+1_min", "mean_+1_max",
    # remaining columns
    "relative_position", "position_1_G", "position_5_T",
]

In [6]:
# Random forest with explicitly pinned hyperparameters, one per line for easy scanning.
rfc = RandomForestClassifier(
    n_estimators=220,
    max_depth=30,
    max_features="sqrt",
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,   # no bootstrap resampling of the training rows
    random_state=42,   # fixed seed
)

# Fit on the selected feature columns only; the fitted estimator is the cell output.
rfc.fit(X_train[rfe_features], y_train)

# Prediction

In [26]:
def prediction(csv_fpath, model=None, features=None):
    """Score every row of a feature CSV with a fitted classifier.

    Parameters
    ----------
    csv_fpath : str or path-like
        CSV file containing the ``transcript`` and ``position`` columns plus
        every feature column the model is scored on.
    model : object exposing ``predict_proba``, optional
        Fitted classifier. Defaults to the notebook-level ``rfc``.
    features : sequence of str, optional
        Feature column names, in the order used when the model was fitted.
        Defaults to the notebook-level ``rfe_features``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with columns ``transcript``, ``position`` and
        ``score``, where ``score`` is column 1 of ``predict_proba``.

    Raises
    ------
    KeyError
        If the CSV lacks an identifier column or any requested feature column.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if model is None:
        model = rfc
    if features is None:
        features = rfe_features
    features = list(features)

    id_cols = ["transcript", "position"]

    ## read csv file
    data = pd.read_csv(csv_fpath)
    print(data.shape)

    ## fail early, naming the absent columns, instead of deep inside pandas indexing
    missing = [col for col in id_cols + features if col not in data.columns]
    if missing:
        raise KeyError(f"{csv_fpath} is missing required column(s): {missing}")

    ## predict using the model; keep the probability column at index 1
    ## (presumably the positive class for 0/1 labels — confirm via model.classes_)
    data_pred = model.predict_proba(data[features])[:, 1]
    print(len(data_pred))

    ## attach scores to the identifier columns; the array is assigned
    ## positionally, so the result does not depend on index alignment
    data_pred_df = data[id_cols].assign(score=data_pred)
    print(f"Prediction file is of shape: {data_pred_df.shape}")

    return data_pred_df

In [27]:
# Score the K562_rep4_run1 feature CSV with the fitted forest.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
k562_rep4_run1_path = "/Users/claudia/Downloads/K562_rep4_run1.csv"
k562_rep4_run1_pred = prediction(csv_fpath=k562_rep4_run1_path)

(116779, 76)
116779
Prediction file is of shape: (116779, 3)


In [28]:
# Preview the first five scored rows.
k562_rep4_run1_pred.head(n=5)

Unnamed: 0,transcript,position,score
0,ENST00000371582,1030,0.0
1,ENST00000371582,105,0.045739
2,ENST00000371582,1123,0.045455
3,ENST00000371582,147,0.027729
4,ENST00000371582,242,0.054545


In [23]:
# Write the scored rows to disk. The output folder is created first so this
# cell also works on a fresh checkout where it does not exist yet.
k562_rep4_run1_out_path = Path("data/aws_predictions/k562_rep4_run1_prediction.csv")
k562_rep4_run1_out_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the DataFrame index is written as an unnamed first column
# (the to_csv default). Pass index=False if consumers of this file expect
# only transcript/position/score — confirm before changing.
k562_rep4_run1_pred.to_csv(k562_rep4_run1_out_path)