In [1]:
import pandas as pd
import numpy as np


In [2]:
# Stack the training rows on top of the submission-template rows so that every
# encoding below is built over both sets at once. Later cells recover the two
# parts purely by position (first n_original rows vs. the rest).
df_original = pd.read_csv("../train_data.csv")
n_original = df_original.shape[0]
df_submit = pd.read_csv("../sample_submission.csv")
df = pd.concat([df_original, df_submit], axis=0).reset_index(drop=True)
n_stacked = df.shape[0]

# Temporary bookkeeping column holding each row's stacked position. An inner
# merge orders its result by the LEFT frame (the feature file here) and can
# drop or duplicate rows, either of which would silently break the positional
# slicing used downstream.
df.insert(0, "_row_order", np.arange(n_stacked))

# Features from GRU
df = pd.read_csv("../GRU_features_predict_only.csv").merge(
    df,
    on='id'
)

# Features from other pretrained model
df = pd.read_csv("../pretrained_feature_predict.csv", index_col=0).merge(
    df,
    on='id'
)

# Restore the stacked order and verify that every row survived exactly once.
# Column order is untouched, so the last column is still the last column of the
# stacked train/submission frame.
df = df.sort_values("_row_order", kind="stable").reset_index(drop=True)
if not np.array_equal(df["_row_order"].to_numpy(), np.arange(n_stacked)):
    raise ValueError(
        "Merging the feature files on 'id' dropped or duplicated rows: "
        f"expected {n_stacked} rows, got {df.shape[0]}"
    )
df = df.drop(columns="_row_order")


In [3]:
def siRNA_feat_builder(s: pd.Series, anti: bool = False):
    """Build hand-crafted sequence features for one siRNA strand.

    Parameters
    ----------
    s : pd.Series
        Sequences as strings; the features test for the letters A, U, G, C.
    anti : bool, default False
        Only affects column naming: ``True`` tags columns with ``anti``,
        ``False`` with ``sense``.

    Returns
    -------
    pd.DataFrame
        Same index as ``s`` with 24 columns named ``feat_siRNA_{tag}_...``
        (length, first/last base indicators, ten start/end patterns, overall
        GC fraction and four GC/length range flags). The sequence itself is
        not part of the result.

    Notes
    -----
    ``GC_ratio_2`` (characters 1..6) and ``GC_ratio_3`` (characters 7..17) are
    true when the window's GC fraction is the closest attainable value to 0.19
    and 0.52 respectively. An exact ``==`` test against those targets can
    never succeed for windows of 6 and 11 characters, which would leave both
    columns constant ``False``.
    """
    name = "anti" if anti else "sense"
    prefix = f"feat_siRNA_{name}"
    seq = s.str
    seq_len = seq.len()
    out = pd.DataFrame(index=s.index)

    out[f"{prefix}_seq_len"] = seq_len

    # Indicator for each base at the first and at the last position.
    for pos, side in ((0, "front"), (-1, "back")):
        for c in "AUGC":
            out[f"{prefix}_seq_{c}_{side}"] = seq[pos] == c

    # (prefix, suffix) pairs for seq_pattern_1 .. seq_pattern_8, in order.
    paired_patterns = [
        ("AA", "UU"),
        ("GA", "UU"),
        ("CA", "UU"),
        ("UA", "UU"),
        ("UU", "AA"),
        ("UU", "GA"),
        ("UU", "CA"),
        ("UU", "UA"),
    ]
    for k, (head, tail) in enumerate(paired_patterns, start=1):
        out[f"{prefix}_seq_pattern_{k}"] = seq.startswith(head) & seq.endswith(tail)

    out[f"{prefix}_seq_pattern_9"] = seq[1] == "A"
    out[f"{prefix}_seq_pattern_10"] = seq[-2] == "A"

    # Whole-sequence GC fraction, reused for the raw value and the range flag.
    gc_all = _gc_fraction(s)
    out[f"{prefix}_seq_pattern_GC_ratio_0"] = gc_all

    out[f"{prefix}_len_range"] = (seq_len >= 21) & (seq_len <= 25)
    out[f"{prefix}_GC_ratio_1"] = (gc_all >= 0.31) & (gc_all <= 0.58)
    out[f"{prefix}_GC_ratio_2"] = _gc_nearest_to(seq[1:7], 0.19)
    out[f"{prefix}_GC_ratio_3"] = _gc_nearest_to(seq[7:18], 0.52)

    return out


def _gc_fraction(seqs: pd.Series) -> pd.Series:
    """Fraction of G and C characters in each string (NaN for empty strings)."""
    return (seqs.str.count("G") + seqs.str.count("C")) / seqs.str.len()


def _gc_nearest_to(window: pd.Series, target: float) -> pd.Series:
    """True where the window's GC fraction is the attainable value closest to ``target``.

    A window of ``w`` characters can only reach fractions ``k / w``, so "closest"
    means within half a step (``0.5 / w``) of the target. Empty windows give a
    NaN fraction and therefore compare False.
    """
    width = window.str.len()
    return (_gc_fraction(window) - target).abs() <= 0.5 / width

In [4]:
def _one_hot(column: pd.Series, prefix: str) -> pd.DataFrame:
    """One-hot encode ``column`` and name each indicator ``{prefix}_{value}``."""
    dummies = pd.get_dummies(column)
    dummies.columns = [f"{prefix}_{c}" for c in dummies.columns]
    return dummies


# Categorical metadata columns -> indicator columns.
df_publication_id = _one_hot(df.publication_id, "feat_publication_id")
df_gene_target_symbol_name = _one_hot(
    df.gene_target_symbol_name, "feat_gene_target_symbol_name"
)
df_gene_target_ncbi_id = _one_hot(df.gene_target_ncbi_id, "feat_gene_target_ncbi_id")
df_gene_target_species = _one_hot(df.gene_target_species, "feat_gene_target_species")

# Second token of the duplex id (split on "-" or "."), parsed as an integer and
# min-max scaled. The pattern is a raw string so that "\." is not read as an
# (invalid) Python string escape.
siRNA_duplex_id_values = df.siRNA_duplex_id.str.split(r"-|\.").str[1].astype("int")
duplex_min = siRNA_duplex_id_values.min()
duplex_max = siRNA_duplex_id_values.max()
siRNA_duplex_id_values = (siRNA_duplex_id_values - duplex_min) / (
    duplex_max - duplex_min
)
df_siRNA_duplex_id = pd.DataFrame(siRNA_duplex_id_values)

df_cell_line_donor = _one_hot(df.cell_line_donor, "feat_cell_line_donor")
# Substring flags on the donor text. `na=False` maps missing donors to False
# directly; filling an object-dtype result afterwards with fillna(False) emits a
# pandas downcasting FutureWarning.
df_cell_line_donor["feat_cell_line_donor_hepatocytes"] = (
    df.cell_line_donor.str.contains("Hepatocytes", na=False).astype("int")
)
df_cell_line_donor["feat_cell_line_donor_cells"] = (
    df.cell_line_donor.str.contains("Cells", na=False).astype("int")
)

# Concentration is passed through as-is (no encoding or scaling here).
df_siRNA_concentration = df.siRNA_concentration.to_frame()
df_Transfection_method = _one_hot(df.Transfection_method, "feat_Transfection_method")
df_Duration_after_transfection_h = _one_hot(
    df.Duration_after_transfection_h, "feat_Duration_after_transfection_h"
)

# Predictions of the upstream models, used as single-column features.
df_GRU_pred = df[['GRU_predict']]
df_pretrained_pred = df[['Pretrained_feature_predict']]
# df_RiNALMo = df[[x for x in df.columns if 'RiNALMo_feature' in x]]
# df_mRNAFM = df[[x for x in df.columns if 'mRNAFM_feature' in x]]

# Column-wise assembly. The LAST entry is the last column of `df`; downstream
# cells treat the final column of `feats` as the target, so it must stay last.
feats = pd.concat(
    [
        df_publication_id,
        df_gene_target_symbol_name,
        df_gene_target_ncbi_id,
        df_gene_target_species,
        df_siRNA_duplex_id,
        df_cell_line_donor,
        df_siRNA_concentration,
        df_Transfection_method,
        df_Duration_after_transfection_h,
        siRNA_feat_builder(df.siRNA_sense_seq, False),
        siRNA_feat_builder(df.siRNA_antisense_seq, True),
        df_GRU_pred,
        # df_RiNALMo,
        # df_mRNAFM,
        df_pretrained_pred,
        df.iloc[:, -1].to_frame(),
    ],
    axis=1,
)

  (df.cell_line_donor.str.contains("Hepatocytes")).fillna(False).astype("int")
  df.cell_line_donor.str.contains("Cells").fillna(False).astype("int")


In [5]:
# Display (rows, columns) of the assembled matrix. The column count includes the
# final column taken from `df`, which later cells split off as the target.
feats.shape


(30656, 213)

In [6]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

# The first n_original rows of `feats` are the ones kept for model fitting;
# every column except the last is an input, the last one is the target.
features = feats.iloc[:n_original, :-1]
targets = feats.iloc[:n_original, -1]

# Hold out 20% of those rows for validation, with a fixed seed.
split_parts = train_test_split(features, targets, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
####################################
# Variable importance
####################################
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

# Fixed seed so the forest, the permutations and therefore the selected
# feature set are the same on every Restart & Run All.
VI_SEED = 42
# Rank (0-based, descending) of the importance used as the cut-off: features
# strictly more important than the one at this rank are kept.
N_TOP_FEATURES = 210

RF_model = RandomForestRegressor(n_estimators=300, random_state=VI_SEED)
RF_model.fit(X_train, y_train)
# Importance = mean drop in the model's score on the hold-out split when a
# feature's values are shuffled, averaged over n_repeats shuffles.
VI = permutation_importance(
    RF_model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=VI_SEED,
)

importances_desc = np.sort(VI.importances_mean)[::-1]
# Bound the rank so a matrix with fewer than N_TOP_FEATURES + 1 columns falls
# back to its least important feature instead of raising IndexError.
cutoff_rank = min(N_TOP_FEATURES, len(importances_desc) - 1)
VI_threshold = importances_desc[cutoff_rank]
selected_variables = X_train.columns[VI.importances_mean > VI_threshold]
print(len(selected_variables))


In [None]:

# Features present in the training matrix that the importance filter left out.
set(X_train.columns).difference(selected_variables)


In [None]:
# Report how many training targets lie above 100, then bound all training
# targets to the interval [0, 100].
print((y_train > 100).sum())
y_train = y_train.clip(lower=0, upper=100)


In [None]:
from siRNA_fun import calculate_validation_score, custom_scorer, calculate_validation_score_for_training, Get_sample_weight

# LightGBM datasets restricted to the selected features; each row is weighted
# by Get_sample_weight applied to that split's targets.
train_weights = Get_sample_weight(y_train)
train_data = lgb.Dataset(
    X_train[selected_variables],
    label=y_train,
    weight=train_weights,
)

# The validation set references train_data when it is constructed.
test_weights = Get_sample_weight(y_test)
test_data = lgb.Dataset(
    X_test[selected_variables],
    label=y_test,
    weight=test_weights,
    reference=train_data,
)


In [None]:

# Callback that prints the validation result after each boosting iteration.
def print_validation_result(env):
    """Print the last entry of ``env.evaluation_result_list`` for this iteration.

    Parameters
    ----------
    env
        Callback environment passed in by ``lgb.train``; only ``iteration`` and
        ``evaluation_result_list`` are read. Each result entry is indexed as
        ``(dataset_name, metric_name, value, ...)``.

    The line is formatted ``[iteration] dataset's metric: value``. Nothing is
    printed when the result list is empty.
    """
    results = env.evaluation_result_list
    if not results:
        return
    dataset_name, metric_name, value = results[-1][:3]
    print(f"[{env.iteration}] {dataset_name}'s {metric_name}: {value}")

# Hyper-parameters saved from an earlier search so the search does not have to
# be repeated. "metric" is the string "None"; scoring during training comes from
# the custom `feval` passed to lgb.train below.
best_params = dict(
    boosting_type="gbdt",
    objective="regression",
    metric="None",
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.9,
    learning_rate=0.02,
    max_depth=9,
    num_leaves=127,
    min_child_samples=20,
)

# Early stopping with a patience of 5000 rounds, plus per-round result printing.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=5000),
    print_validation_result,
]

best_gbm = lgb.train(
    best_params,
    train_data,
    num_boost_round=30000,
    valid_sets=[train_data, test_data],
    feval=calculate_validation_score_for_training,
    callbacks=training_callbacks,
)

# Final Evaluation
holdout_inputs = X_test[selected_variables]
y_pred_best = best_gbm.predict(holdout_inputs, num_iteration=best_gbm.best_iteration)
holdout_mse = mean_squared_error(y_test, y_pred_best)
rmse_best = np.sqrt(holdout_mse)
print(f'Validation RMSE after tuning: {rmse_best}')

# Values recorded from earlier runs:
# Validation RMSE after tuning: 18.90719098159109
# Validation RMSE after tuning: 17.596051101784145


In [None]:
# Refit on all rows of `features` / `targets`, restricted to the selected features.
X_all = features[selected_variables]
# Bound the targets to [0, 100]. clip() returns a new Series, so `targets`
# (a slice of `feats`) is left untouched and re-running this cell is safe.
y_all = targets.clip(lower=0, upper=100)

all_data = lgb.Dataset(X_all, label=y_all, weight=Get_sample_weight(y_all))
best_model = lgb.train(
    best_params,
    all_data,
    # NOTE(review): 4243 is hard-coded; presumably the best iteration found by
    # an earlier early-stopped run - confirm against best_gbm.best_iteration.
    num_boost_round=4243
)


In [None]:
# Rows after the first n_original are the submission-template rows.
eval_data = feats.iloc[n_original:, :-1]

# Predict with `best_model`, the model refit on all training rows in the previous
# cell. Predicting with the hold-out model `best_gbm` would leave that refit
# unused. NOTE(review): confirm the refit model is the one intended for submission.
raw_pred = best_model.predict(eval_data[selected_variables])
# Bound predictions to [0, 100].
y_pred = np.clip(raw_pred, 0, 100)

# Predictions are assigned by position, so eval_data must be in df_submit's row order.
df_submit["mRNA_remaining_pct"] = y_pred
df_submit.to_csv("../submission.csv", index=False)




In [None]:
from sklearn.metrics import f1_score

# F1 on the hold-out split for the binary event "value below 30", comparing the
# true targets with the hold-out predictions.
actual_below_30 = y_test < 30
predicted_below_30 = y_pred_best < 30
f1_score(actual_below_30, predicted_below_30)

