In [1]:
import pandas as pd

In [2]:
df_original = pd.read_csv("../train_data.csv")
n_original = df_original.shape[0]
df_submit = pd.read_csv("../sample_submission.csv")
df = pd.concat([df_original, df_submit], axis=0).reset_index(drop=True)

In [3]:
def siRNA_feat_builder(s: pd.Series, anti: bool = False):
    name = "anti" if anti else "sense"
    df = s.to_frame()
    df[f"feat_siRNA_{name}_seq_len"] = s.str.len()
    for pos in [0, -1]:
        for c in list("AUGC"):
            df[f"feat_siRNA_{name}_seq_{c}_{'front' if pos == 0 else 'back'}"] = (
                s.str[pos] == c
            )
    df[f"feat_siRNA_{name}_seq_pattern_1"] = s.str.startswith("AA") & s.str.endswith(
        "UU"
    )
    df[f"feat_siRNA_{name}_seq_pattern_2"] = s.str.startswith("GA") & s.str.endswith(
        "UU"
    )
    df[f"feat_siRNA_{name}_seq_pattern_3"] = s.str.startswith("CA") & s.str.endswith(
        "UU"
    )
    df[f"feat_siRNA_{name}_seq_pattern_4"] = s.str.startswith("UA") & s.str.endswith(
        "UU"
    )
    df[f"feat_siRNA_{name}_seq_pattern_5"] = s.str.startswith("UU") & s.str.endswith(
        "AA"
    )
    df[f"feat_siRNA_{name}_seq_pattern_6"] = s.str.startswith("UU") & s.str.endswith(
        "GA"
    )
    df[f"feat_siRNA_{name}_seq_pattern_7"] = s.str.startswith("UU") & s.str.endswith(
        "CA"
    )
    df[f"feat_siRNA_{name}_seq_pattern_8"] = s.str.startswith("UU") & s.str.endswith(
        "UA"
    )
    df[f"feat_siRNA_{name}_seq_pattern_9"] = s.str[1] == "A"
    df[f"feat_siRNA_{name}_seq_pattern_10"] = s.str[-2] == "A"
    df[f"feat_siRNA_{name}_seq_pattern_GC_frac"] = (
        s.str.count("G") + s.str.count("C")
    ) / s.str.len()
    return df.iloc[:, 1:]

In [4]:
df_publication_id = pd.get_dummies(df.publication_id)
df_publication_id.columns = [
    f"feat_publication_id_{c}" for c in df_publication_id.columns
]
df_gene_target_symbol_name = pd.get_dummies(df.gene_target_symbol_name)
df_gene_target_symbol_name.columns = [
    f"feat_gene_target_symbol_name_{c}" for c in df_gene_target_symbol_name.columns
]
df_gene_target_ncbi_id = pd.get_dummies(df.gene_target_ncbi_id)
df_gene_target_ncbi_id.columns = [
    f"feat_gene_target_ncbi_id_{c}" for c in df_gene_target_ncbi_id.columns
]
df_gene_target_species = pd.get_dummies(df.gene_target_species)
df_gene_target_species.columns = [
    f"feat_gene_target_species_{c}" for c in df_gene_target_species.columns
]
siRNA_duplex_id_values = df.siRNA_duplex_id.str.split("-|\.").str[1].astype("int")
siRNA_duplex_id_values = (siRNA_duplex_id_values - siRNA_duplex_id_values.min()) / (
    siRNA_duplex_id_values.max() - siRNA_duplex_id_values.min()
)
df_siRNA_duplex_id = pd.DataFrame(siRNA_duplex_id_values)
df_cell_line_donor = pd.get_dummies(df.cell_line_donor)
df_cell_line_donor.columns = [
    f"feat_cell_line_donor_{c}" for c in df_cell_line_donor.columns
]
df_cell_line_donor["feat_cell_line_donor_hepatocytes"] = (
    (df.cell_line_donor.str.contains("Hepatocytes")).fillna(False).astype("int")
)
df_cell_line_donor["feat_cell_line_donor_cells"] = (
    df.cell_line_donor.str.contains("Cells").fillna(False).astype("int")
)
df_siRNA_concentration = df.siRNA_concentration.to_frame()
df_Transfection_method = pd.get_dummies(df.Transfection_method)
df_Transfection_method.columns = [
    f"feat_Transfection_method_{c}" for c in df_Transfection_method.columns
]
df_Duration_after_transfection_h = pd.get_dummies(df.Duration_after_transfection_h)
df_Duration_after_transfection_h.columns = [
    f"feat_Duration_after_transfection_h_{c}"
    for c in df_Duration_after_transfection_h.columns
]
feats = pd.concat(
    [
        df_publication_id,
        df_gene_target_symbol_name,
        df_gene_target_ncbi_id,
        df_gene_target_species,
        df_siRNA_duplex_id,
        df_cell_line_donor,
        df_siRNA_concentration,
        df_Transfection_method,
        df_Duration_after_transfection_h,
        siRNA_feat_builder(df.siRNA_sense_seq, False),
        siRNA_feat_builder(df.siRNA_antisense_seq, True),
        df.iloc[:, -1].to_frame(),
    ],
    axis=1,
)

In [5]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

features = feats.iloc[:n_original, :-1]
targets = feats.iloc[:n_original, -1]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

In [6]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)


# 定义一个回调函数来打印验证集的结果
def print_validation_result(env):
    result = env.evaluation_result_list[-1]
    print(f"[{env.iteration}] {result[1]}'s {result[0]}: {result[2]}")

# Grid Search for Hyperparameter Tuning (Optional)
param_grid = {
    'max_depth': [7, 9],
    'learning_rate': [0.02],
    'num_leaves': [31, 63, 127],
    'feature_fraction': [0.8, 0.9],
    'bagging_fraction': [0.8, 0.9],
    'bagging_freq': [0, 5, 10],
}

gbm = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metric='rmse', n_estimators=15000)

grid = GridSearchCV(gbm, param_grid, cv=3, scoring='neg_root_mean_squared_error', verbose=1)
grid.fit(features, targets)

print(f'Best parameters found by grid search are: {grid.best_params_}')

# Train with best parameters
best_params = grid.best_params_
best_gbm = lgb.train(
    best_params,
    train_data,
    num_boost_round=15000,
    valid_sets=[train_data, test_data],
    callbacks=[print_validation_result],
)

# Final Evaluation
y_pred_best = best_gbm.predict(X_test, num_iteration=best_gbm.best_iteration)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
print(f'Validation RMSE after tuning: {rmse_best}')

[0] rmse's valid_1: 35.19172086058371
[1] rmse's valid_1: 34.88777104719689
[2] rmse's valid_1: 34.62472822986162
[3] rmse's valid_1: 34.32304853020071
[4] rmse's valid_1: 34.07950612170202
[5] rmse's valid_1: 33.7942527487897
[6] rmse's valid_1: 33.51778754486036
[7] rmse's valid_1: 33.24611196972437
[8] rmse's valid_1: 33.02934837385515
[9] rmse's valid_1: 32.8029146151526
[10] rmse's valid_1: 32.61004311396223
[11] rmse's valid_1: 32.36527109860499
[12] rmse's valid_1: 32.12750881550474
[13] rmse's valid_1: 31.949784066939763
[14] rmse's valid_1: 31.729056760610234
[15] rmse's valid_1: 31.491444030082395
[16] rmse's valid_1: 31.306958650654675
[17] rmse's valid_1: 31.076669135348524
[18] rmse's valid_1: 30.8615764522798
[19] rmse's valid_1: 30.65232486268304
[20] rmse's valid_1: 30.44719524455198
[21] rmse's valid_1: 30.250096371036925
[22] rmse's valid_1: 30.058469630032004
[23] rmse's valid_1: 29.870190276597533
[24] rmse's valid_1: 29.688343251941173
[25] rmse's valid_1: 29.53659

In [7]:
eval_data = feats.iloc[n_original:, :-1]
y_pred = best_gbm.predict(eval_data)

In [9]:
df_submit["mRNA_remaining_pct"] = y_pred
df_submit.to_csv("../submission.csv", index=False)