In [1]:
import pandas as pd

In [2]:
df_original = pd.read_csv("../train_data.csv")
n_original = df_original.shape[0]
df_submit = pd.read_csv("../sample_submission.csv")
df = pd.concat([df_original, df_submit], axis=0).reset_index(drop=True)

# Features from GRU
df = pd.read_csv("../GRU_features.csv").merge(
    df,
    on='id'
)


In [3]:
def siRNA_feat_builder(s: pd.Series, anti: bool = False):
    name = "anti" if anti else "sense"
    df = s.to_frame()
    df[f"feat_siRNA_{name}_seq_len"] = s.str.len()
    for pos in [0, -1]:
        for c in list("AUGC"):
            df[f"feat_siRNA_{name}_seq_{c}_{'front' if pos == 0 else 'back'}"] = (
                s.str[pos] == c
            )
    df[f"feat_siRNA_{name}_seq_pattern_1"] = s.str.startswith("AA") & s.str.endswith(
        "UU"
    )
    df[f"feat_siRNA_{name}_seq_pattern_2"] = s.str.startswith("GA") & s.str.endswith(
        "UU"
    )
    df[f"feat_siRNA_{name}_seq_pattern_3"] = s.str.startswith("CA") & s.str.endswith(
        "UU"
    )
    df[f"feat_siRNA_{name}_seq_pattern_4"] = s.str.startswith("UA") & s.str.endswith(
        "UU"
    )
    df[f"feat_siRNA_{name}_seq_pattern_5"] = s.str.startswith("UU") & s.str.endswith(
        "AA"
    )
    df[f"feat_siRNA_{name}_seq_pattern_6"] = s.str.startswith("UU") & s.str.endswith(
        "GA"
    )
    df[f"feat_siRNA_{name}_seq_pattern_7"] = s.str.startswith("UU") & s.str.endswith(
        "CA"
    )
    df[f"feat_siRNA_{name}_seq_pattern_8"] = s.str.startswith("UU") & s.str.endswith(
        "UA"
    )
    df[f"feat_siRNA_{name}_seq_pattern_9"] = s.str[1] == "A"
    df[f"feat_siRNA_{name}_seq_pattern_10"] = s.str[-2] == "A"
    df[f"feat_siRNA_{name}_seq_pattern_GC_ratio_0"] = (
        s.str.count("G") + s.str.count("C")
    ) / s.str.len()

    df[f"feat_siRNA_{name}_len_range"] = (s.str.len() >= 21) & (s.str.len() <= 25)

    GC_ratio_1 = (s.str.count("G") + s.str.count("C")) / s.str.len()
    df[f"feat_siRNA_{name}_GC_ratio_1"] = (GC_ratio_1 >= 0.31) & (GC_ratio_1 <= 0.58)

    GC_ratio_2 = (s.str[1:7].str.count("G") + s.str[1:7].str.count("C")) / s.str[1:7].str.len()
    df[f"feat_siRNA_{name}_GC_ratio_2"] = (GC_ratio_2 == 0.19)

    GC_ratio_3 = (s.str[7:18].str.count("G") + s.str[7:18].str.count("C")) / s.str[7:18].str.len()
    df[f"feat_siRNA_{name}_GC_ratio_3"] = (GC_ratio_3 == 0.52)

    return df.iloc[:, 1:]

In [4]:
df_publication_id = pd.get_dummies(df.publication_id)
df_publication_id.columns = [
    f"feat_publication_id_{c}" for c in df_publication_id.columns
]
df_gene_target_symbol_name = pd.get_dummies(df.gene_target_symbol_name)
df_gene_target_symbol_name.columns = [
    f"feat_gene_target_symbol_name_{c}" for c in df_gene_target_symbol_name.columns
]
df_gene_target_ncbi_id = pd.get_dummies(df.gene_target_ncbi_id)
df_gene_target_ncbi_id.columns = [
    f"feat_gene_target_ncbi_id_{c}" for c in df_gene_target_ncbi_id.columns
]
df_gene_target_species = pd.get_dummies(df.gene_target_species)
df_gene_target_species.columns = [
    f"feat_gene_target_species_{c}" for c in df_gene_target_species.columns
]
siRNA_duplex_id_values = df.siRNA_duplex_id.str.split("-|\.").str[1].astype("int")
siRNA_duplex_id_values = (siRNA_duplex_id_values - siRNA_duplex_id_values.min()) / (
    siRNA_duplex_id_values.max() - siRNA_duplex_id_values.min()
)
df_siRNA_duplex_id = pd.DataFrame(siRNA_duplex_id_values)
df_cell_line_donor = pd.get_dummies(df.cell_line_donor)
df_cell_line_donor.columns = [
    f"feat_cell_line_donor_{c}" for c in df_cell_line_donor.columns
]
df_cell_line_donor["feat_cell_line_donor_hepatocytes"] = (
    (df.cell_line_donor.str.contains("Hepatocytes")).fillna(False).astype("int")
)
df_cell_line_donor["feat_cell_line_donor_cells"] = (
    df.cell_line_donor.str.contains("Cells").fillna(False).astype("int")
)
df_siRNA_concentration = df.siRNA_concentration.to_frame()
df_Transfection_method = pd.get_dummies(df.Transfection_method)
df_Transfection_method.columns = [
    f"feat_Transfection_method_{c}" for c in df_Transfection_method.columns
]
df_Duration_after_transfection_h = pd.get_dummies(df.Duration_after_transfection_h)
df_Duration_after_transfection_h.columns = [
    f"feat_Duration_after_transfection_h_{c}"
    for c in df_Duration_after_transfection_h.columns
]

df_GRU_pred = df[['GRU_predict']]
df_GRU_feature = df[ df.columns[['GRU_feature_' in c for c in df.columns]] ]
important_GRU_features = [
    'GRU_feature_11','GRU_feature_18','GRU_feature_523','GRU_feature_679','GRU_feature_689'
]
df_GRU_feature = df_GRU_feature[important_GRU_features]

feats = pd.concat(
    [
        df_publication_id,
        df_gene_target_symbol_name,
        df_gene_target_ncbi_id,
        df_gene_target_species,
        df_siRNA_duplex_id,
        df_cell_line_donor,
        df_siRNA_concentration,
        df_Transfection_method,
        df_Duration_after_transfection_h,
        siRNA_feat_builder(df.siRNA_sense_seq, False),
        siRNA_feat_builder(df.siRNA_antisense_seq, True),
        df_GRU_pred,
        # df_GRU_feature,
        df.iloc[:, -1].to_frame(),
    ],
    axis=1,
)

  (df.cell_line_donor.str.contains("Hepatocytes")).fillna(False).astype("int")
  df.cell_line_donor.str.contains("Cells").fillna(False).astype("int")


In [5]:
feats.shape


(30656, 212)

In [6]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

features = feats.iloc[:n_original, :-1]
targets = feats.iloc[:n_original, -1]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)


In [7]:
from sklearn.metrics import mean_absolute_error, make_scorer

# calculate_validation_score for GridSearchCV
def calculate_validation_score(y_true, y_pred, threshold=30):
    # y_pred = preds
    # y_true = data.get_label()
    mae = np.mean(np.abs(y_true - y_pred))
    # if mae < 0: mae = 0
    # elif mae >100: mae = 100

    y_true_binary = ((y_true <= threshold) & (y_true >= 0)).astype(int)
    y_pred_binary = ((y_pred <= threshold) & (y_pred >= 0)).astype(int)

    mask = (y_pred >= 0) & (y_pred <= threshold)
    range_mae = (
        mean_absolute_error(y_true[mask], y_pred[mask]) if np.sum(mask) > 0 else 100
    )
    # if range_mae < 0: range_mae = 0
    # elif range_mae >100: range_mae = 100

    # precision = precision_score(y_true_binary, y_pred_binary, average="binary")
    # recall = recall_score(y_true_binary, y_pred_binary, average="binary")

    if np.sum(y_pred_binary) > 0:
        precision = (np.array(y_pred_binary) & y_true_binary).sum()/np.sum(y_pred_binary)
    else:
        precision = 0
    if np.sum(y_true_binary) > 0:
        recall = (np.array(y_pred_binary) & y_true_binary).sum()/np.sum(y_true_binary)
    else:
        recall = 0

    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    score = (1 - mae / 100) * 0.5 + (1 - range_mae / 100) * f1 * 0.5
    return score

custom_scorer = make_scorer(calculate_validation_score, greater_is_better=True)


In [8]:
# calculate_metrics for lightgbm training
def calculate_validation_score_for_training(preds, data, threshold=30):
    y_pred = preds
    y_true = data.get_label()
    mae = np.mean(np.abs(y_true - y_pred))
    # if mae < 0: mae = 0
    # elif mae >100: mae = 100

    y_true_binary = ((y_true <= threshold) & (y_true >= 0)).astype(int)
    y_pred_binary = ((y_pred <= threshold) & (y_pred >= 0)).astype(int)

    mask = (y_pred >= 0) & (y_pred <= threshold)
    range_mae = (
        mean_absolute_error(y_true[mask], y_pred[mask]) if np.sum(mask) > 0 else 100
    )
    # if range_mae < 0: range_mae = 0
    # elif range_mae >100: range_mae = 100

    # precision = precision_score(y_true_binary, y_pred_binary, average="binary")
    # recall = recall_score(y_true_binary, y_pred_binary, average="binary")

    if np.sum(y_pred_binary) > 0:
        precision = (np.array(y_pred_binary) & y_true_binary).sum()/np.sum(y_pred_binary)
    else:
        precision = 0
    if np.sum(y_true_binary) > 0:
        recall = (np.array(y_pred_binary) & y_true_binary).sum()/np.sum(y_true_binary)
    else:
        recall = 0

    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    score = (1 - mae / 100) * 0.5 + (1 - range_mae / 100) * f1 * 0.5
    return "custom_score", score, True


In [9]:
#######
# # For the training data, double the size of the observations with y < 30:
# def Below_30_double(X, y):
#     idx = y < 30
#     y = pd.concat([y, y[idx]])
#     X = pd.concat([X, X[idx]])
#     return X,y
# X_train, y_train = Below_30_double(X_train, y_train)
#######

train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)


# 定义一个回调函数来打印验证集的结果
def print_validation_result(env):
    result = env.evaluation_result_list[-1]
    print(f"[{env.iteration}] {result[1]}'s {result[0]}: {result[2]}")

'''
# Grid Search for Hyperparameter Tuning (Optional)
param_grid = {
    'max_depth': [7, 9, 11],
    'learning_rate': [0.01, 0.02],
    'num_leaves': [31, 63, 127],
    'feature_fraction': [0.8, 0.9],
    'bagging_fraction': [0.8, 0.9],
    'bagging_freq': [0, 5, 10],
    'n_estimators': [15000, 20000],
    'min_child_samples': [20, 30, 50],
}

gbm = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression')
grid = GridSearchCV(gbm, param_grid, cv=3, scoring=custom_scorer, verbose=1)
grid.fit(features, targets)

print(f'Best parameters found by grid search are: {grid.best_params_}')
print(f'Best estimator found by grid search are: {grid.best_estimator_}')

# Train with best parameters
best_params = grid.best_params_
best_estimator = grid.best_estimator_
'''

"\n# Grid Search for Hyperparameter Tuning (Optional)\nparam_grid = {\n    'max_depth': [7, 9, 11],\n    'learning_rate': [0.01, 0.02],\n    'num_leaves': [31, 63, 127],\n    'feature_fraction': [0.8, 0.9],\n    'bagging_fraction': [0.8, 0.9],\n    'bagging_freq': [0, 5, 10],\n    'n_estimators': [15000, 20000],\n    'min_child_samples': [20, 30, 50],\n}\n\ngbm = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression')\ngrid = GridSearchCV(gbm, param_grid, cv=3, scoring=custom_scorer, verbose=1)\ngrid.fit(features, targets)\n\nprint(f'Best parameters found by grid search are: {grid.best_params_}')\nprint(f'Best estimator found by grid search are: {grid.best_estimator_}')\n\n# Train with best parameters\nbest_params = grid.best_params_\nbest_estimator = grid.best_estimator_\n"

In [10]:
# saved best parameters to improve testing efficiency
best_params = {
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "None",
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "feature_fraction": 0.9,
    "learning_rate": 0.02,
    "max_depth": 9,
    "num_leaves": 127,
    "min_child_samples": 20
}
best_estimator = {
    "n_estimators": 15000,
}

cv_gbm = lgb.train(
    best_params,
    train_data,
    num_boost_round=best_estimator["n_estimators"],
    feval=calculate_validation_score_for_training,
    valid_sets=[train_data, test_data],
    callbacks=[print_validation_result],
)

# Final Evaluation
y_pred_best = cv_gbm.predict(X_test, num_iteration=cv_gbm.best_iteration)
score_best = calculate_validation_score(y_test, y_pred_best)
print(f'Validation score after tuning: {score_best}')

# best score is 0.8288
# Validation score after tuning: 0.8243341724610049


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001841 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 975
[LightGBM] [Info] Number of data points in the train set: 20625, number of used features: 201
[LightGBM] [Info] Start training from score 54.398538
[0] custom_score's valid_1: 0.35291847642577057
[1] custom_score's valid_1: 0.3549684907197581
[2] custom_score's valid_1: 0.35697833041305516
[3] custom_score's valid_1: 0.3589322470027527
[4] custom_score's valid_1: 0.36082672676606853
[5] custom_score's valid_1: 0.36267669002037567
[6] custom_score's valid_1: 0.3644913352687631
[7] custom_score's valid_1: 0.36622499166914246
[8] custom_score's valid_1: 0.3679448879698103
[9] custom_score's valid_1: 0.3696174048548925
[10] custom_score's valid_1: 0.3712465984982394
[11] custom_score's valid_1: 0.37282439855250726
[12] custom_score's v

In [11]:
all_data = lgb.Dataset(features, label=targets)

best_gbm = lgb.train(
    best_params,
    all_data,
    num_boost_round=8700
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 981
[LightGBM] [Info] Number of data points in the train set: 25782, number of used features: 203
[LightGBM] [Info] Start training from score 54.437405


In [12]:
eval_data = feats.iloc[n_original:, :-1]
y_pred = best_gbm.predict(eval_data)

In [13]:
df_submit["mRNA_remaining_pct"] = y_pred
df_submit.to_csv("../submission.csv", index=False)