In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
csv_files = glob.glob("./*.csv")

In [3]:
print(csv_files)

['./submission_baseline_diff.csv', './submission_baseline_qwen_ch27244.csv', './submission_electra_qwen_corpus.csv', './submission_aug.csv', './submission_electra_qwen_v3.csv', './submission_electra_qwen_paragraph.csv']


In [None]:
# 모든 파일 예측값 
dfs = [pd.read_csv(f) for f in csv_files]
merged = dfs[0][['ID']].copy()
for i, df in enumerate(dfs):
    merged[f'pred_{i}'] = df['generated'] 


In [129]:
merged

Unnamed: 0,ID,pred_0,pred_1,pred_2
0,TEST_0000,0.000006,0.982538,0.867556
1,TEST_0001,0.998600,0.944010,0.999288
2,TEST_0002,0.000005,0.000183,0.003713
3,TEST_0003,0.999969,0.071127,0.998774
4,TEST_0004,0.000015,0.248156,0.999053
...,...,...,...,...
1957,TEST_1957,0.000006,0.001073,0.991393
1958,TEST_1958,0.000008,0.036138,0.993752
1959,TEST_1959,0.000006,0.000368,0.024655
1960,TEST_1960,0.000008,0.000229,0.009800


In [None]:
# hard voting
threshold=0.5
pred_classes = (merged.filter(like='pred_') > threshold).astype(int)
merged['hard_vote'] = pred_classes.mode(axis=1)[0]


In [131]:
merged

Unnamed: 0,ID,pred_0,pred_1,pred_2,hard_vote
0,TEST_0000,0.000006,0.982538,0.867556,1
1,TEST_0001,0.998600,0.944010,0.999288,1
2,TEST_0002,0.000005,0.000183,0.003713,0
3,TEST_0003,0.999969,0.071127,0.998774,1
4,TEST_0004,0.000015,0.248156,0.999053,0
...,...,...,...,...,...
1957,TEST_1957,0.000006,0.001073,0.991393,0
1958,TEST_1958,0.000008,0.036138,0.993752,0
1959,TEST_1959,0.000006,0.000368,0.024655,0
1960,TEST_1960,0.000008,0.000229,0.009800,0


In [132]:
merged["hard_vote"].value_counts()

hard_vote
0    1191
1     771
Name: count, dtype: int64

In [None]:
# soft voting
# 평균
def mean_wo_amb_by_hardvote(row, hard_vote, low=0.2, high=0.6):
    """
    row: 확률 예측값 series (pred_로 시작하는 컬럼)
    hard_vote: 해당 row의 하드보팅 결과
    """
    preds = row.values
    if hard_vote == 0.0:
        filtered = preds[preds <= low]
    else:
        filtered = preds[preds >= high]
    if len(filtered) == 1.0:
        return np.nan
    return filtered.mean()

pred_cols = merged.filter(like='pred_').columns
merged['mean_wo_amb'] = merged.apply(
    lambda row: mean_wo_amb_by_hardvote(row[pred_cols], row['hard_vote'], low=0.2, high=0.6),
    axis=1
)


  return filtered.mean()
  ret = ret / rcount


In [None]:
# soft voting
# weights: 각 예측값에 대한 가중치 부여
def weighted_mean_wo_amb_by_hardvote(row, hard_vote, weights, low=0.2, high=0.6):
    preds = row.values
    weights = np.array(weights)
    if hard_vote == 0:
        mask = preds <= low
    else:
        mask = preds >= high
    filtered_preds = preds[mask]
    filtered_weights = weights[mask]
    if len(filtered_preds) == 1:
        return np.nan
    if filtered_weights.sum() == 0:
        return np.nan
    return np.average(filtered_preds, weights=filtered_weights)


In [None]:
pred_cols = merged.filter(like='pred_').columns
weights = [0.2, 0.3, 0.5] 

merged['weighted_mean_wo_amb'] = merged.apply(
    lambda row: weighted_mean_wo_amb_by_hardvote(row[pred_cols], row['hard_vote'], weights, low=0.2, high=0.6),
    axis=1
)


In [135]:
merged

Unnamed: 0,ID,pred_0,pred_1,pred_2,hard_vote,weighted_mean_wo_amb
0,TEST_0000,0.000006,0.982538,0.867556,1,0.910674
1,TEST_0001,0.998600,0.944010,0.999288,1,0.982567
2,TEST_0002,0.000005,0.000183,0.003713,0,0.001913
3,TEST_0003,0.999969,0.071127,0.998774,1,0.999115
4,TEST_0004,0.000015,0.248156,0.999053,0,
...,...,...,...,...,...,...
1957,TEST_1957,0.000006,0.001073,0.991393,0,0.000646
1958,TEST_1958,0.000008,0.036138,0.993752,0,0.021686
1959,TEST_1959,0.000006,0.000368,0.024655,0,0.012439
1960,TEST_1960,0.000008,0.000229,0.009800,0,0.004970


In [136]:
merged.isna().sum()

ID                       0
pred_0                   0
pred_1                   0
pred_2                   0
hard_vote                0
weighted_mean_wo_amb    75
dtype: int64

In [None]:
def fill_mean_wo_amb(row):
    if np.isnan(row['weighted_mean_wo_amb']):
        pred_cols = [col for col in row.index if col.startswith('pred_')]
        preds = [row[col] for col in pred_cols]
        # 예측값 중 최대값 부여
        if row['hard_vote'] == 1.0:
            return np.max(preds)
        else:
        # 예측값 중 최소값 부여
            return np.min(preds)
    return row['weighted_mean_wo_amb']

merged['weighted_mean_wo_amb'] = merged.apply(fill_mean_wo_amb, axis=1)


In [138]:
merged.isna().sum()

ID                      0
pred_0                  0
pred_1                  0
pred_2                  0
hard_vote               0
weighted_mean_wo_amb    0
dtype: int64

In [139]:
merged

Unnamed: 0,ID,pred_0,pred_1,pred_2,hard_vote,weighted_mean_wo_amb
0,TEST_0000,0.000006,0.982538,0.867556,1,0.910674
1,TEST_0001,0.998600,0.944010,0.999288,1,0.982567
2,TEST_0002,0.000005,0.000183,0.003713,0,0.001913
3,TEST_0003,0.999969,0.071127,0.998774,1,0.999115
4,TEST_0004,0.000015,0.248156,0.999053,0,0.000015
...,...,...,...,...,...,...
1957,TEST_1957,0.000006,0.001073,0.991393,0,0.000646
1958,TEST_1958,0.000008,0.036138,0.993752,0,0.021686
1959,TEST_1959,0.000006,0.000368,0.024655,0,0.012439
1960,TEST_1960,0.000008,0.000229,0.009800,0,0.004970


In [None]:
merged[['ID', 'weighted_mean_wo_amb']].rename(columns={'weighted_mean_wo_amb':'generated'}).to_csv('submission.csv', index=False)