In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt
plt.rcParams['font.family'] ='Malgun Gothic'
plt.rcParams['axes.unicode_minus'] =False

In [None]:
# 클래스별 개수
class_counts = df['clicked'].value_counts()
print("클래스별 개수:\n", class_counts)

# 클래스별 비율
class_ratio = df['clicked'].value_counts(normalize=True)
print("클래스별 비율:\n", class_ratio)

데이터 불균형이 있어서 임계값에 변화를 줘야하는데,  
임계값을 높게 설정하면 정밀도가 높아지고 재현율이 낮아짐  
FP(오탐)을 줄이는게 맞는 방향이라 판단해서 임계값을 바꿀거임

___________________________________________________________________

# 여기가 진짜!

### LGB

In [None]:
df = pd.read_csv("recommend_ml_dataset_v2.csv")
df.drop("Unnamed: 0", axis=1, inplace=True)

X = df.drop(["clicked","is_same_industry"], axis=1)
Y = df["clicked"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y
)

In [None]:
from lightgbm import LGBMClassifier

# 모델 파라미터 설정
model = LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=7,
    min_child_samples=50,
    reg_alpha=0.1,
    colsample_bytree=1.0,
    subsample=0.8,
    random_state=42,
    class_weight='balanced'
)

# 학습
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc'
)


In [None]:
y_proba = model.predict_proba(X_test)[:, 1]
X_test['y_proba'] = y_proba

train_features = model.booster_.feature_name()
X_test_aligned = X_test[train_features]  

y_pred_binary = model.predict(X_test_aligned)

X_test['y_true'] = y_test

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score

# Top-K Precision@K 
def precision_at_k(df, k=5):
    result = []
    for user, group in df.groupby(["userPnl", "asset", "investScore"]):
        sorted_group = group.sort_values("y_proba", ascending=False).head(k)
        precision = sorted_group["y_true"].sum() / k
        result.append(precision)
    return np.mean(result)

# MRR (Mean Reciprocal Rank)
def mean_reciprocal_rank(df):
    result = []
    for user, group in df.groupby(["userPnl", "asset", "investScore"]):
        group_sorted = group.sort_values("y_proba", ascending=False).reset_index(drop=True)
        ranks = group_sorted.index[group_sorted["y_true"] == 1]
        if len(ranks) > 0:
            result.append(1.0 / (ranks[0] + 1))
        else:
            result.append(0.0)
    return np.mean(result)

# NDCG@K
def ndcg_at_k(df, k=5):
    scores = []
    for _, group in df.groupby(["userPnl", "asset", "investScore"]):
        true_relevance = group.sort_values("y_proba", ascending=False)["y_true"].values[:k]
        ndcg = ndcg_score([true_relevance], [sorted(true_relevance, reverse=True)])
        scores.append(ndcg)
    return np.mean(scores)


print("Precision@5:", precision_at_k(X_test, k=5))
print("MRR:", mean_reciprocal_rank(X_test))
print("NDCG@5:", ndcg_at_k(X_test, k=5))

In [None]:
import shap
import matplotlib.pyplot as plt

plt.style.use('dark_background')

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

shap.summary_plot(shap_values, X_test, plot_size=(10, 6), show=False)

fig = plt.gcf()
ax = plt.gca()
fig.patch.set_facecolor('black')
ax.set_facecolor('black')

for text in ax.texts:
    text.set_color('white')

ax.title.set_color('white')
ax.xaxis.label.set_color('white')
ax.yaxis.label.set_color('white')
ax.tick_params(axis='x', colors='white')
ax.tick_params(axis='y', colors='white')

plt.tight_layout()
plt.show()


### XGB

In [None]:
df = pd.read_csv("recommend_ml_dataset_v2.csv")
df.drop("Unnamed: 0", axis=1, inplace=True)

In [None]:
X = df.drop(["clicked","is_same_industry"], axis=1)
Y = df["clicked"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y
)



In [None]:
from xgboost import XGBClassifier

model_xgb = XGBClassifier(
    scale_pos_weight=1,  
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=10,
    colsample_bytree=1.0,
    subsample=0.8,
    early_stopping_rounds=50,
    use_label_encoder=False,
    eval_metric='auc'
)

model_xgb.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)]
)


[0]	validation_0-auc:0.92109
[1]	validation_0-auc:0.92814
[2]	validation_0-auc:0.93060
[3]	validation_0-auc:0.93199
[4]	validation_0-auc:0.93267


Parameters: { "use_label_encoder" } are not used.



[5]	validation_0-auc:0.93294
[6]	validation_0-auc:0.93238
[7]	validation_0-auc:0.93182
[8]	validation_0-auc:0.93254
[9]	validation_0-auc:0.93274
[10]	validation_0-auc:0.93271
[11]	validation_0-auc:0.93261
[12]	validation_0-auc:0.93271
[13]	validation_0-auc:0.93314
[14]	validation_0-auc:0.93416
[15]	validation_0-auc:0.93395
[16]	validation_0-auc:0.93441
[17]	validation_0-auc:0.93480
[18]	validation_0-auc:0.93490
[19]	validation_0-auc:0.93484
[20]	validation_0-auc:0.93523
[21]	validation_0-auc:0.93491
[22]	validation_0-auc:0.93546
[23]	validation_0-auc:0.93527
[24]	validation_0-auc:0.93553
[25]	validation_0-auc:0.93544
[26]	validation_0-auc:0.93529
[27]	validation_0-auc:0.93543
[28]	validation_0-auc:0.93544
[29]	validation_0-auc:0.93557
[30]	validation_0-auc:0.93571
[31]	validation_0-auc:0.93574
[32]	validation_0-auc:0.93585
[33]	validation_0-auc:0.93578
[34]	validation_0-auc:0.93604
[35]	validation_0-auc:0.93599
[36]	validation_0-auc:0.93617
[37]	validation_0-auc:0.93622
[38]	validation

In [None]:
train_features = model_xgb.get_booster().feature_names
X_test_aligned = X_test[train_features]
y_proba = model_xgb.predict_proba(X_test_aligned)[:, 1]
X_test_aligned['y_proba'] = y_proba


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score

# Top-K Precision@K 
def precision_at_k(df, k=5):
    result = []
    for user, group in df.groupby(["userPnl", "asset", "investScore"]):
        sorted_group = group.sort_values("y_proba", ascending=False).head(k)
        precision = sorted_group["y_true"].sum() / k
        result.append(precision)
    return np.mean(result)

# MRR (Mean Reciprocal Rank)
def mean_reciprocal_rank(df):
    result = []
    for user, group in df.groupby(["userPnl", "asset", "investScore"]):
        group_sorted = group.sort_values("y_proba", ascending=False).reset_index(drop=True)
        ranks = group_sorted.index[group_sorted["y_true"] == 1]
        if len(ranks) > 0:
            result.append(1.0 / (ranks[0] + 1))
        else:
            result.append(0.0)
    return np.mean(result)

# NDCG@K
def ndcg_at_k(df, k=5):
    scores = []
    for _, group in df.groupby(["userPnl", "asset", "investScore"]):
        true_relevance = group.sort_values("y_proba", ascending=False)["y_true"].values[:k]
        ndcg = ndcg_score([true_relevance], [sorted(true_relevance, reverse=True)])
        scores.append(ndcg)
    return np.mean(scores)

print("Precision@5:", precision_at_k(X_test, k=5))
print("MRR:", mean_reciprocal_rank(X_test))
print("NDCG@5:", ndcg_at_k(X_test, k=5))

In [None]:
import shap
import matplotlib.pyplot as plt

plt.style.use('dark_background')

explainer = shap.TreeExplainer(model_xgb)
shap_values = explainer.shap_values(X_test)

shap.summary_plot(shap_values, X_test, show=False)

fig = plt.gcf()
ax = plt.gca()
fig.patch.set_facecolor('black')
ax.set_facecolor('black')

for text in ax.texts:
    text.set_color('white')

ax.title.set_color('white')
ax.xaxis.label.set_color('white')
ax.yaxis.label.set_color('white')
ax.tick_params(axis='x', colors='white')
ax.tick_params(axis='y', colors='white')

plt.tight_layout()
plt.show()


### RF

In [None]:
df = pd.read_csv("recommend_ml_dataset_v2.csv")
df.drop("Unnamed: 0", axis=1, inplace=True)

In [None]:

X = df.drop(["clicked","is_same_industry"], axis=1)
Y = df["clicked"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=10,
    class_weight='balanced',       
    max_features='sqrt',           
    bootstrap=True,                
    random_state=42,
    n_jobs=-1 # 병렬 처리 
)

model_rf.fit(X_train, y_train)

In [None]:
import shap
import matplotlib.pyplot as plt

plt.style.use('dark_background')

explainer = shap.TreeExplainer(model_rf)
explanation = explainer(X_test)

shap_values = explanation.values

if len(shap_values.shape) == 3:  # (samples, features, classes)
    shap.summary_plot(shap_values[:, :, 1], X_test, show=False)
else:
    shap.summary_plot(shap_values, X_test, show=False)

fig = plt.gcf()
ax = plt.gca()
fig.patch.set_facecolor('black')
ax.set_facecolor('black')

for text in ax.texts:
    text.set_color('white')

ax.title.set_color('white')
ax.xaxis.label.set_color('white')
ax.yaxis.label.set_color('white')
ax.tick_params(axis='x', colors='white')
ax.tick_params(axis='y', colors='white')

plt.tight_layout()
plt.show()


In [None]:
y_proba = model_rf.predict_proba(X_test)[:, 1]
X_test['y_proba'] = y_proba
train_features = X_train.columns.tolist()
X_test_aligned = X_test[train_features]
y_pred_binary = model_rf.predict(X_test_aligned)
X_test['y_true'] = y_test


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score

# Top-K Precision@K 
def precision_at_k(df, k=5):
    result = []
    for user, group in df.groupby(["userPnl", "asset", "investScore"]):
        sorted_group = group.sort_values("y_proba", ascending=False).head(k)
        precision = sorted_group["y_true"].sum() / k
        result.append(precision)
    return np.mean(result)

# MRR (Mean Reciprocal Rank)
def mean_reciprocal_rank(df):
    result = []
    for user, group in df.groupby(["userPnl", "asset", "investScore"]):
        group_sorted = group.sort_values("y_proba", ascending=False).reset_index(drop=True)
        ranks = group_sorted.index[group_sorted["y_true"] == 1]
        if len(ranks) > 0:
            result.append(1.0 / (ranks[0] + 1))
        else:
            result.append(0.0)
    return np.mean(result)

# NDCG@K
def ndcg_at_k(df, k=5):
    scores = []
    for _, group in df.groupby(["userPnl", "asset", "investScore"]):
        true_relevance = group.sort_values("y_proba", ascending=False)["y_true"].values[:k]
        ndcg = ndcg_score([true_relevance], [sorted(true_relevance, reverse=True)])
        scores.append(ndcg)
    return np.mean(scores)

print("Precision@5:", precision_at_k(X_test, k=5))
print("MRR:", mean_reciprocal_rank(X_test))
print("NDCG@5:", ndcg_at_k(X_test, k=5))

# 모델

In [None]:
import pickle

# 모델 저장
with open("lgbm_model2.pkl", "wb") as f:
    pickle.dump(model, f)

# 모델 불러오기
with open("lgbm_model2.pkl", "rb") as f:
    loaded_model = pickle.load(f)


In [None]:
y_test.head(30)

In [None]:
sample_idx = 183366
sample_data = X_test.loc[[sample_idx]]

proba = model.predict_proba(sample_data)[:, 1]  # 클릭 확률값 추출

print("이 샘플의 클릭 확률:", proba[0])


________________________________________________________

# Try

피쳐 엔지니어링 추가

In [None]:
df = pd.read_csv("C:/Users/user/fin_project/db/recommend_ml_dataset.csv")
df.drop(["user_id", "news_id"], axis=1, inplace=True)

In [None]:
df['asset_log'] = np.log1p(df['asset'])
df['is_same_stock_and_industry'] = df['is_same_stock'] * df['is_same_industry']
df['is_same_stock_or_industry'] = df['is_same_industry'] + df['is_same_stock']
df['userPnl_x_is_same_stock'] = df['userPnl'] * df['is_same_stock']
df['userPnl_x_is_same_industry'] = df['userPnl'] * df['is_same_industry']
df['asset_log_x_is_same_stock'] = df['asset_log'] * df['is_same_stock']
df['asset_log_x_is_same_industry'] = df['asset_log'] * df['is_same_industry']
df['investScore_x_is_same_stock'] = df['investScore'] * df['is_same_stock']
df['investScore_x_is_same_industry'] = df['investScore'] * df['is_same_industry']

topic_cols = [f'topic_{i}' for i in range(1, 10)]
def get_main_topic_score(row):
    topic_num = int(row['main_topic'])
    return row[f'topic_{topic_num}']
df['score_of_main_topic'] = df.apply(get_main_topic_score, axis=1)

def get_other_topics_stats(row, stat_type='mean'):
    main_topic_num = int(row['main_topic'])
    other_topic_scores = []
    for col in topic_cols:
        current_topic_num = int(col.split('_')[1])
        if current_topic_num != main_topic_num:
            other_topic_scores.append(row[col])
    if not other_topic_scores: return np.nan
    if stat_type == 'mean': return np.mean(other_topic_scores)
    elif stat_type == 'max': return np.max(other_topic_scores)
    elif stat_type == 'min': return np.min(other_topic_scores)
    elif stat_type == 'std': return np.std(other_topic_scores) if len(other_topic_scores) > 1 else 0.0
    return np.nan

df['avg_score_other_topics'] = df.apply(lambda row: get_other_topics_stats(row, 'mean'), axis=1)
df['max_score_other_topics'] = df.apply(lambda row: get_other_topics_stats(row, 'max'), axis=1)
df['min_score_other_topics'] = df.apply(lambda row: get_other_topics_stats(row, 'min'), axis=1)
df['std_score_other_topics'] = df.apply(lambda row: get_other_topics_stats(row, 'std'), axis=1)
df['diff_main_vs_avg_other_topics'] = df['score_of_main_topic'] - df['avg_score_other_topics']
df['diff_main_vs_max_other_topics'] = df['score_of_main_topic'] - df['max_score_other_topics']


In [None]:
df.info()

In [None]:
from imblearn.over_sampling import SMOTE

X = df.drop(["clicked", "asset"], axis=1)
Y = df["clicked"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42
)

# SMOTE 적용
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("리샘플링 후 클래스 분포:", pd.Series(y_train_res).value_counts())

In [None]:
from lightgbm import LGBMClassifier

# 모델 파라미터 설정
model = LGBMClassifier(
    random_state=42,
    class_weight='balanced',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    early_stopping_rounds=50
)

# 학습
model.fit(
    X_train_res, y_train_res,
    eval_set=[(X_test, y_test)],
    eval_metric='auc'
)


In [None]:
from sklearn.metrics import precision_recall_curve

y_proba = model.predict_proba(X_test)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
best_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[best_idx]

print(f"최적 임계값: {optimal_threshold:.2f}")
y_pred = (y_proba >= optimal_threshold).astype(int)

# 성능 리포트
print(f"정밀도: {precision_score(y_test, y_pred):.4f}")
print(f"재현율: {recall_score(y_test, y_pred):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}")
print("혼동행렬")
print(confusion_matrix(y_test, y_pred))



In [None]:
import shap

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)


In [None]:
import matplotlib.pyplot as plt

importances = model.feature_importances_
feature_names = X_train.columns

indices = importances.argsort()[::-1]

plt.figure(figsize=(8, 10))
plt.title('Feature Importances (LightGBM)')
plt.barh(range(len(importances)), importances[indices], align='center')
plt.yticks(range(len(importances)), [feature_names[i] for i in indices])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.gca().invert_yaxis()  

plt.tight_layout()
plt.show()
