In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np



In [None]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook point below that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the fonts-nanum package.
!sudo apt-get install -y fonts-nanum
# Rebuild the system font cache (fc-cache: -f force, -v verbose).
!sudo fc-cache -fv
# Delete matplotlib's cache directory under the home folder.
!rm -rf ~/.cache/matplotlib

# Font file used for the explicit FontProperties object below.
path = '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf'
import matplotlib.font_manager as fm
font10 = fm.FontProperties(fname=path, size=10)

# matplotlib.pyplot was imported before the fonts were installed, so the
# in-memory font manager does not know about them; deleting the on-disk cache
# does not refresh it. Register every font file found in the Nanum directory so
# the family name below resolves without restarting the runtime.
_nanum_font_dir = '/usr/share/fonts/truetype/nanum'
for _font_file in fm.findSystemFonts(fontpaths=[_nanum_font_dir]):
    fm.fontManager.addfont(_font_file)

plt.rc('font', family='NanumBarunGothic')
# Draw minus signs with the ASCII hyphen, matching the font setup used later
# in this notebook.
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# parquet -> 대용량 데이터셋을 다루기 위한 확장자.
# csv -> 24만 건 => 1. 신용정보, ..., 8.
# 열 개수 -> 700

In [None]:
# Google Drive locations of the train and test CSV files.
train_path, test_path = (
    "/content/drive/MyDrive/card_train.csv",
    "/content/drive/MyDrive/card_test.csv",
)

In [None]:
# Read the training CSV into a DataFrame. The 'utf-8-sig' codec decodes UTF-8
# and drops a leading byte-order mark when one is present.
train_df = pd.read_csv(train_path, encoding = 'utf-8-sig')

## 키워드별 Feature Importance 분석

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
import numpy as np


# ------------------------------------------------


# 1. Load dataset
# NOTE(review): this re-reads the file at train_path and rebinds train_df,
# which an earlier cell has already loaded with the same call.
train_df = pd.read_csv(train_path, encoding = 'utf-8-sig')
# Target: the "Segment" column.
y = train_df["Segment"]
# Features: every column except the ones listed; errors="ignore" means a listed
# column that is absent from the frame is skipped instead of raising.
X_full = train_df.drop(columns=["ID", "Unnamed: 0.1", "Segment.1", "Segment"], errors="ignore")

# 2. Feature selection by keyword + optional time_filters
def select_features_by_keywords(df, keywords, time_filters=None):
    """Return the columns of ``df`` whose names pass the keyword filters.

    A column is kept when its name contains at least one entry of
    ``keywords``. When ``time_filters`` is non-empty the name must also
    contain at least one entry of ``time_filters``; ``None`` or an empty
    collection turns that second check off.

    The result is ``df`` indexed by the list of kept column names, in their
    original order.
    """
    def _is_selected(name):
        if not any(k in name for k in keywords):
            return False
        if time_filters:
            return any(t in name for t in time_filters)
        return True

    kept = [name for name in df.columns if _is_selected(name)]
    return df[kept]

# 3. Train & evaluate
def train_model_with_keywords(X, y):
    """Fit a LightGBM classifier on a stratified 80/20 split and score it.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. A copy is encoded, so the caller's frame is untouched.
    y : array-like
        Target labels; also used to stratify the split.

    Returns
    -------
    (model, score)
        The fitted ``LGBMClassifier`` and the macro-averaged F1 score on the
        held-out 20% validation split.
    """
    X = X.copy()
    # Fill and label-encode object columns only. Filling the whole frame with
    # the string "missing" would turn every numeric column that contains a NaN
    # into an object column, which would then be label-encoded as strings and
    # lose its numeric ordering. Numeric NaN is left in place; LightGBM treats
    # NaN as a missing value by default.
    for col in X.select_dtypes(include="object").columns:
        X[col] = LabelEncoder().fit_transform(X[col].fillna("missing").astype(str))
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    model = LGBMClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = f1_score(y_val, y_pred, average="macro")
    return model, score

# 4. Importance plot
def plot_feature_importance(model, feature_names, top_n=30):
    """Show a horizontal bar chart of the model's largest feature importances.

    Parameters
    ----------
    model
        Fitted estimator exposing ``feature_importances_``.
    feature_names
        Indexable sequence of names, positionally aligned with the importances.
    top_n : int
        Number of highest-importance features to draw.
    """
    scores = model.feature_importances_
    # argsort is ascending, so the last ``top_n`` positions hold the largest
    # scores; bars are drawn in that (ascending) order.
    top_positions = np.argsort(scores)[-top_n:]
    labels = [feature_names[pos] for pos in top_positions]

    fig, ax = plt.subplots(figsize=(8, 10))
    sns.barplot(x=scores[top_positions], y=labels, ax=ax)
    ax.set_title("Top Feature Importances")
    ax.set_xlabel("Importance")
    ax.set_ylabel("Feature")
    fig.tight_layout()
    plt.show()

# 5. Run: choose the keywords and the time-filter condition ----------------------------------------------------
# A column is selected when its name contains any of these keywords.
keywords = ["이용", "금액", "연체"]
time_filters = []  # leave as [] to apply no period condition

X_selected = select_features_by_keywords(X_full, keywords, time_filters=time_filters)
model, score = train_model_with_keywords(X_selected, y)
print(f"Macro F1 score: {score:.4f}")
# Feature names are taken from the selected frame's column index.
plot_feature_importance(model, X_selected.columns)


## 전체 Feature Importance 상위 50개 분석

In [None]:
# Install packages: lightgbm via pip (-q: quiet) and fonts-nanum via apt-get.
!pip install lightgbm -q
!apt-get -qq install -y fonts-nanum

# 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
import re # Import the re module for regular expressions

# Korean font setup
font_path = "/usr/share/fonts/truetype/nanum/NanumGothic.ttf"
# Register the font file with the running font manager. Setting
# rcParams["font.family"] to the font's name only works if the manager knows a
# font by that name, and a font installed after matplotlib was imported is not
# picked up automatically, so text that relies on rcParams (e.g. tick labels)
# would otherwise fall back to the default font.
fm.fontManager.addfont(font_path)
fontprop = fm.FontProperties(fname=font_path)
plt.rcParams["font.family"] = fontprop.get_name()
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams["axes.unicode_minus"] = False

# Load the data
train_df = pd.read_csv("/content/drive/MyDrive/card_train.csv")
# Target: the "Segment" column.
y = train_df["Segment"]
# Features: every column except the ones listed; errors="ignore" means a listed
# column that is absent from the frame is skipped instead of raising.
X_full = train_df.drop(columns=["ID", "Unnamed: 0.1", "Segment.1", "Segment"], errors="ignore")

# Function to sanitize column names
def sanitize_column_names(df):
    """Replace non-word characters in column names with underscores, in place.

    Every run of characters that is not a word character (letters, digits and
    underscore; Unicode letters such as Hangul count as word characters) is
    collapsed into a single ``_``. Non-string labels are converted with
    ``str`` first. If two columns end up with the same sanitized name, later
    ones get a numeric suffix (``name_1``, ``name_2``, ...) so the resulting
    labels stay unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column names.
    """
    taken = set()
    new_cols = []
    for col in df.columns:
        # Replace special characters and spaces with underscores
        base = re.sub(r'[^\w_]+', '_', str(col))
        # Distinct originals such as "a b" and "a-b" sanitize to the same
        # string; keep the first as-is and suffix the rest until unique.
        candidate = base
        suffix = 0
        while candidate in taken:
            suffix += 1
            candidate = f"{base}_{suffix}"
        taken.add(candidate)
        new_cols.append(candidate)
    df.columns = new_cols
    return df

# Sanitize column names; X_full is rebound to the frame returned by the helper.
X_full = sanitize_column_names(X_full)


# Extract the top-N features of the full feature set, ranked by importance
def get_top_features(X, y, top_n=50):
    """Rank features with a LightGBM model and keep the ``top_n`` most important.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. A copy is encoded, so the caller's frame is untouched.
    y : array-like
        Target labels; also used to stratify the split.
    top_n : int
        Number of highest-importance columns to keep.

    Returns
    -------
    (X_top, y, top_columns)
        The encoded frame restricted to the selected columns (ordered by
        ascending importance), ``y`` unchanged, and the selected column labels.
        Object columns in ``X_top`` are label-encoded; numeric columns keep
        their values, including NaN.
    """
    X = X.copy()
    # Fill and label-encode object columns only. Filling the whole frame with
    # the string "missing" would turn every numeric column that contains a NaN
    # into an object column, which would then be label-encoded as strings and
    # lose its numeric ordering. Numeric NaN is left in place; LightGBM treats
    # NaN as a missing value by default.
    for col in X.select_dtypes(include="object").columns:
        X[col] = LabelEncoder().fit_transform(X[col].fillna("missing").astype(str))

    # Only the training part of the split is needed for ranking; the split is
    # still performed so the ranking model is fit on the same 80% of rows.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    model = LGBMClassifier(random_state=42)
    model.fit(X_train, y_train)
    importances = model.feature_importances_
    # argsort is ascending, so the last ``top_n`` positions are the largest.
    top_indices = np.argsort(importances)[-top_n:]
    return X.iloc[:, top_indices], y, X.columns[top_indices]

# Model training and evaluation
def train_model(X, y):
    """Fit a LightGBM classifier on a stratified 80/20 split and score it.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. A copy is encoded, so the caller's frame is untouched.
    y : array-like
        Target labels; also used to stratify the split.

    Returns
    -------
    (model, score)
        The fitted ``LGBMClassifier`` and the macro-averaged F1 score on the
        held-out 20% validation split.
    """
    X = X.copy()
    # Fill and label-encode object columns only. Filling the whole frame with
    # the string "missing" would turn every numeric column that contains a NaN
    # into an object column, which would then be label-encoded as strings and
    # lose its numeric ordering. Numeric NaN is left in place; LightGBM treats
    # NaN as a missing value by default.
    for col in X.select_dtypes(include="object").columns:
        X[col] = LabelEncoder().fit_transform(X[col].fillna("missing").astype(str))

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    model = LGBMClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = f1_score(y_val, y_pred, average="macro")
    return model, score

# Importance visualization
def plot_feature_importance(model, feature_names, top_n=30):
    """Show a horizontal bar chart of the model's largest feature importances.

    The title and axis labels are rendered with the module-level ``fontprop``
    font properties.

    Parameters
    ----------
    model
        Fitted estimator exposing ``feature_importances_``.
    feature_names
        Indexable sequence of names, positionally aligned with the importances.
    top_n : int
        Number of highest-importance features to draw.
    """
    scores = model.feature_importances_
    # argsort is ascending, so the last ``top_n`` positions hold the largest
    # scores; bars are drawn in that (ascending) order.
    top_positions = np.argsort(scores)[-top_n:]
    # The labels are whatever names the caller passes in; mapping sanitized
    # names back to the originals is not done here.
    labels = [feature_names[pos] for pos in top_positions]

    fig, ax = plt.subplots(figsize=(8, 10))
    sns.barplot(x=scores[top_positions], y=labels, ax=ax)
    ax.set_title("Top Feature Importances", fontproperties=fontprop)
    ax.set_xlabel("Importance", fontproperties=fontprop)
    ax.set_ylabel("Feature", fontproperties=fontprop)
    fig.tight_layout()
    plt.show()

# Run
# Use the sanitized X_full
# get_top_features returns y unchanged, so rebinding y here keeps the same labels.
X_top50, y, top_columns = get_top_features(X_full, y, top_n=50)
model_top50, score_top50 = train_model(X_top50, y)

print(f"Macro F1 score with TOP 50 features: {score_top50:.4f}")
# Pass the top_columns with sanitized names to the plotting function
plot_feature_importance(model_top50, top_columns)