In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("new_df.csv", encoding='utf-8')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 텍스트 데이터와 라벨 분리
texts = df['preprocessed_text']  # 'new_labeled_data' 대신 'df' 사용
labels = df['label']  # 동일하게 'df'에서 라벨 가져오기

# TF-IDF 벡터화
vectorizer = TfidfVectorizer(max_features=30)  # 상위 30개 단어만 사용
vectored_df = vectorizer.fit_transform(texts)

# 희소 행렬 -> 밀집 행렬 변환
dense_df = vectored_df.todense()

# 특징 이름 가져오기
feature_names = vectorizer.get_feature_names_out()

# DataFrame으로 변환
df_tfidf = pd.DataFrame(dense_df, columns=feature_names)

print("TF-IDF 벡터화 결과:")
print(df_tfidf)


TF-IDF 벡터화 결과:
        건강        검정   경보       경찰청   관리   그늘   금지        되다  물놀이       바라다  \
0      0.0  0.000000  0.0  0.707698  0.0  0.0  0.0  0.000000  0.0  0.000000   
1      0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.313276   
2      0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.855446  0.0  0.517892   
3      0.0  0.627727  0.0  0.550896  0.0  0.0  0.0  0.000000  0.0  0.000000   
4      0.0  0.849887  0.0  0.372932  0.0  0.0  0.0  0.000000  0.0  0.000000   
...    ...       ...  ...       ...  ...  ...  ...       ...  ...       ...   
21835  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.171697   
21836  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.654018   
21837  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.733854  0.0  0.444279   
21838  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.491296   
21839  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.654018   

       ...   준수        지역   착용      

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# 라벨 변환
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['label'])  # 라벨을 0부터 시작하는 정수로 변환

# Train/Validation/Test 데이터 분리
X = df_tfidf
y = y_encoded  # 변환된 라벨 사용

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Train/Validation/Test 데이터 분리 완료")
print(f"훈련 데이터 크기: {X_train.shape}, {y_train.shape}")
print(f"검증 데이터 크기: {X_valid.shape}, {y_valid.shape}")
print(f"테스트 데이터 크기: {X_test.shape}, {y_test.shape}")


Train/Validation/Test 데이터 분리 완료
훈련 데이터 크기: (13104, 30), (13104,)
검증 데이터 크기: (4368, 30), (4368,)
테스트 데이터 크기: (4368, 30), (4368,)


In [6]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

# 기본 XGBoost 모델 생성 및 학습
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# 학습 데이터 평가
y_train_pred = xgb_model.predict(X_train)
print("Train Set Performance:")
print(classification_report(y_train, y_train_pred))

# 검증 데이터 평가
y_valid_pred = xgb_model.predict(X_valid)
print("Validation Set Performance:")
print(classification_report(y_valid, y_valid_pred))


Parameters: { "use_label_encoder" } are not used.



Train Set Performance:
              precision    recall  f1-score   support

           0       0.74      0.73      0.73      1498
           1       0.95      0.97      0.96      1162
           2       0.57      0.73      0.64       753
           3       0.97      0.91      0.94       594
           4       0.72      0.84      0.78       555
           5       0.97      0.96      0.96       497
           6       0.88      0.75      0.81       455
           7       0.99      0.99      0.99      1256
           8       0.77      0.82      0.79       932
           9       0.88      0.75      0.81       981
          10       0.95      0.42      0.58      1384
          11       0.70      0.98      0.82      1880
          12       0.85      0.79      0.82      1157

    accuracy                           0.81     13104
   macro avg       0.84      0.82      0.82     13104
weighted avg       0.84      0.81      0.81     13104

Validation Set Performance:
              precision    r

In [7]:
from sklearn.model_selection import GridSearchCV

# 하이퍼파라미터 그리드 설정
param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# GridSearchCV로 최적의 파라미터 찾기
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# 최적의 파라미터와 점수 출력
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Validation Score:", grid_search.best_score_)


Fitting 3 folds for each of 540 candidates, totalling 1620 fits


Parameters: { "use_label_encoder" } are not used.



Best Hyperparameters: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 1.0}
Best Validation Score: 0.7018467643467643


In [8]:
# 최적의 파라미터로 XGBoost 모델 재학습
best_xgb_model = grid_search.best_estimator_
best_xgb_model.fit(X_train, y_train)

# 학습 데이터 평가
y_train_pred = best_xgb_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Set Performance:")
print(classification_report(y_train, y_train_pred))
print("Train Accuracy: {:.3f}".format(train_accuracy))

# 검증 데이터 평가
y_valid_pred = best_xgb_model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
print("Validation Set Performance:")
print(classification_report(y_valid, y_valid_pred))
print("Validation Accuracy: {:.3f}".format(valid_accuracy))


Parameters: { "use_label_encoder" } are not used.



Train Set Performance:
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      1498
           1       0.96      0.97      0.96      1162
           2       0.57      0.74      0.64       753
           3       0.97      0.91      0.94       594
           4       0.73      0.85      0.78       555
           5       0.97      0.96      0.96       497
           6       0.87      0.76      0.81       455
           7       0.99      0.99      0.99      1256
           8       0.78      0.83      0.80       932
           9       0.91      0.77      0.83       981
          10       0.95      0.42      0.59      1384
          11       0.70      0.98      0.82      1880
          12       0.88      0.80      0.84      1157

    accuracy                           0.82     13104
   macro avg       0.85      0.82      0.82     13104
weighted avg       0.84      0.82      0.82     13104

Train Accuracy: 0.821
Validation Set Performance:
      

In [9]:
# 테스트 데이터 평가
y_test_pred = best_xgb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Set Performance:")
print(classification_report(y_test, y_test_pred))
print("Test Accuracy: {:.3f}".format(test_accuracy))


Test Set Performance:
              precision    recall  f1-score   support

           0       0.53      0.54      0.54       485
           1       0.86      0.89      0.87       436
           2       0.40      0.58      0.47       214
           3       0.88      0.88      0.88       208
           4       0.60      0.60      0.60       209
           5       0.86      0.90      0.88       149
           6       0.57      0.35      0.43       149
           7       0.97      0.97      0.97       421
           8       0.68      0.65      0.66       338
           9       0.67      0.60      0.63       319
          10       0.87      0.39      0.54       424
          11       0.71      0.98      0.82       613
          12       0.66      0.62      0.64       403

    accuracy                           0.71      4368
   macro avg       0.71      0.69      0.69      4368
weighted avg       0.72      0.71      0.70      4368

Test Accuracy: 0.711
