In [7]:
import pandas as pd

# Load the preprocessed dataset. The CSV is read with the CP949 code page.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# will not run elsewhere until it is pointed at a local copy of new_df.csv.
file_path = r'C:\Users\Administrator\Desktop\대학\3학년 1학기\데이터마이닝\프로젝트\new_df.csv'
df = pd.read_csv(file_path, encoding='CP949')

# Replace missing values in the text column with empty strings so that no NaN
# remains, then keep a handle on that column for vectorization.
text_column = 'preprocessed_송출내용'
df[text_column] = df[text_column].fillna('')
texts = df[text_column]

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features from the text column, capped at 30 terms via max_features.
# NOTE(review): the vectorizer is fitted on the full `texts` column, i.e. before
# the train/valid/test split performed later in the notebook, so the selected
# vocabulary and IDF weights are also influenced by rows that end up in the
# validation and test sets. Consider fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=30)
vectored_df = vectorizer.fit_transform(texts)

# fit_transform returns a sparse matrix. Convert it with toarray(), which gives a
# plain 2-D ndarray, instead of todense(), which gives a legacy np.matrix that
# NumPy no longer recommends. The resulting DataFrame is the same.
dense_df = vectored_df.toarray()
feature_names = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(dense_df, columns=feature_names)


# 데이터 분할

In [9]:
from sklearn.model_selection import train_test_split

# Features are the TF-IDF columns; the target is the `label` column of the loaded frame.
X, y = df_tfidf, df['label']

# Two-stage split producing train : valid : test = 6 : 2 : 2.
# Stage 1: keep 60% for training and set the remaining 40% aside.
# NOTE(review): the split is not stratified although the per-class supports in
# the reports below are uneven; passing stratify=y would keep class ratios equal
# across the three sets, but it would also change every downstream number.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)
# Stage 2: cut the held-out 40% in half to obtain the validation and test sets.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)


# Report the size of each partition as a sanity check.
print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
print("X_valid shape:", X_valid.shape, "y_valid shape:", y_valid.shape)
print("X_test shape:", X_test.shape, "y_test shape:", y_test.shape)

X_train shape: (5608, 30) y_train shape: (5608,)
X_valid shape: (1870, 30) y_valid shape: (1870,)
X_test shape: (1870, 30) y_test shape: (1870,)


## XGBoost

In [10]:
import xgboost as xgb
from sklearn import metrics

# Baseline: an XGBoost classifier with library-default hyperparameters,
# evaluated with multiclass log-loss and fitted on the training split.
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
)
xgb_model.fit(X_train, y_train)

# Predict on both splits first, then print the two reports, so the train and
# validation figures can be compared side by side.
y_train_pred = xgb_model.predict(X_train)
y_valid_pred = xgb_model.predict(X_valid)

print("Train Set Performance:")
print(metrics.classification_report(y_train, y_train_pred))

print("Validation Set Performance:")
print(metrics.classification_report(y_valid, y_valid_pred))


Train Set Performance:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       458
           1       0.75      0.91      0.82       465
           2       0.84      0.77      0.80       219
           3       0.75      0.69      0.72       154
           4       0.94      0.90      0.92      1819
           5       0.99      0.99      0.99      1386
           6       0.92      0.92      0.92       653
           7       0.92      0.92      0.92       454

    accuracy                           0.92      5608
   macro avg       0.88      0.88      0.88      5608
weighted avg       0.92      0.92      0.92      5608

Validation Set Performance:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       152
           1       0.71      0.72      0.71       162
           2       0.70      0.55      0.62        89
           3       0.67      0.47      0.55        51
           4       0.78    

## 하이퍼파라미터 튜닝

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

# Hyperparameter search space for XGBoost: 5 * 4 * 3 * 3 * 3 = 540 combinations.
param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Fresh, unfitted estimator that the grid search clones for every candidate.
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
)

# Exhaustive search with 3-fold cross-validation on the training split only,
# ranked by accuracy and run on all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out validation split.
best_xgb_model = grid_search.best_estimator_
y_valid_pred = best_xgb_model.predict(X_valid)
print("Validation Set Performance with XGBoost:")
print(classification_report(y_valid, y_valid_pred))



Fitting 3 folds for each of 540 candidates, totalling 1620 fits
Validation Set Performance with XGBoost:
              precision    recall  f1-score   support

           0       0.93      0.89      0.91       152
           1       0.71      0.73      0.72       162
           2       0.70      0.56      0.62        89
           3       0.69      0.47      0.56        51
           4       0.78      0.83      0.81       607
           5       0.95      0.92      0.94       464
           6       0.77      0.86      0.81       194
           7       0.82      0.74      0.78       151

    accuracy                           0.82      1870
   macro avg       0.79      0.75      0.77      1870
weighted avg       0.82      0.82      0.82      1870



In [12]:
# Show the hyperparameter combination selected by the grid search.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}


In [13]:
# Rebuild the final model from the tuned hyperparameters and fit it on the
# training split.
# Note: only X_train / y_train are passed to fit(). The validation split is NOT
# merged into the training data here, so it remains a held-out set for the
# evaluation that follows.
# Unpacking best_params_ forwards every tuned parameter, so this cell cannot
# drift out of sync with param_grid when a key is added or removed.
best_xgb_model = xgb.XGBClassifier(
    **grid_search.best_params_,
    use_label_encoder=False,
    eval_metric='mlogloss',
)
best_xgb_model.fit(X_train, y_train)

In [14]:
# Score the final model on both splits up front; reporting follows below.
y_train_hat = best_xgb_model.predict(X_train)
y_valid_final_pred = best_xgb_model.predict(X_valid)

final_train_accuracy = accuracy_score(y_train, y_train_hat)
final_valid_accuracy = accuracy_score(y_valid, y_valid_final_pred)

# Training-split report: per-class metrics followed by overall accuracy.
print("Train Set Performance:")
print(classification_report(y_train, y_train_hat))
print("Train set accuracy: %.3f" % final_train_accuracy)

# Validation-split report, in the same layout.
print("Validation Set Performance:")
print(classification_report(y_valid, y_valid_final_pred))
print("Validation set accuracy: %.3f" % final_valid_accuracy)

Train Set Performance:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       458
           1       0.75      0.88      0.81       465
           2       0.84      0.74      0.79       219
           3       0.73      0.68      0.71       154
           4       0.91      0.89      0.90      1819
           5       0.98      0.98      0.98      1386
           6       0.91      0.91      0.91       653
           7       0.93      0.88      0.90       454

    accuracy                           0.91      5608
   macro avg       0.88      0.87      0.87      5608
weighted avg       0.91      0.91      0.91      5608

Train set accuracy: 0.910
Validation Set Performance:
              precision    recall  f1-score   support

           0       0.93      0.89      0.91       152
           1       0.71      0.73      0.72       162
           2       0.70      0.56      0.62        89
           3       0.69      0.47      0.56        51
 

In [15]:
# Print training and validation accuracy together to show the generalization gap.
print(
    f"Train Accuracy: {final_train_accuracy:.3f}",
    f"Validation Accuracy: {final_valid_accuracy:.3f}",
    sep="\n",
)

Train Accuracy: 0.910
Validation Accuracy: 0.824
