In [1]:
import pandas as pd

In [4]:
df = pd.read_csv("new_df.csv", encoding='utf-8')

### 주어진 df ->  tf-idf 벡터화

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 텍스트 데이터와 라벨 분리
texts = df['preprocessed_text']  # 'new_labeled_data' 대신 'df' 사용
labels = df['label']  # 동일하게 'df'에서 라벨 가져오기

# TF-IDF 벡터화
vectorizer = TfidfVectorizer(max_features=30)  # 상위 30개 단어만 사용
vectored_df = vectorizer.fit_transform(texts)

# 희소 행렬 -> 밀집 행렬 변환
dense_df = vectored_df.todense()

# 특징 이름 가져오기
feature_names = vectorizer.get_feature_names_out()

# DataFrame으로 변환
df_tfidf = pd.DataFrame(dense_df, columns=feature_names)

print("TF-IDF 벡터화 결과:")
print(df_tfidf)


TF-IDF 벡터화 결과:
        건강        검정   경보       경찰청   관리   그늘   금지        되다  물놀이       바라다  \
0      0.0  0.000000  0.0  0.707698  0.0  0.0  0.0  0.000000  0.0  0.000000   
1      0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.313276   
2      0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.855446  0.0  0.517892   
3      0.0  0.627727  0.0  0.550896  0.0  0.0  0.0  0.000000  0.0  0.000000   
4      0.0  0.849887  0.0  0.372932  0.0  0.0  0.0  0.000000  0.0  0.000000   
...    ...       ...  ...       ...  ...  ...  ...       ...  ...       ...   
21835  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.171697   
21836  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.654018   
21837  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.733854  0.0  0.444279   
21838  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.491296   
21839  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.654018   

       ...   준수        지역   착용      

### 데이터분할

In [12]:
from sklearn.model_selection import train_test_split

# Train/Validation/Test 데이터 분리
X = df_tfidf  # TF-IDF 벡터화된 데이터
y = df['label']  # 라벨 데이터

# 먼저 Train/Test+Validation으로 나누기
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Validation/Test로 나누기
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Train/Validation/Test 데이터 분리 완료")
print(f"훈련 데이터 크기: {X_train.shape}, {y_train.shape}")
print(f"검증 데이터 크기: {X_valid.shape}, {y_valid.shape}")
print(f"테스트 데이터 크기: {X_test.shape}, {y_test.shape}")


Train/Validation/Test 데이터 분리 완료
훈련 데이터 크기: (13104, 30), (13104,)
검증 데이터 크기: (4368, 30), (4368,)
테스트 데이터 크기: (4368, 30), (4368,)


#랜덤포레스트 모델 학습

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Random Forest 기본 모델 학습
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# 검증 데이터 예측
y_valid_hat = rf.predict(X_valid)

# 검증 데이터 평가
valid_accuracy = accuracy_score(y_valid, y_valid_hat)
print("Validation Score: {:.3f}".format(valid_accuracy))
print("Classification Report:\n", classification_report(y_valid, y_valid_hat))


Validation Score: 0.723
Classification Report:
               precision    recall  f1-score   support

           2       0.54      0.53      0.53       498
           4       0.86      0.91      0.89       400
           8       0.51      0.55      0.53       247
           9       0.91      0.89      0.90       235
          10       0.56      0.60      0.57       168
          13       0.91      0.87      0.89       150
          18       0.68      0.51      0.58       162
          30       0.96      0.98      0.97       433
          31       0.66      0.74      0.70       318
          32       0.64      0.57      0.60       313
          33       0.87      0.42      0.57       449
          34       0.71      0.97      0.82       616
          35       0.64      0.66      0.65       379

    accuracy                           0.72      4368
   macro avg       0.73      0.71      0.71      4368
weighted avg       0.73      0.72      0.71      4368



#하이퍼파라미터 설정

In [14]:
from sklearn.model_selection import GridSearchCV

# 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [100, 200, 300],          # 트리 개수
    'max_features': ['auto', 'sqrt', 'log2'], # 피처 선택 방식
    'max_depth': [None, 10, 20, 30],          # 트리 최대 깊이
    'min_samples_split': [2, 5, 10],          # 노드 분할 최소 샘플 수
    'min_samples_leaf': [1, 2, 4]             # 리프 노드 최소 샘플 수
}

# GridSearchCV 설정
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)

# 학습 수행
grid_search.fit(X_train, y_train)

# 최적의 파라미터 및 최고 점수 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Validation Score: {:.3f}".format(grid_search.best_score_))


Fitting 5 folds for each of 324 candidates, totalling 1620 fits


540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
s

Best Parameters: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best Validation Score: 0.711


#최적 파라미터로 재학습

In [15]:
# 최적의 파라미터로 Random Forest 재학습
best_params = grid_search.best_params_
rf_optimized = RandomForestClassifier(**best_params, random_state=42)
rf_optimized.fit(X_train, y_train)

# 학습 데이터와 검증 데이터 예측
y_train_hat = rf_optimized.predict(X_train)
y_valid_hat = rf_optimized.predict(X_valid)

# 학습 및 검증 정확도 계산
train_accuracy = accuracy_score(y_train, y_train_hat)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

# 결과 출력
print("Train Accuracy: {:.3f}".format(train_accuracy))
print("Validation Accuracy: {:.3f}".format(valid_accuracy))
print("Classification Report (Validation):\n", classification_report(y_valid, y_valid_hat))


Train Accuracy: 0.831
Validation Accuracy: 0.722
Classification Report (Validation):
               precision    recall  f1-score   support

           2       0.53      0.54      0.54       498
           4       0.85      0.92      0.88       400
           8       0.51      0.55      0.53       247
           9       0.90      0.89      0.89       235
          10       0.57      0.62      0.59       168
          13       0.93      0.85      0.89       150
          18       0.69      0.49      0.58       162
          30       0.97      0.98      0.97       433
          31       0.66      0.71      0.68       318
          32       0.67      0.57      0.62       313
          33       0.88      0.42      0.57       449
          34       0.71      0.97      0.82       616
          35       0.64      0.65      0.64       379

    accuracy                           0.72      4368
   macro avg       0.73      0.70      0.71      4368
weighted avg       0.73      0.72      0.71     