# I. data 준비

### 1) 최종적으로 cluster된 파일 불러오기

In [1]:
import pandas as pd

df = pd.read_csv(r"C:\Users\Administrator\Desktop\대학\3학년 1학기\데이터마이닝\프로젝트\new_df.csv", encoding = 'cp949') 
#재난분류문자_클러스스터링_최종.ipynb 파일으로부터 나온 파일.

### 2) 데이터 분할

In [2]:
from sklearn.model_selection import train_test_split

# 'label' 열을 삭제하고 결과를 df 변수에 저장
X = df.drop('label', axis=1)
y = df['label']

# train : val : test = 6 : 2 : 2
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42) 


print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
print("X_valid shape:", X_valid.shape, "y_valid shape:", y_valid.shape)
print("X_test shape:", X_test.shape, "y_test shape:", y_test.shape)

X_train shape: (5608, 1) y_train shape: (5608,)
X_valid shape: (1870, 1) y_valid shape: (1870,)
X_test shape: (1870, 1) y_test shape: (1870,)


### 3) 분할된 각 set ->  tf-idf 벡터화

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer


**3-1) train set**

In [4]:
# train 데이터셋 벡터화
train_texts = X_train['preprocessed_송출내용'].fillna('')
vectorizer = TfidfVectorizer(max_features=30)
vectored_train = vectorizer.fit_transform(train_texts).todense()
feature_names = vectorizer.get_feature_names_out()
X_train_vectored = pd.DataFrame(vectored_train, columns=feature_names)

**3-2) valid set**

In [5]:
# valid 데이터셋 벡터화
valid_texts = X_valid['preprocessed_송출내용'].fillna('')
vectored_valid = vectorizer.transform(valid_texts).todense()
X_valid_vectored = pd.DataFrame(vectored_valid, columns=feature_names)

**3-3) test set**

In [6]:
# test 데이터셋 벡터화
test_texts = X_test['preprocessed_송출내용'].fillna('')
vectored_test = vectorizer.transform(test_texts).todense()
X_test_vectored = pd.DataFrame(vectored_test, columns=feature_names)


### 4) set별 data scaling (model에 따라 scaled 된 data가 필요한 경우가 있음)

In [7]:
# 스케일링
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_vectored)
X_valid_scaled = scaler.transform(X_valid_vectored)
X_test_scaled = scaler.transform(X_test_vectored)


# ----------------------------------------------------------------------------

# II. 모델 적용

## 1. SVM

In [8]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

svc = SVC()
svc.fit(X_train_scaled, y_train)

y_train_hat = svc.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = svc.predict(X_valid_scaled)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.870
valid score: 0.817


###  1) svc 하이퍼파라미터 튜닝 

**for문으로 탐색.** <br/>
**1-1) C, gamma 조절**

In [9]:
C_settings = [0.1, 1, 10, 50]
gamma_settings = [0.1, 0.01, 0.001, 0.0001]
results = []

for C in C_settings:
    for gamma in gamma_settings:
        svc = SVC(C=C, gamma=gamma, random_state=20).fit(X_train_scaled, y_train) #C, gamma 조정

        y_train_hat = svc.predict(X_train_scaled)
        y_valid_hat =svc.predict(X_valid_scaled)
        
        train_accuracy = accuracy_score(y_train, y_train_hat)
        valid_accuracy = accuracy_score(y_valid, y_valid_hat)
        

        results.append({'C': C,
                        'gamma': gamma,
                        'train_accuracy': train_accuracy,
                        'valid_accuracy': valid_accuracy})

display(pd.DataFrame(results))

Unnamed: 0,C,gamma,train_accuracy,valid_accuracy
0,0.1,0.1,0.809201,0.783957
1,0.1,0.01,0.756419,0.731551
2,0.1,0.001,0.600036,0.574866
3,0.1,0.0001,0.324358,0.324599
4,1.0,0.1,0.901213,0.832086
5,1.0,0.01,0.829886,0.792513
6,1.0,0.001,0.748395,0.725668
7,1.0,0.0001,0.604315,0.57754
8,10.0,0.1,0.925999,0.839037
9,10.0,0.01,0.88035,0.817112


**최적 hyperparameter:  kernel = poly, C = 50.0, gamma=0.1**

### 2) 최적의 hyperparameter를 적용한 SVC model 및 평가

In [10]:
#모델 적용
best_svc = SVC(C=50, gamma=0.1).fit(X_train_scaled, y_train)

#모델 평가
y_train_hat = best_svc.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_svc.predict(X_valid_scaled)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.927
valid score: 0.839


In [11]:
print(classification_report(y_valid, y_valid_hat))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91       152
           1       0.73      0.75      0.74       162
           2       0.64      0.61      0.62        89
           3       0.70      0.59      0.64        51
           4       0.81      0.83      0.82       607
           5       0.97      0.95      0.96       464
           6       0.77      0.86      0.81       194
           7       0.84      0.75      0.79       151

    accuracy                           0.84      1870
   macro avg       0.80      0.78      0.79      1870
weighted avg       0.84      0.84      0.84      1870



## 2. RandomForest

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

rf = RandomForestClassifier().fit(X_train_scaled, y_train)

y_train_hat = rf.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = rf.predict(X_valid_scaled)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.927
valid score: 0.833


### 1) random forest 하이퍼파라미터 튜닝

**gird search로 조정**

In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=0, n_jobs=-1)

grid_search.fit(X_train_scaled, y_train)

# 최적의 파라미터와 최고의 점수 출력
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

  warn(


Best parameters: {'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best score: 0.8414766110734385


### 2) 최적의 hyperparameter를 적용한 RandomForest model 및 평가

In [15]:
#모델 적용
best_rf = RandomForestClassifier(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_features=grid_search.best_params_['max_features'],
    max_depth=grid_search.best_params_['max_depth'],
    min_samples_split=grid_search.best_params_['min_samples_split'],
    min_samples_leaf=grid_search.best_params_['min_samples_leaf'])
best_rf.fit(X_train_scaled, y_train)

#모델 평가
y_train_hat = best_rf.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_rf.predict(X_valid_scaled)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

  warn(


train score: 0.923
valid score: 0.834


In [16]:
#상세 보고서
print(classification_report(y_valid, y_valid_hat))

              precision    recall  f1-score   support

           0       0.93      0.90      0.92       152
           1       0.73      0.75      0.74       162
           2       0.79      0.56      0.66        89
           3       0.67      0.59      0.62        51
           4       0.77      0.85      0.81       607
           5       0.97      0.92      0.94       464
           6       0.79      0.87      0.82       194
           7       0.89      0.70      0.79       151

    accuracy                           0.83      1870
   macro avg       0.82      0.77      0.79      1870
weighted avg       0.84      0.83      0.83      1870



## 3. XGBoost

In [17]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [18]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

xg = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xg.fit(X_train_scaled, y_train)

y_train_hat = xg.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = xg.predict(X_valid_scaled)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.926
valid score: 0.828


### 1) XGBoost 하이퍼파라미터 튜닝

**gird search로 조정**

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xg= xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

grid_search = GridSearchCV(estimator=xg, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# 최적의 파라미터와 최고의 점수 출력
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.6}
Best score: 0.8386236074273086


### 2) 최적의 hyperparameter를 적용한 XGBoost model 및 평가

In [20]:
#모델 적용                                                                                                                                                                                                                        4                                                   
best_xg = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss',
    max_depth=grid_search.best_params_['max_depth'],
    learning_rate=grid_search.best_params_['learning_rate'],
    n_estimators=grid_search.best_params_['n_estimators'],
    subsample=grid_search.best_params_['subsample'],
    colsample_bytree=grid_search.best_params_['colsample_bytree'])
best_xg.fit(X_train_scaled, y_train)

#모델 평가
y_train_hat = best_xg.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_xg.predict(X_valid_scaled)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.923
valid score: 0.828


In [21]:
    #상세 보고서
print(classification_report(y_valid, y_valid_hat))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       152
           1       0.71      0.74      0.73       162
           2       0.74      0.56      0.64        89
           3       0.60      0.55      0.57        51
           4       0.79      0.83      0.80       607
           5       0.96      0.93      0.95       464
           6       0.77      0.87      0.82       194
           7       0.85      0.73      0.78       151

    accuracy                           0.83      1870
   macro avg       0.79      0.76      0.77      1870
weighted avg       0.83      0.83      0.83      1870



## 4. CatBoost

In [22]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [23]:
from catboost import CatBoostClassifier
from sklearn import metrics  
from sklearn.metrics import accuracy_score, classification_report

cat = CatBoostClassifier(loss_function='MultiClass', verbose=0)
cat.fit(X_train_scaled, y_train, eval_set=(X_valid_scaled, y_valid))

y_train_hat = cat.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = cat.predict(X_valid_scaled)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.901
valid score: 0.824


### 1) CatBoost 하이퍼파라미터 튜닝

**gird search로 조정**

In [24]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'iterations': [500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 10],
    'l2_leaf_reg': [1, 3, 5]
}
cat = CatBoostClassifier(early_stopping_rounds=50, verbose=False)
grid_search = GridSearchCV(cat, param_grid, cv=3, scoring='accuracy')

grid_search.fit(X_train_scaled, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score: {:.2f}".format(grid_search.best_score_))

Best parameters: {'depth': 10, 'iterations': 500, 'l2_leaf_reg': 1, 'learning_rate': 0.05}
Best score: 0.84


### 2) 최적의 hyperparameter를 적용한 CatBoost model 및 평가

In [25]:
#모델 적용
best_cat = CatBoostClassifier(verbose=0,
    iterations=grid_search.best_params_['iterations'],
    learning_rate=grid_search.best_params_['learning_rate'],
    depth=grid_search.best_params_['depth'],
    l2_leaf_reg=grid_search.best_params_['l2_leaf_reg'])
best_cat.fit(X_train_scaled, y_train)

#모델 평가
y_train_hat = best_cat.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_cat.predict(X_valid_scaled)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.919
valid score: 0.833


In [26]:
#상세 보고서
print(metrics.classification_report(y_valid, y_valid_hat))

              precision    recall  f1-score   support

           0       0.94      0.89      0.92       152
           1       0.71      0.76      0.73       162
           2       0.76      0.57      0.65        89
           3       0.63      0.61      0.62        51
           4       0.79      0.83      0.81       607
           5       0.95      0.93      0.94       464
           6       0.78      0.87      0.82       194
           7       0.88      0.75      0.81       151

    accuracy                           0.83      1870
   macro avg       0.80      0.78      0.79      1870
weighted avg       0.83      0.83      0.83      1870



# III. 최종 모델 선정

각모델의 valid score<br/><br/>

svc = 0.830<br/>
나이브베이즈 = 0.672<br/>
랜덤포레스트 = 0.830<br/>
XGBoost = 0.824<br/>
catBoost = 0.828 

**나이브 베이즈를 제외한 4가지 모델의 성능이0.01정도의 차이를 보이므로 cross validuation으로 재평가 (fold=5)**

### 최종 학습 데이터셋, 테스트 데이터셋


In [27]:
import numpy as np

# 최종 학습데이터
# X_train_scaled와 X_valid_scaled 합치기
X_train_final = np.vstack((X_train_scaled, X_valid_scaled))
y_train_final = np.hstack((y_train, y_valid))


In [28]:
from sklearn.model_selection import cross_val_score

#모델별 5개의 fold로 교차검증
svc_cv_score = cross_val_score(best_svc, X_train_final, y_train_final, cv=5) 
print("svc model cross validation score: %.3f" %svc_cv_score.mean())

rf_cv_score = cross_val_score(best_rf, X_train_final, y_train_final, cv=5) 
print("rf model cross validation score: %.3f" %rf_cv_score.mean())

xg_cv_score = cross_val_score(best_xg, X_train_final, y_train_final, cv=5) 
print("XGboost model cross validation score: %.3f" %xg_cv_score.mean())

cat_cv_score = cross_val_score(best_cat, X_train_final, y_train_final, cv=5) 
print("CatBoost model cross validation score: %.3f" %cat_cv_score.mean())

svc model cross validation score: 0.848


  warn(
  warn(
  warn(
  warn(
  warn(


rf model cross validation score: 0.843
XGboost model cross validation score: 0.840
CatBoost model cross validation score: 0.846


**['depth': 10, 'iterations': 500, 'l2_leaf_reg': 3, 'learning_rate': 0.05]일때의 catBoost 모델이 성능이 가장 높은 것을 알 수 있다.**

### best model 생성 및 최종 평가

In [30]:
#bestmodel 생성
best_model = CatBoostClassifier(depth=10, iterations=500, l2_leaf_reg=3, learning_rate=0.05)
best_model.fit(X_train_final, y_train_final)

0:	learn: 1.9234594	total: 1.23s	remaining: 10m 11s
1:	learn: 1.7953078	total: 2.4s	remaining: 9m 58s
2:	learn: 1.6883976	total: 3.65s	remaining: 10m 4s
3:	learn: 1.5959772	total: 5.06s	remaining: 10m 27s
4:	learn: 1.5188433	total: 6.81s	remaining: 11m 14s
5:	learn: 1.4509323	total: 8.31s	remaining: 11m 24s
6:	learn: 1.3911064	total: 9.47s	remaining: 11m 6s
7:	learn: 1.3371277	total: 10.4s	remaining: 10m 42s
8:	learn: 1.2884664	total: 11.4s	remaining: 10m 23s
9:	learn: 1.2425742	total: 12.7s	remaining: 10m 21s
10:	learn: 1.2003470	total: 13.8s	remaining: 10m 15s
11:	learn: 1.1625503	total: 15s	remaining: 10m 11s
12:	learn: 1.1276101	total: 16.3s	remaining: 10m 10s
13:	learn: 1.0966410	total: 17.4s	remaining: 10m 3s
14:	learn: 1.0659643	total: 18.4s	remaining: 9m 54s
15:	learn: 1.0380744	total: 19.4s	remaining: 9m 46s
16:	learn: 1.0109761	total: 20.4s	remaining: 9m 38s
17:	learn: 0.9864258	total: 21.4s	remaining: 9m 31s
18:	learn: 0.9630304	total: 22.4s	remaining: 9m 27s
19:	learn: 0.94

<catboost.core.CatBoostClassifier at 0x2c894cb0fd0>

### 최종 test score

In [31]:
y_train_hat = best_model.predict(X_train_final)
train_accuracy = accuracy_score(y_train_final, y_train_hat)

y_test_hat = best_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_hat)

print("train score: %.3f" %train_accuracy)
print("test score: %.3f" %test_accuracy)

train score: 0.895
test score: 0.844


In [32]:
# 최종 모델의 precsion, recall, f1-score
print(classification_report(y_train_final, y_train_hat))
print(classification_report(y_test, y_test_hat))


              precision    recall  f1-score   support

           0       0.96      0.95      0.95       610
           1       0.75      0.86      0.80       627
           2       0.87      0.68      0.76       308
           3       0.70      0.76      0.72       205
           4       0.88      0.88      0.88      2426
           5       0.97      0.97      0.97      1850
           6       0.90      0.90      0.90       847
           7       0.92      0.85      0.88       605

    accuracy                           0.90      7478
   macro avg       0.87      0.86      0.86      7478
weighted avg       0.90      0.90      0.90      7478

              precision    recall  f1-score   support

           0       0.90      0.95      0.93       143
           1       0.69      0.77      0.73       157
           2       0.71      0.61      0.66        77
           3       0.65      0.75      0.70        57
           4       0.80      0.81      0.81       596
           5       0.95 

**그러나 catBoost 모델의 경우 계산 비용이 너무커서 random forest 모델을 쓰는 것이 좋을 수도 있겠다.**