# I. data 준비

### 1) 최종적으로 cluster된 파일 불러오기

In [1]:
import pandas as pd

df = pd.read_csv(r"C:\Users\qls05\OneDrive\바탕 화면\df.csv", encoding = 'cp949') 
#000.ipynb로부터 나온 파일임 github 링크까지 첨부하면 좋을듯

  from pandas.core.computation.check import NUMEXPR_INSTALLED


### 2) 불러온 df ->  tf-idf 벡터화

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

texts = df['preprocessed_송출내용'].fillna('')
vectorizer = TfidfVectorizer(max_features=30)
vectored_df = vectorizer.fit_transform(texts)

In [3]:
dense_df = vectored_df.todense() #vectored_df는 희소행렬이기 때문에 dense 형태로 전환.

feature_names = vectorizer.get_feature_names_out()

df_tfidf = pd.DataFrame(dense_df, columns=feature_names)

### 3) 데이터 분할

In [4]:
from sklearn.model_selection import train_test_split

X = df_tfidf
y = df['label']

# train : val : test = 6 : 2 : 2
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42) 


print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
print("X_valid shape:", X_valid.shape, "y_valid shape:", y_valid.shape)
print("X_test shape:", X_test.shape, "y_test shape:", y_test.shape)

X_train shape: (5608, 30) y_train shape: (5608,)
X_valid shape: (1870, 30) y_valid shape: (1870,)
X_test shape: (1870, 30) y_test shape: (1870,)


### 4) data scaling (model에 따라 scaled 된 data가 필요한 경우가 있음)

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# ----------------------------------------------------------------------------

# II. 모델 적용

## 1. SVM

In [10]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

svc = SVC()
svc.fit(X_train, y_train)

y_train_hat = svc.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = svc.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.863
valid score: 0.814


###  1) svc 하이퍼파라미터 튜닝 

**for문으로 탐색.** <br/>
**1-1) C, gamma 조절**

In [11]:
C_settings = [0.1, 1, 10, 50]
gamma_settings = [0.1, 0.01, 0.001, 0.0001]
results = []

for C in C_settings:
    for gamma in gamma_settings:
        svc = SVC(C=C, gamma=gamma, random_state=20).fit(X_train_scaled, y_train) #C, gamma 조정

        y_train_hat = svc.predict(X_train_scaled)
        y_valid_hat =svc.predict(X_valid_scaled)
        
        train_accuracy = accuracy_score(y_train, y_train_hat)
        valid_accuracy = accuracy_score(y_valid, y_valid_hat)
        

        results.append({'C': C,
                        'gamma': gamma,
                        'train_accuracy': train_accuracy,
                        'valid_accuracy': valid_accuracy})

display(pd.DataFrame(results))

Unnamed: 0,C,gamma,train_accuracy,valid_accuracy
0,0.1,0.1,0.810093,0.783957
1,0.1,0.01,0.759094,0.734225
2,0.1,0.001,0.596113,0.572727
3,0.1,0.0001,0.324358,0.324599
4,1.0,0.1,0.895506,0.824064
5,1.0,0.01,0.826498,0.788235
6,1.0,0.001,0.748039,0.724064
7,1.0,0.0001,0.603067,0.579144
8,10.0,0.1,0.921362,0.829947
9,10.0,0.01,0.87393,0.815508


**1-2) kernel 조절**

In [12]:
kernels = ['linear', 'rbf', 'poly', 'sigmoid']
results = []

for kernel in kernels:
        svc = SVC(kernel=kernel, random_state=20).fit(X_train_scaled, y_train) #kernel 조정

        y_train_hat = svc.predict(X_train_scaled)
        y_valid_hat =svc.predict(X_valid_scaled)
        
        train_accuracy = accuracy_score(y_train, y_train_hat)
        valid_accuracy = accuracy_score(y_valid, y_valid_hat)
        

        results.append({'kernel': kernel,
                        'train_accuracy': train_accuracy,
                        'valid_accuracy': valid_accuracy})

display(pd.DataFrame(results))

Unnamed: 0,kernel,train_accuracy,valid_accuracy
0,linear,0.808131,0.770588
1,rbf,0.864836,0.813904
2,poly,0.882668,0.817647
3,sigmoid,0.62607,0.616043


**1-3) kernel = poly로 정하고 C랑 gamma 다시 튜닝**

In [13]:
C_settings = [0.1, 1, 10, 50]
gamma_settings = [0.1, 0.01, 0.001, 0.0001]
results = []

for C in C_settings:
    for gamma in gamma_settings:
        svc = SVC(C=C, gamma=gamma, kernel='poly', random_state=20).fit(X_train_scaled, y_train)

        y_train_hat = svc.predict(X_train_scaled)
        y_valid_hat =svc.predict(X_valid_scaled)
        
        train_accuracy = accuracy_score(y_train, y_train_hat)
        valid_accuracy = accuracy_score(y_valid, y_valid_hat)
        

        results.append({'C': C,
                        'gamma': gamma,
                        'train_accuracy': train_accuracy,
                        'valid_accuracy': valid_accuracy})

display(pd.DataFrame(results))

Unnamed: 0,C,gamma,train_accuracy,valid_accuracy
0,0.1,0.1,0.901213,0.818182
1,0.1,0.01,0.324358,0.324599
2,0.1,0.001,0.324358,0.324599
3,0.1,0.0001,0.324358,0.324599
4,1.0,0.1,0.918688,0.826738
5,1.0,0.01,0.729494,0.711765
6,1.0,0.001,0.324358,0.324599
7,1.0,0.0001,0.324358,0.324599
8,10.0,0.1,0.921719,0.830481
9,10.0,0.01,0.846291,0.798396


**최적 hyperparameter:  kernel = poly, C = 50, gamma=0.1**

### 2) 최적의 hyperparameter를 적용한 SVC model 및 평가

In [14]:
#모델 적용
best_svc = SVC(kernel='poly', C=50, gamma=0.1).fit(X_train_scaled, y_train)

#모델 평가
y_train_hat = best_svc.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_svc.predict(X_valid_scaled)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.922
valid score: 0.830


In [15]:
print(classification_report(y_train, y_train_hat))

              precision    recall  f1-score   support

           0       0.96      0.97      0.97       458
           1       0.75      0.92      0.82       465
           2       0.73      0.84      0.78       219
           3       0.77      0.68      0.72       154
           4       0.96      0.89      0.92      1819
           5       0.99      0.99      0.99      1386
           6       0.92      0.92      0.92       653
           7       0.92      0.92      0.92       454

    accuracy                           0.92      5608
   macro avg       0.87      0.89      0.88      5608
weighted avg       0.93      0.92      0.92      5608



## 2. 나이브베이즈

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

nb = MultinomialNB()
nb.fit(X_train, y_train)

y_train_hat = nb.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = nb.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.683
valid score: 0.660


###  1) naive baise 하이퍼파라미터 튜닝 

**이후 gird search으로 탐색**

In [7]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0, 5.0, 10.0],
    'fit_prior': [True, False]
    }

nb = MultinomialNB()

grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 파라미터와 최고의 점수
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'alpha': 0.01, 'fit_prior': True}
Best score: 0.6813487533249596


### 2) 최적의 hyperparameter를 적용한 Naive Baise model 및 평가

In [8]:
#모델 적용
best_nb = MultinomialNB(alpha=grid_search.best_params_['alpha'], fit_prior=grid_search.best_params_['fit_prior'])
best_nb.fit(X_train, y_train)

#모델 평가
y_train_hat = best_nb.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_nb.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.689
valid score: 0.672


In [9]:
#상세보고서
print(classification_report(y_valid, y_valid_hat))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88       152
           1       0.68      0.57      0.62       162
           2       0.70      0.08      0.14        89
           3       0.00      0.00      0.00        51
           4       0.58      0.68      0.62       607
           5       0.74      0.90      0.81       464
           6       0.65      0.87      0.74       194
           7       0.76      0.17      0.28       151

    accuracy                           0.67      1870
   macro avg       0.63      0.52      0.51      1870
weighted avg       0.66      0.67      0.64      1870



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 3. RandomForest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

rf = RandomForestClassifier().fit(X_train, y_train)

y_train_hat = rf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = rf.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.923
valid score: 0.828


### 1) random forest 하이퍼파라미터 튜닝

**gird search로 조정**

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=0, n_jobs=-1)

grid_search.fit(X_train, y_train)

# 최적의 파라미터와 최고의 점수 출력
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
130 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\qls05\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\qls05\anaconda3\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\qls05\anaconda3\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\qls05\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParame

Best parameters: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best score: 0.8389803997867448


### 2) 최적의 hyperparameter를 적용한 RandomForest model 및 평가

In [20]:
#모델 적용
best_rf = RandomForestClassifier(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_features=grid_search.best_params_['max_features'],
    max_depth=grid_search.best_params_['max_depth'],
    min_samples_split=grid_search.best_params_['min_samples_split'],
    min_samples_leaf=grid_search.best_params_['min_samples_leaf'])
best_rf.fit(X_train, y_train)

#모델 평가
y_train_hat = best_rf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_rf.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.919
valid score: 0.830


In [22]:
#상세 보고서
print(classification_report(y_valid, y_valid_hat))

              precision    recall  f1-score   support

           0       0.92      0.89      0.91       152
           1       0.71      0.73      0.72       162
           2       0.73      0.57      0.64        89
           3       0.66      0.49      0.56        51
           4       0.77      0.86      0.81       607
           5       0.97      0.92      0.95       464
           6       0.79      0.85      0.82       194
           7       0.91      0.70      0.79       151

    accuracy                           0.83      1870
   macro avg       0.81      0.75      0.77      1870
weighted avg       0.83      0.83      0.83      1870



## 4. XGBoost

In [25]:
pip install xgboost

DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063


Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 99.8/99.8 MB 293.5 kB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3
Collecting xgboostNote: you may need to restart the kernel to use updated packages.


DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063



  Using cached xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 99.8/99.8 MB 606.3 kB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3


In [28]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

xg = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xg.fit(X_train, y_train)

y_train_hat = xg.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = xg.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.922
valid score: 0.827


### 1) XGBoost 하이퍼파라미터 튜닝

**gird search로 조정**

In [30]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xg= xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

grid_search = GridSearchCV(estimator=xg, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 파라미터와 최고의 점수 출력
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}
Best score: 0.8375539933372055


### 2) 최적의 hyperparameter를 적용한 XGBoost model 및 평가

In [31]:
#모델 적용                                                                                                                                                                                                                        4                                                   
best_xgb = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss',
    max_depth=grid_search.best_params_['max_depth'],
    learning_rate=grid_search.best_params_['learning_rate'],
    n_estimators=grid_search.best_params_['n_estimators'],
    subsample=grid_search.best_params_['subsample'],
    colsample_bytree=grid_search.best_params_['colsample_bytree'])
best_xgb.fit(X_train, y_train)

#모델 평가
y_train_hat = best_xgb.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_xgb.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.910
valid score: 0.824


In [33]:
#상세 보고서
print(classification_report(y_valid, y_valid_hat))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91       152
           1       0.71      0.73      0.72       162
           2       0.70      0.56      0.62        89
           3       0.69      0.47      0.56        51
           4       0.78      0.83      0.81       607
           5       0.95      0.92      0.94       464
           6       0.77      0.86      0.81       194
           7       0.82      0.74      0.78       151

    accuracy                           0.82      1870
   macro avg       0.79      0.75      0.77      1870
weighted avg       0.82      0.82      0.82      1870



## 5. CatBoost

In [34]:
pip install catboost



DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063





In [38]:
from catboost import CatBoostClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report

cat = CatBoostClassifier(loss_function='MultiClass', verbose=0)
cat.fit(X_train, y_train, eval_set=(X_valid, y_valid))

y_train_hat = cat.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = cat.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

train score: 0.897
valid score: 0.822


### 1) CatBoost 하이퍼파라미터 튜닝

**gird search로 조정**

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'iterations': [500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 10],
    'l2_leaf_reg': [1, 3, 5]
}
cat = CatBoostClassifier(early_stopping_rounds=50, verbose=False)
grid_search = GridSearchCV(cat, param_grid, cv=3, scoring='accuracy')

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score: {:.2f}".format(grid_search.best_score_))

### 2) 최적의 hyperparameter를 적용한 CatBoost model 및 평가

In [None]:
#모델 적용
best_cat = CatBoostClassifier(early_stopping_rounds=50,
    iterations=grid_search.best_params_['iterations'],
    learning_rate=grid_search.best_params_['learning_rate'],
    depth=grid_search.best_params_['depth'],
    l2_leaf_reg=grid_search.best_params_['l2_leaf_reg'],
best_cat.fit(X_train, y_train)

#모델 평가
y_train_hat = best_cat.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_valid_hat = best_cat.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("train score: %.3f" %train_accuracy)
print("valid score: %.3f" %valid_accuracy)

In [None]:
#상세 보고서
print(metrics.classification_report(y_valid, y_valid_hat))

# III. 최종 모델 선정

~한 연유로 00 model이 best model로 선정되었다. 이제 최종 test를 쳐보자

In [None]:
#bestmodel 생성
best_model = bestmodel()
best_model.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import train_test_split

#train: test = 6: 4로 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) 

y_train_hat = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)

y_test_hat = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_hat)

print("train score: %.3f" %train_accuracy)
print("test score: %.3f" %test_accuracy)

In [None]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(best_model, X, y, cv=5) #5개의 fold로 교차검증
print("cross validation score: %.3f" %cv_scores)