## 모델링

In [1]:
import pandas as pd
import numpy as np

In [4]:
file_path = r'C:\Users\Administrator\Desktop\대학\3학년 1학기\데이터마이닝\프로젝트\new_df.csv'
df = pd.read_csv(file_path, encoding = 'CP949')
df['preprocessed_송출내용'] = df['preprocessed_송출내용'].fillna('')
texts = df['preprocessed_송출내용']

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=30)
vectored_df = vectorizer.fit_transform(texts)


dense_df = vectored_df.todense() #vectored_df는 희소행렬이기 때문에 dense 형태로 전환.
feature_names = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(dense_df, columns=feature_names)


# 데이터분할

In [7]:
from sklearn.model_selection import train_test_split

X = df_tfidf
y = df['label']

# train : val : test = 6 : 2 : 2
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
print("X_valid shape:", X_valid.shape, "y_valid shape:", y_valid.shape)
print("X_test shape:", X_test.shape, "y_test shape:", y_test.shape)

X_train shape: (5608, 30) y_train shape: (5608,)
X_valid shape: (1870, 30) y_valid shape: (1870,)
X_test shape: (1870, 30) y_test shape: (1870,)


## 나이브 베이즈


In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# 나이브 베이즈 모델 생성 및 훈련
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# 검증 세트로 예측 및 성능 평가
y_valid_pred = nb_model.predict(X_valid)
print("Validation Set Performance:")
print(metrics.classification_report(y_valid, y_valid_pred))

y_test_pred = nb_model.predict(X_test)
print("Validation Set Performance:")
print(metrics.classification_report(y_valid, y_valid_pred))


Validation Set Performance:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       152
           1       0.67      0.41      0.51       162
           2       0.70      0.08      0.14        89
           3       0.00      0.00      0.00        51
           4       0.55      0.69      0.61       607
           5       0.74      0.90      0.81       464
           6       0.65      0.87      0.74       194
           7       0.76      0.17      0.28       151

    accuracy                           0.66      1870
   macro avg       0.62      0.50      0.50      1870
weighted avg       0.66      0.66      0.62      1870

Validation Set Performance:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       152
           1       0.67      0.41      0.51       162
           2       0.70      0.08      0.14        89
           3       0.00      0.00      0.00        51
           4       0.5

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 하이퍼파라미터 튜닝 

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


# Multinomial Naive Bayes 모델 생성 및 하이퍼파라미터 튜닝
param_grid = {
    'alpha': [0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0, 5.0, 10.0],
    'fit_prior': [True, False]
    }

nb_model = MultinomialNB()

grid_search = GridSearchCV(estimator=nb_model, param_grid=param_grid, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)


# 검증 데이터에 대한 성능 평가
best_nb_model = grid_search.best_estimator_
y_valid_pred = best_nb_model.predict(X_valid)
print("Validation Set Performance with nb_model:")
print(classification_report(y_valid, y_valid_pred))

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Validation Set Performance with nb_model:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       152
           1       0.68      0.57      0.62       162
           2       0.70      0.08      0.14        89
           3       0.00      0.00      0.00        51
           4       0.58      0.68      0.62       607
           5       0.74      0.90      0.81       464
           6       0.65      0.87      0.74       194
           7       0.76      0.17      0.28       151

    accuracy                           0.67      1870
   macro avg       0.63      0.52      0.51      1870
weighted avg       0.66      0.67      0.64      1870



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# 최적 하이퍼파라미터 출력
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'alpha': 0.01, 'fit_prior': True}


In [22]:
# 최적 하이퍼파라미터로 train과 valid 데이터를 합쳐서 최종 모델 학습

best_nb_model = MultinomialNB(alpha=grid_search.best_params_['alpha'], fit_prior=grid_search.best_params_['fit_prior'])
best_nb_model.fit(X_train, y_train)

# 모델 평가 (train 데이터)
y_train_hat = best_nb_model.predict(X_train)
final_train_accuracy = accuracy_score(y_train, y_train_hat)
print("train Set Performance:")
print(classification_report(y_train, y_train_hat))
print("train set accuracy: %.3f" % final_train_accuracy)

# 모델 평가 (valid 데이터)
y_valid_final_pred = best_nb_model.predict(X_valid)
final_valid_accuracy = accuracy_score(y_valid, y_valid_final_pred)
print("valid Set Performance:")
print(classification_report(y_valid, y_valid_final_pred))
print("valid set accuracy: %.3f" % final_valid_accuracy)


train Set Performance:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       458
           1       0.66      0.61      0.64       465
           2       0.82      0.15      0.25       219
           3       0.91      0.06      0.12       154
           4       0.59      0.67      0.63      1819
           5       0.74      0.90      0.81      1386
           6       0.73      0.84      0.78       653
           7       0.89      0.22      0.36       454

    accuracy                           0.69      5608
   macro avg       0.78      0.55      0.56      5608
weighted avg       0.71      0.69      0.66      5608

train set accuracy: 0.689
valid Set Performance:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       152
           1       0.68      0.57      0.62       162
           2       0.70      0.08      0.14        89
           3       0.00      0.00      0.00        51
      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
