In [15]:
# Load the preprocessed dataset and show its column names as a quick schema check.
df = pd.read_csv('new_df.csv')
df.columns

Index(['preprocessed_text', 'label'], dtype='object')

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Load the preprocessed dataset (columns: 'preprocessed_text', 'label').
df = pd.read_csv('new_df.csv')

# Separate the text feature from the target label.
# read_csv parses empty fields as NaN by default, and TfidfVectorizer raises on NaN
# documents, so any text that is empty in the CSV is mapped back to an empty string here.
X = df['preprocessed_text'].fillna('')
y = df['label']

# Split into train / validation / test sets (60% / 20% / 20%).
# NOTE(review): the splits are not stratified; consider stratify= if class balance matters.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Vectorize the text with TF-IDF, keeping only the 30 most frequent terms.
# The vocabulary is learned from the training split only and reused for the other splits.
vectorizer = TfidfVectorizer(max_features=30)
X_train_vec = vectorizer.fit_transform(X_train)
X_valid_vec = vectorizer.transform(X_valid)
X_test_vec = vectorizer.transform(X_test)

# Scale the features. Sparse matrices cannot be mean-centered, hence with_mean=False.
# The scaler is fitted on the training split only.
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_vec)
X_valid_scaled = scaler.transform(X_valid_vec)
X_test_scaled = scaler.transform(X_test_vec)

In [17]:
# Compare SVM kernels (C and gamma left at their defaults) on the train and validation sets.
# The C and gamma grids are only declared in this cell; the loop below does not use them.
kernels = ['linear', 'rbf', 'poly', 'sigmoid']
C_settings = [0.1, 1, 10, 50]
gamma_settings = [0.1, 0.01, 0.001, 0.0001]

kernel_results = []
for kernel in kernels:
    # Only the kernel varies between runs.
    svc = SVC(kernel=kernel, random_state=20)
    svc.fit(X_train_scaled, y_train)

    y_train_hat = svc.predict(X_train_scaled)
    y_valid_hat = svc.predict(X_valid_scaled)
    train_accuracy = accuracy_score(y_train, y_train_hat)
    valid_accuracy = accuracy_score(y_valid, y_valid_hat)

    kernel_results.append(dict(kernel=kernel,
                               train_accuracy=train_accuracy,
                               valid_accuracy=valid_accuracy))

# Show the results: one row per kernel.
kernel_tuning_results = pd.DataFrame(kernel_results)
display(kernel_tuning_results)


Unnamed: 0,kernel,train_accuracy,valid_accuracy
0,linear,0.634005,0.629808
1,rbf,0.70841,0.687729
2,poly,0.735958,0.692995
3,sigmoid,0.418956,0.420559


In [18]:
# Hyperparameter search for the SVM with kernel='poly': try every (C, gamma) pair
# and record accuracy on the train and validation sets.
param_grid = [(c_value, gamma_value)
              for c_value in C_settings
              for gamma_value in gamma_settings]

results = []
for C, gamma in param_grid:
    svc = SVC(C=C, gamma=gamma, kernel='poly', random_state=20)
    svc.fit(X_train_scaled, y_train)

    y_train_hat = svc.predict(X_train_scaled)
    y_valid_hat = svc.predict(X_valid_scaled)
    train_accuracy = accuracy_score(y_train, y_train_hat)
    valid_accuracy = accuracy_score(y_valid, y_valid_hat)

    results.append(dict(C=C,
                        gamma=gamma,
                        train_accuracy=train_accuracy,
                        valid_accuracy=valid_accuracy))

# Show the results: one row per (C, gamma) combination.
tuning_results = pd.DataFrame(results)
display(tuning_results)

Unnamed: 0,C,gamma,train_accuracy,valid_accuracy
0,0.1,0.1,0.760989,0.699863
1,0.1,0.01,0.456807,0.465659
2,0.1,0.001,0.143468,0.141026
3,0.1,0.0001,0.143468,0.141026
4,1.0,0.1,0.801053,0.701465
5,1.0,0.01,0.624847,0.625687
6,1.0,0.001,0.143468,0.141026
7,1.0,0.0001,0.143468,0.141026
8,10.0,0.1,0.817689,0.692537
9,10.0,0.01,0.699863,0.682234


In [19]:
# Train the final SVM with the (C, gamma) pair that scored highest on the validation set.
ranked_results = tuning_results.sort_values(by='valid_accuracy', ascending=False)
best_result = ranked_results.iloc[0]
C_best = best_result.loc['C']
gamma_best = best_result.loc['gamma']

final_params = {'C': C_best, 'gamma': gamma_best, 'kernel': 'poly', 'random_state': 42}
svm_model = SVC(**final_params)
svm_model.fit(X_train_scaled, y_train)

In [20]:
# Evaluate the final model (best C and gamma, kernel='poly') on all three splits.
y_train_pred, y_valid_pred, y_test_pred = (
    svm_model.predict(features)
    for features in (X_train_scaled, X_valid_scaled, X_test_scaled)
)

train_accuracy = accuracy_score(y_train, y_train_pred)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Accuracy summary, one line per split.
for line in (f'훈련 정확도: {train_accuracy:.3f}',
             f'검증 정확도: {valid_accuracy:.3f}',
             f'테스트 정확도: {test_accuracy:.3f}'):
    print(line)

# Per-class classification report for the validation set.
print("\n검증 세트 분류 리포트:\n")
print(classification_report(y_valid, y_valid_pred))

# Per-class classification report for the test set.
print("\n테스트 세트 분류 리포트:\n")
print(classification_report(y_test, y_test_pred))

훈련 정확도: 0.801
검증 정확도: 0.701
테스트 정확도: 0.701

검증 세트 분류 리포트:

              precision    recall  f1-score   support

           2       0.48      0.53      0.50       498
           4       0.86      0.83      0.85       400
           8       0.48      0.56      0.52       247
           9       0.88      0.90      0.89       235
          10       0.50      0.61      0.55       168
          13       0.80      0.87      0.84       150
          18       0.68      0.49      0.57       162
          30       0.95      0.98      0.96       433
          31       0.63      0.69      0.66       318
          32       0.66      0.53      0.59       313
          33       0.93      0.39      0.55       449
          34       0.71      0.99      0.83       616
          35       0.64      0.55      0.59       379

    accuracy                           0.70      4368
   macro avg       0.71      0.69      0.68      4368
weighted avg       0.72      0.70      0.69      4368


테스트 세트 분류 리포트:

   