In [1]:
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Load the raw CSV (cp949-encoded) and pull out the preprocessed text column.
# NOTE(review): the path is an absolute, machine-specific location; the notebook
# will only run where this exact file exists.
CSV_PATH = r"C:\Users\qls05\OneDrive\바탕 화면\df.csv"
TEXT_COLUMN = 'preprocessed_송출내용'

df = pd.read_csv(CSV_PATH, encoding='cp949')

# Replace missing texts with empty strings so every row holds a string.
df[TEXT_COLUMN] = df[TEXT_COLUMN].fillna('')
texts = df[TEXT_COLUMN]

### 주어진 df ->  tf-idf 벡터화

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the TF-IDF vocabulary at 30 terms, then learn it and encode every text.
# NOTE(review): the vectorizer is fitted on all rows of `texts`, i.e. before
# the train/valid/test split performed later in the notebook, so held-out rows
# contribute to the learned vocabulary/weights — confirm this is acceptable.
MAX_FEATURES = 30
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
vectored_df = vectorizer.fit_transform(texts)

In [4]:
# vectored_df is a sparse matrix, so convert it to dense form.
# toarray() is used rather than todense(): todense() hands back the legacy
# numpy.matrix type, whereas toarray() yields a plain 2-D ndarray holding the
# same values.
dense_df = vectored_df.toarray()

# Vocabulary terms learned by the vectorizer, used as the column labels.
feature_names = vectorizer.get_feature_names_out()

# One row per document, one column per TF-IDF term.
df_tfidf = pd.DataFrame(dense_df, columns=feature_names)

### 데이터 분할

In [5]:
from sklearn.model_selection import train_test_split

# Features are the TF-IDF columns; the target is the 'label' column of df.
X, y = df_tfidf, df['label']

# train : val : test = 6 : 2 : 2
# First hold out 40% of the rows, then cut that hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.4,
    random_state=42,
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
)

# Sanity-check the size of each split.
print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
print("X_valid shape:", X_valid.shape, "y_valid shape:", y_valid.shape)
print("X_test shape:", X_test.shape, "y_test shape:", y_test.shape)

X_train shape: (5608, 30) y_train shape: (5608,)
X_valid shape: (1870, 30) y_valid shape: (1870,)
X_test shape: (1870, 30) y_test shape: (1870,)


### data scaling

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to all three splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_valid_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_valid, X_test)
)

# ----------------------------------------------------------------------------

## SVM 모델 적용

In [18]:
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report

# Baseline: linear-kernel SVM scored on the validation split.
# NOTE(review): this baseline is trained and scored on the unscaled
# X_train / X_valid, while the tuning cells below use X_train_scaled /
# X_valid_scaled — confirm the difference is intentional.
svc = SVC(kernel='linear').fit(X_train, y_train)

y_valid_hat = svc.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_hat)

print("valid score: %.3f" % valid_accuracy)

valid score: 0.767


In [15]:
# Detailed report: per-class precision / recall / f1 on the validation split.
# NOTE(review): y_valid_hat is reassigned by the tuning cells further down, so
# this reports on whichever model produced y_valid_hat most recently.
report = metrics.classification_report(y_valid, y_valid_hat)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.89      0.90       152
           1       0.68      0.72      0.70       162
           2       0.77      0.55      0.64        89
           3       0.52      0.45      0.48        51
           4       0.70      0.76      0.73       607
           5       0.90      0.88      0.89       464
           6       0.73      0.86      0.79       194
           7       0.74      0.46      0.57       151

    accuracy                           0.77      1870
   macro avg       0.74      0.70      0.71      1870
weighted avg       0.77      0.77      0.76      1870



## hyperparameter 튜닝

**1. C, gamma 조절**

In [17]:
from sklearn.svm import SVC

# Candidate values; every (C, gamma) combination is fitted once.
C_settings = [0.1, 1, 10, 50]
gamma_settings = [0.1, 0.01, 0.001, 0.0001]
results = []

# Same visiting order as a nested loop: C is the outer axis, gamma the inner.
for C, gamma in [(c, g) for c in C_settings for g in gamma_settings]:
    # Vary C and gamma; no kernel is passed, so SVC's default kernel applies.
    # NOTE(review): per the scikit-learn docs random_state only matters when
    # probability=True, so it is presumably a no-op here — confirm.
    svc = SVC(C=C, gamma=gamma, random_state=20)
    svc.fit(X_train_scaled, y_train)

    # Score both splits so the train/validation gap is visible.
    y_train_hat = svc.predict(X_train_scaled)
    train_accuracy = accuracy_score(y_train, y_train_hat)

    y_valid_hat = svc.predict(X_valid_scaled)
    valid_accuracy = accuracy_score(y_valid, y_valid_hat)

    results.append({
        'C': C,
        'gamma': gamma,
        'train_accuracy': train_accuracy,
        'valid_accuracy': valid_accuracy,
    })

# One row per (C, gamma) pair.
results_table = pd.DataFrame(results)
display(results_table)

Unnamed: 0,C,gamma,train_accuracy,valid_accuracy
0,0.1,0.1,0.810093,0.783957
1,0.1,0.01,0.759094,0.734225
2,0.1,0.001,0.596113,0.572727
3,0.1,0.0001,0.324358,0.324599
4,1.0,0.1,0.895506,0.824064
5,1.0,0.01,0.826498,0.788235
6,1.0,0.001,0.748039,0.724064
7,1.0,0.0001,0.603067,0.579144
8,10.0,0.1,0.921362,0.829947
9,10.0,0.01,0.87393,0.815508


**2. kernel 조정**

In [25]:
# Compare kernels with every other SVC setting left at its default.
kernels = ['linear', 'rbf', 'poly', 'sigmoid']
results = []

for kernel in kernels:
    # Vary only the kernel.
    svc = SVC(kernel=kernel, random_state=20)
    svc.fit(X_train_scaled, y_train)

    # Score both splits so the train/validation gap is visible.
    y_train_hat = svc.predict(X_train_scaled)
    train_accuracy = accuracy_score(y_train, y_train_hat)

    y_valid_hat = svc.predict(X_valid_scaled)
    valid_accuracy = accuracy_score(y_valid, y_valid_hat)

    results.append({
        'kernel': kernel,
        'train_accuracy': train_accuracy,
        'valid_accuracy': valid_accuracy,
    })

# One row per kernel.
kernel_table = pd.DataFrame(results)
display(kernel_table)

Unnamed: 0,kernel,train_accuracy,valid_accuracy
0,linear,0.808131,0.770588
1,rbf,0.864836,0.813904
2,poly,0.882668,0.817647
3,sigmoid,0.62607,0.616043


**3. kernel = poly로 정하고 C랑 gamma 다시 튜닝**

In [27]:
from sklearn.svm import SVC

# Repeat the (C, gamma) grid, this time with the polynomial kernel fixed.
C_settings = [0.1, 1, 10, 50]
gamma_settings = [0.1, 0.01, 0.001, 0.0001]
results = []

# Same visiting order as a nested loop: C is the outer axis, gamma the inner.
for C, gamma in [(c, g) for c in C_settings for g in gamma_settings]:
    svc = SVC(C=C, gamma=gamma, kernel='poly', random_state=20)
    svc.fit(X_train_scaled, y_train)

    # Score both splits so the train/validation gap is visible.
    y_train_hat = svc.predict(X_train_scaled)
    train_accuracy = accuracy_score(y_train, y_train_hat)

    y_valid_hat = svc.predict(X_valid_scaled)
    valid_accuracy = accuracy_score(y_valid, y_valid_hat)

    results.append({
        'C': C,
        'gamma': gamma,
        'train_accuracy': train_accuracy,
        'valid_accuracy': valid_accuracy,
    })

# One row per (C, gamma) pair.
poly_table = pd.DataFrame(results)
display(poly_table)

Unnamed: 0,C,gamma,train_accuracy,valid_accuracy
0,0.1,0.1,0.901213,0.818182
1,0.1,0.01,0.324358,0.324599
2,0.1,0.001,0.324358,0.324599
3,0.1,0.0001,0.324358,0.324599
4,1.0,0.1,0.918688,0.826738
5,1.0,0.01,0.729494,0.711765
6,1.0,0.001,0.324358,0.324599
7,1.0,0.0001,0.324358,0.324599
8,10.0,0.1,0.921719,0.830481
9,10.0,0.01,0.846291,0.798396


**4. 최종 hyperparameter 선정 > kernel = poly, C = 50, gamma=0.1**

## test score

In [29]:
# Refit with the selected hyperparameters on the scaled training split and
# report accuracy on the scaled test split.
svc = SVC(kernel='poly', C=50.0, gamma=0.1)
svc.fit(X_train_scaled, y_train)

y_test_hat = svc.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_hat)

print("test score: %.3f" % test_accuracy)

test score: 0.839
