In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# --- Configuration ---------------------------------------------------------
# Every tunable value lives here so that the data path, split ratio, model
# size and seed are each defined exactly once instead of being repeated as
# literals further down.
DATA_PATH = "online_shoppers_intention.csv"
TARGET_COLUMN = 'Revenue'
TEST_SIZE = 0.2
N_ESTIMATORS = 100
SEED = 42  # shared by the train/test split and the random forest

# --- Load data -------------------------------------------------------------
df = pd.read_csv(DATA_PATH)

# Cast the target to integers so the classifier sees numeric class labels
# (the recorded report lists the classes as 0 and 1).
# Presumably the raw column is boolean -- verify against the CSV.
df[TARGET_COLUMN] = df[TARGET_COLUMN].astype(int)

# --- Feature groups --------------------------------------------------------
# Columns that are standardized.
numeric_features = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates',
    'PageValues', 'SpecialDay'
]

# Columns that are one-hot encoded.
categorical_features = [
    'Month', 'OperatingSystems', 'Browser',
    'Region', 'TrafficType', 'VisitorType', 'Weekend'
]

# Split into feature matrix and target vector.
X = df[numeric_features + categorical_features]
y = df[TARGET_COLUMN]

# --- Preprocessing ---------------------------------------------------------
# Scale the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' stops the encoder from raising on categories that
# appear only in the test split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# --- Train/test split ------------------------------------------------------
# NOTE(review): the recorded report shows an imbalanced target (roughly 5:1
# negatives to positives in the test split). Passing stratify=y would keep the
# class ratio identical in both splits; it is not applied here because it
# changes the split and therefore every metric reported below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# --- Model -----------------------------------------------------------------
# Wrapping the preprocessor and classifier in one Pipeline means the scaler
# and encoder are fitted on the training split only.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=SEED))
])

# Fit on the training split.
clf.fit(X_train, y_train)

# --- Evaluation ------------------------------------------------------------
# Predict on the held-out split and report accuracy plus per-class metrics.
y_pred = clf.predict(X_test)
print("=== Accuracy ===")
print(accuracy_score(y_test, y_pred))
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))


=== Accuracy ===
0.8901054339010543

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      2055
           1       0.76      0.49      0.60       411

    accuracy                           0.89      2466
   macro avg       0.83      0.73      0.77      2466
weighted avg       0.88      0.89      0.88      2466

