In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

file_path = "car_evaluation.csv"  # 로컬 파일 경로 지정

df = pd.read_csv(file_path)
print(df.columns)

# 결측치 확인 및 처리
df.fillna(method='ffill', inplace=True)

# 불필요한 컬럼 제거 (없음)

# 엔코딩 (모든 컬럼을 숫자로 변환)
label_encoders = {}
for column in df.columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# 변환 전 레이블 분포
before_encoding = df.iloc[:, -1].value_counts()

# 데이터 분할
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 모델 리스트
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=200),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC()
}

# 모델 학습 및 평가
results = {}
conf_matrices = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    results[name] = accuracy
    conf_matrices[name] = conf_matrix

# 변환 후 레이블 분포
after_encoding = df.iloc[:, -1].value_counts()

# 결과 출력
print("Accuracy Results:", results)
print("Before Encoding Label Distribution:\n", before_encoding)
print("After Encoding Label Distribution:\n", after_encoding)
print("Confusion Matrices:\n", conf_matrices)

Index(['vhigh', 'vhigh.1', '2', '2.1', 'small', 'low', 'unacc'], dtype='object')
Accuracy Results: {'Random Forest': 0.9855491329479769, 'Decision Tree': 0.9913294797687862, 'Logistic Regression': 0.6820809248554913, 'K-Nearest Neighbors': 0.930635838150289, 'Support Vector Machine': 0.930635838150289}
Before Encoding Label Distribution:
 unacc
2    1209
0     384
1      69
3      65
Name: count, dtype: int64
After Encoding Label Distribution:
 unacc
2    1209
0     384
1      69
3      65
Name: count, dtype: int64
Confusion Matrices:
 {'Random Forest': array([[ 74,   1,   2,   0],
       [  0,  14,   0,   0],
       [  1,   0, 241,   0],
       [  1,   0,   0,  12]]), 'Decision Tree': array([[ 76,   1,   0,   0],
       [  0,  14,   0,   0],
       [  1,   0, 241,   0],
       [  1,   0,   0,  12]]), 'Logistic Regression': array([[  6,   0,  63,   8],
       [  1,   0,  13,   0],
       [ 13,   0, 228,   1],
       [  3,   0,   8,   2]]), 'K-Nearest Neighbors': array([[ 66,   0,  11, 

  df.fillna(method='ffill', inplace=True)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# 1️⃣ 데이터 로딩
df = pd.read_csv("abalone.csv")

# 2️⃣ 데이터 전처리 (성별 Encoding)
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

# 3️⃣ 독립 변수(X)와 종속 변수(y) 설정
X = df.drop(columns=['Rings'])  # 입력 데이터
y = df['Rings']  # 예측할 타겟 값

# 4️⃣ Train-Test 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5️⃣ 여러 회귀 모델 선언
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Support Vector Machine": SVR(kernel='linear')
}

# 6️⃣ 모델 학습 & 평가
for name, model in models.items():
    model.fit(X_train, y_train)  # 모델 학습
    y_pred = model.predict(X_test)  # 예측 수행
    
    # MSE 및 R² Score 계산
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"{name} - MSE: {mse:.4f}, R² Score: {r2:.4f}")

Linear Regression - MSE: 4.9603, R² Score: 0.5418
Decision Tree - MSE: 6.6388, R² Score: 0.3867
Random Forest - MSE: 3.7356, R² Score: 0.6549


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, classification_report
import pandas as pd
import numpy as np

# 데이터 로딩
df = pd.read_csv("abalone.csv")

# id는 학습에 불필요하므로 제거
df = df.drop(columns=["id"])

# 입력 변수(X), 출력 변수(y)
X = df.drop(columns=["Rings"])
y_reg = df["Rings"]  # 회귀용 타겟

# 분류용 타겟 만들기: Rings를 나이 범주로 나누기
# 예: 0~7세: young, 8~10: middle, 11+: old
def categorize_rings(rings):
    if rings <= 7:
        return 'young'
    elif rings <= 10:
        return 'middle'
    else:
        return 'old'

y_clf = df["Rings"].apply(categorize_rings)  # 분류용 타겟

In [4]:
# 훈련/테스트 분리
X_train, X_test, y_train, y_test = train_test_split(X, y_reg, test_size=0.2, random_state=42)

# 전처리 + 선형 회귀 파이프라인
regression_pipeline = Pipeline(steps=[
    ("preprocess", ColumnTransformer(transformers=[
        ("onehot", OneHotEncoder(), ["Sex"])
    ], remainder="passthrough")),
    ("regressor", LinearRegression())
])

# 학습
regression_pipeline.fit(X_train, y_train)

# 예측 및 평가
y_pred = regression_pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("✅ 회귀 RMSE (Root Mean Squared Error):", rmse)

✅ 회귀 RMSE (Root Mean Squared Error): 2.2116130871218322


In [5]:
# 훈련/테스트 분리
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_clf, test_size=0.2, random_state=42)

# 전처리 + 분류 파이프라인
classification_pipeline = Pipeline(steps=[
    ("preprocess", ColumnTransformer(transformers=[
        ("onehot", OneHotEncoder(), ["Sex"])
    ], remainder="passthrough")),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

# 학습
classification_pipeline.fit(X_train_c, y_train_c)

# 예측 및 평가
y_pred_c = classification_pipeline.predict(X_test_c)
print("✅ 분류 Classification Report:\n", classification_report(y_test_c, y_pred_c))

✅ 분류 Classification Report:
               precision    recall  f1-score   support

      middle       0.66      0.69      0.68       380
         old       0.66      0.67      0.67       276
       young       0.80      0.71      0.75       180

    accuracy                           0.69       836
   macro avg       0.71      0.69      0.70       836
weighted avg       0.69      0.69      0.69       836



In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# 데이터 로딩
df = pd.read_csv("car_evaluation.csv")

# 컬럼 이름 정리
df.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# 입력(X), 출력(y) 분리
X = df.drop(columns=["class"])
y_clf = df["class"]  # 분류용 라벨

# 회귀용 타겟을 위해 class를 숫자로 인코딩
label_encoder = LabelEncoder()
y_reg = label_encoder.fit_transform(y_clf)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 학습/테스트 분리
X_train, X_test, y_train, y_test = train_test_split(X, y_clf, test_size=0.2, random_state=42)

# One-hot 인코딩 + 랜덤 포레스트 파이프라인
clf_pipeline = Pipeline(steps=[
    ("encoder", ColumnTransformer([
        ("onehot", OneHotEncoder(), X.columns)
    ], remainder='passthrough')),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

# 학습
clf_pipeline.fit(X_train, y_train)

# 예측 및 평가
y_pred = clf_pipeline.predict(X_test)
print("✅ 분류 결과:\n", classification_report(y_test, y_pred))

✅ 분류 결과:
               precision    recall  f1-score   support

         acc       0.91      0.95      0.93        77
        good       0.75      0.60      0.67        15
       unacc       1.00      0.99      1.00       237
       vgood       0.68      0.76      0.72        17

    accuracy                           0.95       346
   macro avg       0.84      0.83      0.83       346
weighted avg       0.95      0.95      0.95       346



In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# 학습/테스트 분리
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, test_size=0.2, random_state=42)

# One-hot + 선형 회귀 파이프라인
reg_pipeline = Pipeline(steps=[
    ("encoder", ColumnTransformer([
        ("onehot", OneHotEncoder(), X.columns)
    ], remainder='passthrough')),
    ("regressor", LinearRegression())
])

# 학습
reg_pipeline.fit(X_train_r, y_train_r)

# 예측 및 평가
y_pred_r = reg_pipeline.predict(X_test_r)
rmse = np.sqrt(mean_squared_error(y_test_r, y_pred_r))
print("✅ 회귀 RMSE:", rmse)

# 참고: 숫자 예측 결과를 다시 class로 디코딩하려면 아래처럼 가능
decoded = label_encoder.inverse_transform(np.round(y_pred_r).astype(int))

✅ 회귀 RMSE: 0.7719579196444836
