## 2주차 실습

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [6]:
# Load the mobile-phone dataset from the local CSV copy.
file_path = "C:/AI/2week/mobile.csv"
mobile_df = pd.read_csv(file_path)

# Separate the features from the target column.
X_new = mobile_df.drop(columns=["price_range"])  # feature matrix
# Encode the target as integer class ids. The fitted encoder is kept in
# `label_encoder` (rather than discarded) so that `label_encoder.classes_` and
# `label_encoder.inverse_transform(...)` can map predictions back to the
# original price_range values.
label_encoder = LabelEncoder()
y_new = label_encoder.fit_transform(mobile_df["price_range"])

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions.
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new, y_new, test_size=0.2, random_state=42, stratify=y_new
)

# Standardize the features for the SVM and logistic-regression cells.
# The scaler is fitted on the training split only and then applied to the
# test split.
scaler = StandardScaler()
X_train_scaled_new = scaler.fit_transform(X_train_new)
X_test_scaled_new = scaler.transform(X_test_new)

In [8]:
# Decision tree: fit on the raw (unscaled) training features, then score on the test split.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_new, y_train_new)
dt_pred_new = dt_model.predict(X_test_new)

# Header and per-class report in a single call; sep="\n" puts each on its own line.
print(
    "\nDecision Tree Classification Report:",
    classification_report(y_test_new, dt_pred_new),
    sep="\n",
)
print(f"Decision Tree Accuracy: {accuracy_score(y_test_new, dt_pred_new):.2%}")


Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.92      0.90       100
           1       0.78      0.74      0.76       100
           2       0.75      0.80      0.77       100
           3       0.92      0.86      0.89       100

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400

Decision Tree Accuracy: 83.00%


In [10]:
# Random forest (100 trees): fit on the raw (unscaled) training features, then score on the test split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_new, y_train_new
)
rf_pred_new = rf_model.predict(X_test_new)

# Header and per-class report in a single call; sep="\n" puts each on its own line.
print(
    "\nRandom Forest Classification Report:",
    classification_report(y_test_new, rf_pred_new),
    sep="\n",
)
print(f"Random Forest Accuracy: {accuracy_score(y_test_new, rf_pred_new):.2%}")


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       100
           1       0.82      0.84      0.83       100
           2       0.81      0.79      0.80       100
           3       0.93      0.93      0.93       100

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400

Random Forest Accuracy: 88.00%


In [12]:
# Linear-kernel SVM: fit on the standardized training features, then score on the standardized test split.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train_scaled_new, y_train_new)
svm_pred_new = svm_model.predict(X_test_scaled_new)

# Header and per-class report in a single call; sep="\n" puts each on its own line.
print(
    "\nSVM Classification Report:",
    classification_report(y_test_new, svm_pred_new),
    sep="\n",
)
print(f"SVM Accuracy: {accuracy_score(y_test_new, svm_pred_new):.2%}")


SVM Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       100
           1       0.95      0.95      0.95       100
           2       0.95      0.95      0.95       100
           3       0.96      0.99      0.98       100

    accuracy                           0.96       400
   macro avg       0.96      0.96      0.96       400
weighted avg       0.96      0.96      0.96       400

SVM Accuracy: 96.25%


In [14]:
# Logistic regression (up to 200 solver iterations): fit on the standardized
# training features, then score on the standardized test split.
lr_model = LogisticRegression(max_iter=200, random_state=42).fit(
    X_train_scaled_new, y_train_new
)
lr_pred_new = lr_model.predict(X_test_scaled_new)

# Header and per-class report in a single call; sep="\n" puts each on its own line.
print(
    "\nLogistic Regression Classification Report:",
    classification_report(y_test_new, lr_pred_new),
    sep="\n",
)
print(f"Logistic Regression Accuracy: {accuracy_score(y_test_new, lr_pred_new):.2%}")


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       100
           1       0.96      0.96      0.96       100
           2       0.95      0.94      0.94       100
           3       0.96      0.98      0.97       100

    accuracy                           0.96       400
   macro avg       0.97      0.96      0.96       400
weighted avg       0.97      0.96      0.96       400

Logistic Regression Accuracy: 96.50%


## 데이터프레임으로 가져오기

In [48]:
# Load the CSV from a local copy that was downloaded from the web as a raw file

import pandas as pd

# Location of the downloaded file on disk
file_path = "C:/AI/2week/mobile.csv"

# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [22]:
# Load the CSV directly from a web URL

import pandas as pd

# Raw-file URL of the CSV hosted on GitHub
url = "https://raw.githubusercontent.com/MyungKyuYi/AI-class/main/mobile.csv"

# Read the remote CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [28]:
# Load the Excel workbook from a local copy that was downloaded from the web as a raw file

import pandas as pd

# Location of the downloaded workbook on disk
file_path = "C:/AI/2week/combined_dataset-1.xlsx"

# Read the workbook into a DataFrame using the openpyxl engine
df = pd.read_excel(io=file_path, engine='openpyxl')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Num.,subject_ID,Sex(M/F),Age(year),Height(cm),Weight(kg),Systolic Blood Pressure(mmHg),Diastolic Blood Pressure(mmHg),Heart Rate(b/m),BMI(kg/m^2),...,2091,2092,2093,2094,2095,2096,2097,2098,2099,2100
0,1,2,Female,45,152,63,161,89,97,27.268006,...,1766,1766,1766,1833,1833,1827,1827,1827,1754,1754
1,1,2,Female,45,152,63,161,89,97,27.268006,...,1985,1985,2026,2026,2026,1977,1977,1997,1997,1997
2,1,2,Female,45,152,63,161,89,97,27.268006,...,1942,1900,1900,1938,1938,1938,1924,1924,1929,1929
3,2,3,Female,50,157,50,160,93,76,20.284799,...,2073,2072,2072,2072,2051,2051,2036,2036,2036,2045
4,2,3,Female,50,157,50,160,93,76,20.284799,...,2021,2010,2010,2010,2001,2001,2003,2003,2003,1989


In [26]:
# Load the Excel workbook directly from a web URL

import pandas as pd

# Raw-file URL of the workbook hosted on GitHub
url = "https://raw.githubusercontent.com/MyungKyuYi/AI-class/main/combined_dataset-1.xlsx"

# Read the remote workbook into a DataFrame using the openpyxl engine
df = pd.read_excel(io=url, engine='openpyxl')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Num.,subject_ID,Sex(M/F),Age(year),Height(cm),Weight(kg),Systolic Blood Pressure(mmHg),Diastolic Blood Pressure(mmHg),Heart Rate(b/m),BMI(kg/m^2),...,2091,2092,2093,2094,2095,2096,2097,2098,2099,2100
0,1,2,Female,45,152,63,161,89,97,27.268006,...,1766,1766,1766,1833,1833,1827,1827,1827,1754,1754
1,1,2,Female,45,152,63,161,89,97,27.268006,...,1985,1985,2026,2026,2026,1977,1977,1997,1997,1997
2,1,2,Female,45,152,63,161,89,97,27.268006,...,1942,1900,1900,1938,1938,1938,1924,1924,1929,1929
3,2,3,Female,50,157,50,160,93,76,20.284799,...,2073,2072,2072,2072,2051,2051,2036,2036,2036,2045
4,2,3,Female,50,157,50,160,93,76,20.284799,...,2021,2010,2010,2010,2001,2001,2003,2003,2003,1989


## KNN을 이용한 붓꽃 데이터 분류

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

In [46]:
# Load the iris dataset from the local CSV copy
file_path = "C:/AI/1week/iris.csv"
df = pd.read_csv(file_path)

# 1. Split the columns into target and features
y = df.iloc[:, -1].values   # last column -> class label
X = df.iloc[:, :-1].values  # every other column -> features

# 2. Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. Build and fit a k-nearest-neighbours classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# 4. Predict the class of each test row
y_pred = knn.predict(X_test)

# 5. Report the test accuracy as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"KNN 모델 정확도: {accuracy * 100:.2f}%")

KNN 모델 정확도: 100.00%
