In [1]:
#데이터 불러오기
import pandas as pd

url = "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv"
df = pd.read_csv(url)
print(df.head())

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [2]:
#결측치 처리
print(df.isnull().sum())

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [3]:
#엔코딩
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['sex'] = le.fit_transform(df['sex'])       # female=0, male=1
df['smoker'] = le.fit_transform(df['smoker'])  # no=0, yes=1
df['region'] = le.fit_transform(df['region'])  # northeast=0, northwest=1, southeast=2, southwest=3

In [4]:
print(df.head())

   age  sex     bmi  children  smoker  region      charges
0   19    0  27.900         0       1       3  16884.92400
1   18    1  33.770         1       0       2   1725.55230
2   28    1  33.000         3       0       2   4449.46200
3   33    1  22.705         0       0       1  21984.47061
4   32    1  28.880         0       0       1   3866.85520


In [5]:
# charges의 중간값(median)을 기준으로 고비용/저비용 분류
median_charge = df['charges'].median()
df['high_cost'] = (df['charges'] >= median_charge).astype(int)

print(df[['charges', 'high_cost']].head())

       charges  high_cost
0  16884.92400          1
1   1725.55230          0
2   4449.46200          0
3  21984.47061          1
4   3866.85520          0


In [None]:
# #Step 5: 분류 문제용 데이터 분리(train/test)
# 	•	X (입력): charges, high_cost를 제외한 나머지 특성들
# 	•	y (정답): high_cost

from sklearn.model_selection import train_test_split

X = df.drop(['charges', 'high_cost'], axis=1)
y = df['high_cost']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
import torch

# 입력(X)은 float32 타입
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)

# 출력(y)은 float32 타입 + (2D로 reshape)
y_train = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [8]:
import torch.nn as nn

class ClassificationModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 16)  # 은닉층: 16개 노드
        self.fc2 = nn.Linear(16, 1)                 # 출력층: 1개 노드
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

model = ClassificationModel()

In [None]:
# Step 8: 학습 준비 (손실 함수 + 옵티마이저 설정)
import torch.optim as optim

criterion = nn.BCELoss()  # 이진 분류 손실 함수
optimizer = optim.Adam(model.parameters(), lr=0.001)  # 옵티마이저 설정

In [11]:
epochs = 100

for epoch in range(epochs):
    model.train()
    
    # 순전파
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    
    # 역전파
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # 10 에포크마다 출력
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 0.8848
Epoch [20/100], Loss: 0.5995
Epoch [30/100], Loss: 0.6077
Epoch [40/100], Loss: 0.6003
Epoch [50/100], Loss: 0.5847
Epoch [60/100], Loss: 0.5833
Epoch [70/100], Loss: 0.5799
Epoch [80/100], Loss: 0.5779
Epoch [90/100], Loss: 0.5760
Epoch [100/100], Loss: 0.5740


In [12]:
# Step 10: 모델 평가 (테스트 데이터 정확도 확인)
model.eval()
with torch.no_grad():
    preds = model(X_test)
    preds_class = (preds >= 0.5).float()  # 0.5 기준으로 분류
    
    correct = (preds_class == y_test).sum().item()
    acc = correct / y_test.size(0)
    
    print(f'Test Accuracy: {acc:.4f}')

Test Accuracy: 0.7463


In [13]:
# high_cost 컬럼 제거하고 회귀용 데이터셋 준비
X = df.drop(['charges', 'high_cost'], axis=1)
y = df['charges']

from sklearn.model_selection import train_test_split

# 다시 train/test 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
import torch

X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [15]:
import torch.nn as nn

class RegressionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 16)  # 은닉층
        self.fc2 = nn.Linear(16, 1)                  # 출력층
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

model = RegressionModel()

In [None]:
# 📍 Step 4: 손실 함수 및 옵티마이저 설정 (회귀용)
import torch.optim as optim

criterion = nn.MSELoss()  # 회귀용 손실 함수
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam 옵티마이저

In [17]:
epochs = 300

for epoch in range(epochs):
    model.train()
    
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 30 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [30/300], Loss: 322380352.0000
Epoch [60/300], Loss: 322209120.0000
Epoch [90/300], Loss: 321978336.0000
Epoch [120/300], Loss: 321676672.0000
Epoch [150/300], Loss: 321303648.0000
Epoch [180/300], Loss: 320853056.0000
Epoch [210/300], Loss: 320313408.0000
Epoch [240/300], Loss: 319678944.0000
Epoch [270/300], Loss: 318950144.0000
Epoch [300/300], Loss: 318129120.0000


In [None]:
# 손실함수 MSE 
model.eval()
with torch.no_grad():
    preds = model(X_test)
    mse = criterion(preds, y_test)
    print(f'Test MSE: {mse.item():.4f}')

Test MSE: 319133056.0000


In [19]:
model.eval()
with torch.no_grad():
    preds = model(X_test)
    mse = criterion(preds, y_test)
    print(f'Test MSE: {mse.item():.4f}')

Test MSE: 319133056.0000


In [None]:
#  R2 Score 평가
from sklearn.metrics import r2_score

# numpy로 변환해서 r2_score 계산
r2 = r2_score(y_test.numpy(), preds.numpy())
print(f'Test R2 Score: {r2:.4f}')

Test R2 Score: -1.0556


In [22]:
import pandas as pd

# 데이터 읽기
df = pd.read_csv('BP_data.csv')

# 데이터 구조 보기
print(df.head())


   Patient_Number  Blood_Pressure_Abnormality  Level_of_Hemoglobin  \
0               1                           1                11.28   
1               2                           0                 9.75   
2               3                           1                10.79   
3               4                           0                11.00   
4               5                           1                14.17   

   Genetic_Pedigree_Coefficient  Age  BMI  Sex  Pregnancy  Smoking  \
0                          0.90   34   23    1        1.0        0   
1                          0.23   54   33    1        NaN        0   
2                          0.91   70   49    0        NaN        0   
3                          0.43   71   50    0        NaN        0   
4                          0.83   52   19    0        NaN        0   

   Physical_activity  salt_content_in_the_diet  alcohol_consumption_per_day  \
0              45961                     48071                          NaN   


In [23]:
print(df.isnull().sum())

Patient_Number                      0
Blood_Pressure_Abnormality          0
Level_of_Hemoglobin                 0
Genetic_Pedigree_Coefficient       92
Age                                 0
BMI                                 0
Sex                                 0
Pregnancy                        1558
Smoking                             0
Physical_activity                   0
salt_content_in_the_diet            0
alcohol_consumption_per_day       242
Level_of_Stress                     0
Chronic_kidney_disease              0
Adrenal_and_thyroid_disorders       0
dtype: int64


In [24]:
# 1. Pregnancy 컬럼 삭제
df = df.drop(columns=['Pregnancy'])

# 2. 나머지 NaN 평균값으로 채우기
df['Genetic_Pedigree_Coefficient'] = df['Genetic_Pedigree_Coefficient'].fillna(df['Genetic_Pedigree_Coefficient'].mean())
df['alcohol_consumption_per_day'] = df['alcohol_consumption_per_day'].fillna(df['alcohol_consumption_per_day'].mean())

# NaN이 제대로 처리됐는지 확인
print(df.isnull().sum())

Patient_Number                   0
Blood_Pressure_Abnormality       0
Level_of_Hemoglobin              0
Genetic_Pedigree_Coefficient     0
Age                              0
BMI                              0
Sex                              0
Smoking                          0
Physical_activity                0
salt_content_in_the_diet         0
alcohol_consumption_per_day      0
Level_of_Stress                  0
Chronic_kidney_disease           0
Adrenal_and_thyroid_disorders    0
dtype: int64


In [25]:
df = df.drop(columns=['Patient_Number'])

In [26]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# 데이터 불러오기
df = pd.read_csv('BP_data.csv')

# 1. NaN 처리
df = df.drop(columns=['Pregnancy'])  # Pregnancy 컬럼 삭제
df['Genetic_Pedigree_Coefficient'] = df['Genetic_Pedigree_Coefficient'].fillna(df['Genetic_Pedigree_Coefficient'].mean())
df['alcohol_consumption_per_day'] = df['alcohol_consumption_per_day'].fillna(df['alcohol_consumption_per_day'].mean())

# 2. 불필요 컬럼 삭제
df = df.drop(columns=['Patient_Number'])

# 3. 데이터 타입 점검 (생략 가능 - 다 숫자형)

# 4. 이상치 점검 (선택, 이번에는 생략)

# 5. 타겟 분리
X = df.drop('Blood_Pressure_Abnormality', axis=1)
y = df['Blood_Pressure_Abnormality']

# 6. 스케일링
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [28]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# 1. 데이터 불러오기
df = pd.read_csv('파일명.csv')

# 2. NaN 확인
print(df.isnull().sum())

# 3. NaN 처리
df = df.drop(columns=['삭제할컬럼명'])
df['컬럼명'] = df['컬럼명'].fillna(df['컬럼명'].mean())

# 4. 불필요한 컬럼 삭제
df = df.drop(columns=['Patient_Number'])

# 5. 데이터 타입 점검 (필요시 Label Encoding)
# le = LabelEncoder()
# df['컬럼명'] = le.fit_transform(df['컬럼명'])

# 6. X, y 분리
X = df.drop('타겟컬럼명', axis=1)
y = df['타겟컬럼명']

# 7. 스케일링
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 8. train/test 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

FileNotFoundError: [Errno 2] No such file or directory: '파일명.csv'