In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


file_path = "/Users/chaejinseong/aistart/WEEK1/mobile.csv"  # 본인이 iris.csv를 저장한 경로를 입력
df = pd.read_csv(file_path)

#  데이터프레임 확인
df.head()
print(df.columns)

# 특징(X)과 타겟(y) 분리
X = df.drop(columns=['price_range'])  # 'Name'이 정답(label) 컬럼일 가능성이 높음
y = df['price_range']

# 문자열 라벨을 숫자로 변환
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 데이터 분할 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision Tree 모델 학습 및 평가
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_y_pred = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")

# Random Forest 모델 학습 및 평가
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

# SVM 모델 학습 및 평가
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_y_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print(f"SVM Accuracy: {svm_accuracy:.4f}")

# Logistic Regression 모델 학습 및 평가
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_y_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")


Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')
Decision Tree Accuracy: 0.8350
Random Forest Accuracy: 0.8700
SVM Accuracy: 0.9650
Logistic Regression Accuracy: 0.6325


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 파일 경로 (사용자에 맞게 수정 필요)
file_path = "mobile.csv"  # 로컬 파일 경로 지정

df = pd.read_csv(file_path)

# 특징(X)과 타겟(y) 분리
X = df.drop(columns=['price_range'])  # price_range가 레이블
y = df['price_range']

# 데이터 분할 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 정규화 적용 (SVM과 Logistic Regression 성능 향상)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 모델 초기화
models = {
    "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='rbf', C=1.0),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# 모델 학습 및 평가
results = {}
for name, model in models.items():
    if name in ["SVM", "Logistic Regression"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")

# 최종 결과 출력
print("\n최종 모델 성능:")
for model, acc in results.items():
    print(f"{model}: {acc:.4f}")

print("y_test = " , y_test)

Decision Tree Accuracy: 0.8200
Random Forest Accuracy: 0.8800
SVM Accuracy: 0.8950
Logistic Regression Accuracy: 0.9650

최종 모델 성능:
Decision Tree: 0.8200
Random Forest: 0.8800
SVM: 0.8950
Logistic Regression: 0.9650
y_test =  501     3
1924    1
1710    0
547     2
496     3
       ..
1204    1
1937    2
412     1
243     2
724     1
Name: price_range, Length: 400, dtype: int64


In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
file_path = "titanic.csv"  # 로컬 파일 경로 지정

df = pd.read_csv(file_path)
print(df.columns)

df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], inplace=True)

# Label Encoding 적용할 컬럼
cols_to_encode = ["Sex", "Embarked"]

# Label Encoding 실행
label_encoders = {}
for col in cols_to_encode:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col].astype(str))  # NaN을 문자열로 변환 후 처리

# 최종 데이터 확인
print(df.head())

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    1  22.0      1      0   7.2500         2
1         1       1    0  38.0      1      0  71.2833         0
2         1       3    0  26.0      0      0   7.9250         2
3         1       1    0  35.0      1      0  53.1000         2
4         0       3    1  35.0      0      0   8.0500         2


In [14]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
url = "https://raw.githubusercontent.com/MyungKyuYi/AI-class/main/titanic.csv"
df = pd.read_csv(url)
print(df.columns)

df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], inplace=True)

# Label Encoding 적용할 컬럼
cols_to_encode = ["Sex", "Embarked"]

# Label Encoding 실행
label_encoders = {}
for col in cols_to_encode:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col].astype(str))  # NaN을 문자열로 변환 후 처리

# 최종 데이터 확인
print(df.head())

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    1  22.0      1      0   7.2500         2
1         1       1    0  38.0      1      0  71.2833         0
2         1       3    0  26.0      0      0   7.9250         2
3         1       1    0  35.0      1      0  53.1000         2
4         0       3    1  35.0      0      0   8.0500         2


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

file_path = "titanic.csv"  # 로컬 파일 경로 지정

df = pd.read_csv(file_path)
print(df.columns)

# 결측치 확인 및 처리
df['Age'].fillna(df['Age'].mean(), inplace=True)
df.drop(columns=['Cabin'], inplace=True)
df.dropna(inplace=True)

# 불필요한 컬럼 제거
df.drop(columns=['Name', 'Ticket', 'PassengerId'], inplace=True)

# 엔코딩 (Sex, Embarked 변환)
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# 변환 전 레이블 분포
before_encoding = df['Survived'].value_counts()

# 데이터 분할
X = df.drop(columns=['Survived'])
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 모델 리스트
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=200),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC()
}

# 모델 학습 및 평가
results = {}
conf_matrices = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    results[name] = accuracy
    conf_matrices[name] = conf_matrix

# 변환 후 레이블 분포
after_encoding = df['Survived'].value_counts()

# 결과 출력
print("Accuracy Results:", results)
print("Before Encoding Label Distribution:\n", before_encoding)
print("After Encoding Label Distribution:\n", after_encoding)
print("Confusion Matrices:\n", conf_matrices)


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Accuracy Results: {'Random Forest': 0.7921348314606742, 'Decision Tree': 0.7471910112359551, 'Logistic Regression': 0.8146067415730337, 'K-Nearest Neighbors': 0.6629213483146067, 'Support Vector Machine': 0.6573033707865169}
Before Encoding Label Distribution:
 Survived
0    549
1    340
Name: count, dtype: int64
After Encoding Label Distribution:
 Survived
0    549
1    340
Name: count, dtype: int64
Confusion Matrices:
 {'Random Forest': array([[94, 16],
       [21, 47]]), 'Decision Tree': array([[86, 24],
       [21, 47]]), 'Logistic Regression': array([[98, 12],
       [21, 47]]), 'K-Nearest Neighbors': array([[85, 25],
       [35, 33]]), 'Support Vector Machine': array([[97, 13],
       [48, 20]])}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
