In [1]:
# 安裝需要的套件（Colab 預設已裝大部分）
!pip install pandas scikit-learn matplotlib seaborn

# 匯入套件
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# === Load the CSV data (upload the files yourself or mount Google Drive) ===
# The CSVs below must be reachable from the notebook's working directory:
# upload them to Colab or read them from Google Drive.


def _load_csv(filename):
    """Read ``filename`` into a DataFrame.

    Raises:
        FileNotFoundError: if pandas cannot find the file; the message says
            how to make the file available instead of only echoing the path.
    """
    try:
        return pd.read_csv(filename)
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"Cannot find {filename!r}. Upload it to the Colab session or "
            "mount Google Drive and point the path at the file before "
            "running this cell."
        ) from err


# Load using the example file names.
df_demo = _load_csv("microbiology_cultures_demographics.csv")
df_ward = _load_csv("microbiology_cultures_ward_info.csv")
df_nursing = _load_csv("microbiology_cultures_nursing_home_visits.csv")
# adi_scores.csv can be added as well if available, but handle its nulls
# first; otherwise memory use grows sharply.

# === Merge the tables ===
# NOTE: the merges run on the full tables; the 10,000-row cap further down is
# applied only afterwards, so it bounds the modelling step, not the merges.
merge_keys = ["anon_id", "order_proc_id_coded"]
df_merge = df_demo.merge(df_ward, on=merge_keys, how="inner")
df_merge = df_merge.merge(df_nursing, on=merge_keys, how="left")

# Drop rows that are missing any of the model's input columns.
df_clean = df_merge.dropna(subset=["age_bin", "sex", "ward_type"])

# Keep the first 10,000 rows for the simulated analysis. The explicit copy
# makes df_clean an independent frame, so the column assignment below does
# not write into a slice of the dropna result (SettingWithCopyWarning).
df_clean = df_clean.head(10000).copy()

# Simulated resistance label (in practice replace this with a real result,
# e.g. Ciprofloxacin resistance).
np.random.seed(0)
df_clean["resistant"] = np.random.choice([0, 1], size=len(df_clean), p=[0.7, 0.3])

# One-hot encode the categorical predictors, dropping the first level of each.
categorical_cols = ["age_bin", "sex", "ward_type"]
df_encoded = pd.get_dummies(df_clean[categorical_cols], drop_first=True)

# Feature matrix and target vector.
X, y = df_encoded, df_clean["resistant"]

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features; the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic-regression classifier on the scaled training split
# (fit() returns the estimator itself, so the call can be chained).
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Predict on the held-out split and print the per-class metrics.
y_pred = model.predict(X_test_scaled)
report = classification_report(y_test, y_pred)
print("Classification Report:\n")
print(report)

# Feature importance: model coefficients ordered by absolute size, largest
# first. The signed values are kept so the direction of each association
# stays visible in the chart.
importance = pd.Series(model.coef_[0], index=X.columns).sort_values(key=abs, ascending=False)

# Horizontal bar chart of the coefficients.
plt.figure(figsize=(10, 6))
sns.barplot(x=importance.values, y=importance.index)
plt.title("Logistic Regression Feature Importance (Risk Factors)")
# The bars are signed coefficients, not magnitudes, so label the axis to match.
plt.xlabel("Coefficient (signed, sorted by magnitude)")
plt.grid(True)
plt.tight_layout()
plt.show()



FileNotFoundError: [Errno 2] No such file or directory: 'microbiology_cultures_demographics.csv'