### 用死傷人數做 OLS 模型

In [43]:
import pandas as pd
import numpy as np

# Load the A1 file and the twelve A2 files (A2_1 .. A2_12).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The `pd.read_csv` line echoed once per A2 file in
# this cell's output looks like a mixed-dtype DtypeWarning -- TODO confirm.
a1 = pd.read_csv("data/A1.csv", low_memory=False)
a2_list = [
    pd.read_csv(f"data/A2_{i}.csv", low_memory=False) for i in range(1, 13)
]

# Stack all files into one frame and keep only the rows whose party order
# ("當事者順位") equals 1.
df = pd.concat([a1] + a2_list, ignore_index=True)
df = df[df["當事者順位"] == 1]

# The "死亡受傷人數" column is text; pull out the digits that follow "死亡"
# (deaths) and "受傷" (injuries), convert them to int, and add them up.
# astype(int) raises if a row does not match the pattern, which is kept on
# purpose so malformed rows are not silently counted as zero.
casualty_text = df["死亡受傷人數"]
df["死亡人數"] = casualty_text.str.extract(r"死亡(\d+)")[0].astype(int)
df["受傷人數"] = casualty_text.str.extract(r"受傷(\d+)")[0].astype(int)
df["死傷人數"] = df["死亡人數"] + df["受傷人數"]

# Keep ages strictly between 0 and 100. The lower bound already removes the
# value -1, so no separate "age != -1" filter is needed.
df = df[(df["當事者事故發生時年齡"] > 0) & (df["當事者事故發生時年齡"] < 100)]

# Print the frequency table of each derived count column.
for count_column in ("死亡人數", "受傷人數", "死傷人數"):
    print(df[count_column].value_counts())


  df = pd.read_csv(f"data/A2_{i}.csv")
  df = pd.read_csv(f"data/A2_{i}.csv")
  df = pd.read_csv(f"data/A2_{i}.csv")
  df = pd.read_csv(f"data/A2_{i}.csv")
  df = pd.read_csv(f"data/A2_{i}.csv")
  df = pd.read_csv(f"data/A2_{i}.csv")
  df = pd.read_csv(f"data/A2_{i}.csv")
  df = pd.read_csv(f"data/A2_{i}.csv")
  df = pd.read_csv(f"data/A2_{i}.csv")
  df = pd.read_csv(f"data/A2_{i}.csv")
  df = pd.read_csv(f"data/A2_{i}.csv")
  df = pd.read_csv(f"data/A2_{i}.csv")


死亡人數
0    396252
1      1708
2        37
3         3
4         1
Name: count, dtype: int64
受傷人數
1     280533
2      98495
3      14825
4       2199
0       1232
5        444
6        148
7         58
8         34
9         11
10         6
11         5
12         4
13         2
19         1
22         1
14         1
40         1
15         1
Name: count, dtype: int64
死傷人數
1     281356
2      98815
3      14882
4       2215
5        453
6        148
7         63
8         32
9         14
10         6
11         5
13         3
12         3
19         1
0          1
26         1
14         1
41         1
15         1
Name: count, dtype: int64


In [44]:
import statsmodels.api as sm

# Response: total casualties ("死傷人數").
# Regressor: the party's age at the time of the accident.
y = df["死傷人數"]
X = df["當事者事故發生時年齡"]

# Add the intercept (constant) column to the design matrix.
X = sm.add_constant(X)

# Fit the OLS model and print the regression summary.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   死傷人數   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     887.9
Date:                Fri, 23 May 2025   Prob (F-statistic):          7.05e-195
Time:                        18:17:22   Log-Likelihood:            -3.7046e+05
No. Observations:              398001   AIC:                         7.409e+05
Df Residuals:                  397999   BIC:                         7.410e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.4176      0.003    565.157      0.0

### 受傷人數與年齡、光線、天候的關係

In [None]:
import pandas as pd
import statsmodels.formula.api as smf

# Keep only rows where accident category, weather, party gender and party age
# are all present.
df = df.dropna(subset=["事故類別名稱", "天候名稱", "當事者屬-性-別名稱", "當事者事故發生時年齡"])
# Keep ages strictly between 0 and 100; the lower bound already excludes -1.
df = df[(df["當事者事故發生時年齡"] > 0) & (df["當事者事故發生時年齡"] < 100)]

# Collapse the lighting categories into two groups: "無照明" (no lighting) and
# "有照明" (lit). One mapping replaces the earlier chain of replace() calls;
# the keys are disjoint from the target labels, so the result is the same.
light_groups = {
    "夜間(或隧道、地下道、涵洞)無照明": "無照明",
    "有照明未開啟或故障": "無照明",
    "晨或暮光": "無照明",
    "有照明且開啟": "有照明",
    "日間自然光線": "有照明",
    "夜間(或隧道、地下道、涵洞)有照明": "有照明",
}
df["光線名稱"] = df["光線名稱"].replace(light_groups)

# drop_first=True drops the first category so it acts as the baseline level.
light_dummies = pd.get_dummies(df["光線名稱"], prefix="light", drop_first=True)
print(df["光線名稱"].value_counts())

# Collapse the weather categories into two groups: "雨" and "晴".
weather_groups = {
    "風": "雨",
    "陰": "晴",
    "暴雨": "雨",
    "雪": "雨",
    "強風": "雨",
    "風沙": "雨",
    "霧或煙": "雨",
}
df["天候名稱"] = df["天候名稱"].replace(weather_groups)

weather_dummies = pd.get_dummies(df["天候名稱"], prefix="weather", drop_first=True)
print(df["天候名稱"].value_counts())

# Merge the oily, muddy and icy/snowy road-surface states into "濕潤" (wet).
df["路面狀況-路面狀態名稱"] = df["路面狀況-路面狀態名稱"].replace({
    "油滑": "濕潤",
    "泥濘": "濕潤",
    "冰雪": "濕潤",
})

# Road-surface dummies, again with the first category dropped as baseline.
# They are built here but are not part of the model fitted in this block.
road_dummies = pd.get_dummies(df["路面狀況-路面狀態名稱"], prefix="road", drop_first=True)

age = df["當事者事故發生時年齡"]

# Assemble the modelling frame: injury count as the response, plus age and the
# lighting / weather dummies as predictors.
df_model = pd.concat([df["受傷人數"], age.rename("age"), light_dummies, weather_dummies], axis=1)

# Build the formula from every column except the response.
predictors = df_model.columns.difference(["受傷人數"])
formula = "受傷人數 ~ " + " + ".join(predictors)

# Fit the OLS model and print the regression summary.
model = smf.ols(formula, data=df_model)
result = model.fit()
print(result.summary())


光線名稱
有照明    264181
無照明    133820
Name: count, dtype: int64
天候名稱
晴    366361
雨     31640
Name: count, dtype: int64
                            OLS Regression Results                            
Dep. Variable:                   受傷人數   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     503.7
Date:                Fri, 23 May 2025   Prob (F-statistic):               0.00
Time:                        18:19:05   Log-Likelihood:            -3.7157e+05
No. Observations:              398001   AIC:                         7.432e+05
Df Residuals:                  397997   BIC:                         7.432e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
----------

### 肇事人年齡與路況、光線的關係


假設一：無照明時，年長者反應較慢，因此較容易肇事。

假設二：路面濕潤時，年輕人騎車速度較快，因此較容易出事。

In [50]:

# Assemble the modelling frame: age as the response, next to the road-surface
# and lighting dummy columns.
df_model = pd.concat(
    [age.to_frame(name="age"), road_dummies, light_dummies],
    axis="columns",
)

# Every column other than the response becomes a predictor in the formula.
predictors = df_model.columns.difference(["age"])
formula = f"age ~ {' + '.join(predictors)}"

# Fit the OLS model and print the regression summary.
model = smf.ols(formula=formula, data=df_model)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                    age   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     752.1
Date:                Fri, 23 May 2025   Prob (F-statistic):               0.00
Time:                        18:20:29   Log-Likelihood:            -1.7118e+06
No. Observations:              398001   AIC:                         3.424e+06
Df Residuals:                  397998   BIC:                         3.424e+06
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            42.0363      0.03