## 構造化データの分類問題

### 数値特徴量

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from typing import Tuple
from sklearn.metrics import precision_score, recall_score

# Load the pre-split training and validation sets from the working directory.
train_data = pd.read_csv('train_num.csv')
validation_data = pd.read_csv('validation_num.csv')

# Separate each frame into its feature columns and the 'target' label column.
# NOTE(review): the preview further down lists an 'Unnamed: 0' column; if that
# is a saved row index it is being used as a feature here — confirm.
X_train, y_train = train_data.drop('target', axis=1), train_data['target']
X_val, y_val = validation_data.drop('target', axis=1), validation_data['target']

# Fit a random forest with a fixed seed; fit() hands back the estimator itself.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict class labels for the validation features.
y_pred = clf.predict(X_val)


def compute_metrics(eval_pred: Tuple[np.ndarray, np.ndarray]) -> dict[str, float]:
    """Compute accuracy and macro-averaged precision/recall for label predictions.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of array-likes holding the
            predicted and the true class labels, in the same order.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"`` and ``"recall"``,
        each mapped to a plain Python float.
    """
    predictions, labels = eval_pred
    # Work on plain arrays so the element-wise comparison below is positional
    # and cannot be affected by a pandas index on either input.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    # The predictions are used as-is: they are already class labels, and
    # passing them through np.sign would collapse every label above 1 into 1.
    # 'macro' weights every class equally; switch to average='micro' (or
    # another option) if that suits the task better.
    precision = precision_score(labels, predictions, average='macro')
    recall = recall_score(labels, predictions, average='macro')
    accuracy = (predictions == labels).mean()
    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
    }

# Score the validation predictions, then pull the three metrics out by key.
metrics_dict = compute_metrics((y_pred, y_val))
accuracy, precision, recall = (
    metrics_dict[key] for key in ("accuracy", "precision", "recall")
)

# Report each metric on its own line.
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)



Accuracy: 0.6362252663622526
Precision: 0.6362581174284635
Recall: 0.636264431330845


### ダミー変数化

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from typing import Tuple
from sklearn.metrics import precision_score, recall_score

# Load the pre-split training and validation sets from the working directory.
train_data = pd.read_csv('train_num.csv')
validation_data = pd.read_csv('validation_num.csv')

# Columns to expand into dummy (one-hot) variables.
categorical_features = ['天候', '路面状態', '道路形状', '信号機', '車道幅員', '道路線形', '衝突地点']

# Stack training rows on top of validation rows and encode them in one pass,
# so both splits end up with the same dummy-column layout.
combined_data = pd.get_dummies(
    pd.concat([train_data, validation_data], ignore_index=True),
    columns=categorical_features,
    drop_first=True,
)

# Cut the encoded frame back into the two splits by row position (training
# rows come first) and remove the label column from the features.
# NOTE(review): the preview further down lists an 'Unnamed: 0' column; if that
# is a saved row index it is being used as a feature here — confirm.
X_train = combined_data.iloc[:len(train_data)].drop(columns=['target'])
X_val = combined_data.iloc[len(train_data):].drop(columns=['target'])

# Labels come straight from the frames as loaded.
y_train, y_val = train_data['target'], validation_data['target']

# Fit a random forest with a fixed seed; fit() hands back the estimator itself.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict class labels for the validation features.
y_pred = clf.predict(X_val)

def compute_metrics(eval_pred: Tuple[np.ndarray, np.ndarray]) -> dict[str, float]:
    """Compute accuracy and macro-averaged precision/recall for label predictions.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of array-likes holding the
            predicted and the true class labels, in the same order.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"`` and ``"recall"``,
        each mapped to a plain Python float.
    """
    predictions, labels = eval_pred
    # Work on plain arrays so the element-wise comparison below is positional
    # and cannot be affected by a pandas index on either input.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    # The predictions are used as-is: they are already class labels, and
    # passing them through np.sign would collapse every label above 1 into 1.
    # 'macro' weights every class equally; switch to average='micro' (or
    # another option) if that suits the task better.
    precision = precision_score(labels, predictions, average='macro')
    recall = recall_score(labels, predictions, average='macro')
    accuracy = (predictions == labels).mean()
    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
    }

# Score the validation predictions, then pull the three metrics out by key.
metrics_dict = compute_metrics((y_pred, y_val))
accuracy, precision, recall = (
    metrics_dict[key] for key in ("accuracy", "precision", "recall")
)

# Report each metric on its own line.
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)


Accuracy: 0.6255707762557078
Precision: 0.6255707762557077
Recall: 0.6255812498696749


{'accuracy': 0.6362252663622526}


In [None]:
import pandas as pd
import numpy as np

# Reload both CSV splits from the working directory (training file first).
train_data, validation_data = (
    pd.read_csv(path) for path in ('train_num.csv', 'validation_num.csv')
)

# Show up to the first 30 training rows as this cell's output.
train_data.head(n=30)


Unnamed: 0,発生日時月,発生日時日,発生日時時,発生日時分,天候,路面状態,道路形状,信号機,車道幅員,道路線形,衝突地点,target
0,1,17,21,59,1,1,14,7,3,9,1,0
1,12,28,17,42,2,4,1,1,15,9,30,1
2,1,6,19,30,2,2,14,7,2,9,1,1
3,1,6,11,41,5,2,14,7,2,9,1,0
4,1,25,6,4,1,1,7,7,3,9,1,0
5,12,18,12,54,1,1,1,7,14,9,30,1
6,12,30,17,40,1,1,1,7,14,9,30,1
7,12,15,21,14,1,1,1,1,19,9,30,1
8,1,8,15,52,1,1,1,1,15,9,30,1
9,1,14,12,7,1,1,14,7,2,9,1,0
