# 대구 교통사고 피해 예측 AI 경진대회
- 문제 링크: https://dacon.io/competitions/official/236193/data
- 제출 기한: 2023.11.15 ~ 2023.12.11 09:59

# 1. 라이브러리 불러오기

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
from torch import nn
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import utils
from torchsummary import summary
import torchvision.transforms as T
from sklearn.model_selection import train_test_split
from PIL import Image, ImageDraw, ImageFont
from glob import glob
import matplotlib.pyplot as plt
import cv2
import os
import copy
import numpy as np
import pandas as pd
import random
import geopandas as gpd
from tqdm import tqdm

# 2. 데이터 전처리
아이디어들
- 0은 순전파/역전파에서 아예 다음 노드로의 계산을 없애버리니까 좀 학습하기에 안 좋지 않을까 -> 자연어 처리의 희소 벡터에서 밀집 벡터로의 임베딩 처럼..
- 인간이 봐도 확실히 ECLO가 클 것 같은 유형에는 가중치를 휴리스틱하게 부여

In [3]:
files = glob('/content/drive/MyDrive/ml-class-rhseung/DACON/data/*.csv')

csv_files = { file.split('/')[-1][:-len('.csv')]: file for file in files }

csv_files

{'test': '/content/drive/MyDrive/ml-class-rhseung/DACON/data/test.csv',
 'sample_submission': '/content/drive/MyDrive/ml-class-rhseung/DACON/data/sample_submission.csv',
 'train': '/content/drive/MyDrive/ml-class-rhseung/DACON/data/train.csv'}

In [4]:
train_df = pd.read_csv(csv_files['train'])
test_df = pd.read_csv(csv_files['test'])

In [5]:
X_train_df = train_df.iloc[:, 1:-1]
Y_train_df = train_df.iloc[:, -1]

X_test_df = test_df.iloc[:, 1:]
# Y는 없음.

In [6]:
X_train_df['사고일시'] = pd.to_datetime(X_train_df['사고일시'])
X_test_df['사고일시'] = pd.to_datetime(X_test_df['사고일시'])

X_train_df['사고일시']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_df['사고일시'] = pd.to_datetime(X_test_df['사고일시'])


0       2019-01-01 00:00:00
1       2019-01-01 00:00:00
2       2019-01-01 01:00:00
3       2019-01-01 02:00:00
4       2019-01-01 04:00:00
                ...        
39604   2021-12-31 19:00:00
39605   2021-12-31 19:00:00
39606   2021-12-31 21:00:00
39607   2021-12-31 22:00:00
39608   2021-12-31 23:00:00
Name: 사고일시, Length: 39609, dtype: datetime64[ns]

In [7]:
dt = X_train_df['사고일시'].dt

X_train_df['year'] = dt.year
X_train_df['month'] = dt.month
X_train_df['day'] = dt.day
X_train_df['hour'] = dt.hour

X_train_df.head()

Unnamed: 0,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,가해운전자 차종,...,피해운전자 연령,피해운전자 상해정도,사망자수,중상자수,경상자수,부상자수,year,month,day,hour
0,2019-01-01 00:00:00,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,길가장자리구역통행중,안전운전불이행,승용,...,70세,중상,0,1,0,0,2019,1,1,0
1,2019-01-01 00:00:00,화요일,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,보도통행중,기타,승용,...,61세,경상,0,0,1,0,2019,1,1,0
2,2019-01-01 01:00:00,화요일,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,차도통행중,안전운전불이행,승용,...,38세,경상,0,0,1,0,2019,1,1,1
3,2019-01-01 02:00:00,화요일,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,승용,...,36세,중상,0,1,0,0,2019,1,1,2
4,2019-01-01 04:00:00,화요일,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,승용,...,52세,경상,0,0,1,0,2019,1,1,4


In [8]:
dt = X_test_df['사고일시'].dt

X_test_df['year'] = dt.year
X_test_df['month'] = dt.month
X_test_df['day'] = dt.day
X_test_df['hour'] = dt.hour

X_test_df.head()

Unnamed: 0,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,year,month,day,hour
0,2022-01-01 01:00:00,토요일,맑음,대구광역시 수성구 상동,교차로 - 교차로안,건조,차대사람,2022,1,1,1
1,2022-01-01 01:00:00,토요일,맑음,대구광역시 수성구 지산동,단일로 - 기타,건조,차대사람,2022,1,1,1
2,2022-01-01 04:00:00,토요일,맑음,대구광역시 수성구 수성동2가,교차로 - 교차로안,건조,차대차,2022,1,1,4
3,2022-01-01 04:00:00,토요일,맑음,대구광역시 수성구 신매동,단일로 - 기타,건조,차대차,2022,1,1,4
4,2022-01-01 06:00:00,토요일,맑음,대구광역시 달서구 감삼동,교차로 - 교차로안,건조,차대차,2022,1,1,6


In [9]:
X_test_df.drop(columns=['사고일시', '요일', '시군구'], inplace=True)
X_train_df = X_train_df[X_test_df.columns]

In [10]:
X_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39609 entries, 0 to 39608
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   기상상태    39609 non-null  object
 1   도로형태    39609 non-null  object
 2   노면상태    39609 non-null  object
 3   사고유형    39609 non-null  object
 4   year    39609 non-null  int64 
 5   month   39609 non-null  int64 
 6   day     39609 non-null  int64 
 7   hour    39609 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 2.4+ MB


In [11]:
X_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10963 entries, 0 to 10962
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   기상상태    10963 non-null  object
 1   도로형태    10963 non-null  object
 2   노면상태    10963 non-null  object
 3   사고유형    10963 non-null  object
 4   year    10963 non-null  int64 
 5   month   10963 non-null  int64 
 6   day     10963 non-null  int64 
 7   hour    10963 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 685.3+ KB


In [12]:
X_train_df.isnull().sum()

기상상태     0
도로형태     0
노면상태     0
사고유형     0
year     0
month    0
day      0
hour     0
dtype: int64

In [13]:
X_test_df.isnull().sum()

기상상태     0
도로형태     0
노면상태     0
사고유형     0
year     0
month    0
day      0
hour     0
dtype: int64

In [14]:
Y_train_df.head()

0    5
1    3
2    3
3    5
4    3
Name: ECLO, dtype: int64

In [15]:
# ['피해운전자 차종', '피해운전자 성별', '피해운전자 연령', '피해운전자 상해정도'] 의 nan -> 10 (차대사람은 더 심각한 사고일 것이므로)

X_train_df.fillna('nan', inplace=True)
X_test_df.fillna('nan', inplace=True)

In [16]:
X_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39609 entries, 0 to 39608
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   기상상태    39609 non-null  object
 1   도로형태    39609 non-null  object
 2   노면상태    39609 non-null  object
 3   사고유형    39609 non-null  object
 4   year    39609 non-null  int64 
 5   month   39609 non-null  int64 
 6   day     39609 non-null  int64 
 7   hour    39609 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 2.4+ MB


In [17]:
for column in X_train_df.columns:
  print(column, ':', X_train_df[column].unique())

기상상태 : ['맑음' '흐림' '기타' '비' '안개' '눈']
도로형태 : ['단일로 - 기타' '교차로 - 교차로안' '기타 - 기타' '단일로 - 터널' '단일로 - 지하차도(도로)내' '단일로 - 교량'
 '교차로 - 교차로횡단보도내' '주차장 - 주차장' '교차로 - 교차로부근' '단일로 - 고가도로위' '미분류 - 미분류']
노면상태 : ['건조' '젖음/습기' '서리/결빙' '기타' '침수' '적설']
사고유형 : ['차대사람' '차대차' '차량단독']
year : [2019 2020 2021]
month : [ 1  2  3  4  5  6  7  8  9 10 11 12]
day : [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31]
hour : [ 0  1  2  4  5  6 10 11 14 18 19 20 22  7  8 13 15 16 17 23  3 12 21  9]


- 기상 상태가 안 좋을 수록 ECLO가 더 커질 것이라고 추측
- 교통 상태와 도로가 혼잡할수록 ECLO가 커지거나, 오히려 고속도로같이 차들이 빠른 경우에도 ECLO가 커질 수 있음
- 노면상태가 안 좋을 수록 ECLO가 커질 것임
- 차대사람 같은 경우가 더 심각하다고 추측
- 보도통행중 사고는 정상적인 운전에서는 발생하지 않으므로 ECLO가 더 커질 수 있음
- 안전거리미확보 사고보단 중앙선침범 사고가 더 심각할거 같은데?
- 차체가 더 클수록 ECLO가 더 클 것
- 성별에 따른 경향이 있을까?
- 나이가 너무 어리거나(<20) 너무 많을수록 사고율이 증가할 것 같음

In [18]:
mappings = []

In [19]:
# int형 컬럼 전까지

for column in X_train_df.columns[:4]:
  mapping = {}
  for i, e in enumerate(X_train_df[column].unique()):
    mapping[i + 1] = e
    mapping[e] = i + 1
  mappings.append(mapping)

In [20]:
len(mappings), type(mappings)

(4, list)

In [21]:
class ADataset(Dataset):
    def __init__(self, X_np, Y_np, device='cpu'):
        self.X_np = X_np
        self.Y_np = Y_np
        self.device = device

    def __len__(self):
        return len(self.Y_np)

    def __getitem__(self, index):
        x, y = self.X_np[index], self.Y_np[index]
        return torch.FloatTensor(x).to(self.device), torch.FloatTensor([y]).to(self.device)

In [22]:
len(X_train_df), len(Y_train_df)

(39609, 39609)

In [23]:
X = X_train_df.values
for i, mapping in enumerate(mappings):
    X[:, i] = np.vectorize(mapping.get)(X[:, i])

X = X.astype(int)

X.shape, X.dtype

((39609, 8), dtype('int64'))

In [24]:
Y = Y_train_df.values

Y.shape, Y.dtype

((39609,), dtype('int64'))

In [25]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=224, shuffle=True)

X_train.shape, X_valid.shape, Y_train.shape, Y_valid.shape

((31687, 8), (7922, 8), (31687,), (7922,))

In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
train_dset = ADataset(X_train, Y_train, device)
valid_dset = ADataset(X_valid, Y_valid, device)

len(train_dset), len(valid_dset)

(31687, 7922)

In [27]:
train_dset[0]

(tensor([4.0000e+00, 1.0000e+00, 2.0000e+00, 2.0000e+00, 2.0200e+03, 7.0000e+00,
         2.5000e+01, 6.0000e+00], device='cuda:0'),
 tensor([11.], device='cuda:0'))

In [28]:
batch_size = 64
train_loader = DataLoader(train_dset, batch_size, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid_dset, batch_size, shuffle=True, drop_last=True)

In [68]:
model = nn.Sequential(
    nn.Linear(8, 32),
    nn.ReLU(),
    nn.Linear(32, 64),
    nn.ReLU(),
    nn.Linear(64, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1),
).to(device)
cost = nn.MSELoss().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.003)

In [69]:
summary(model, (8,))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 32]             288
              ReLU-2                   [-1, 32]               0
            Linear-3                   [-1, 64]           2,112
              ReLU-4                   [-1, 64]               0
            Linear-5                  [-1, 128]           8,320
              ReLU-6                  [-1, 128]               0
            Linear-7                   [-1, 64]           8,256
              ReLU-8                   [-1, 64]               0
            Linear-9                   [-1, 32]           2,080
             ReLU-10                   [-1, 32]               0
           Linear-11                    [-1, 1]              33
Total params: 21,089
Trainable params: 21,089
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/ba

In [70]:
x, y = next(iter(train_loader))
x.shape, y.shape, model(x).shape

(torch.Size([64, 8]), torch.Size([64, 1]), torch.Size([64, 1]))

In [71]:
model(x)

tensor([[-1.2506],
        [-1.2444],
        [-1.2777],
        [-1.2786],
        [-1.2789],
        [-1.2741],
        [-1.3131],
        [-1.2890],
        [-1.2483],
        [-1.2957],
        [-1.2678],
        [-1.3112],
        [-1.3068],
        [-1.3118],
        [-1.2472],
        [-1.2528],
        [-1.3295],
        [-1.2430],
        [-1.2522],
        [-1.2859],
        [-1.3051],
        [-1.2472],
        [-1.2987],
        [-1.2834],
        [-1.2700],
        [-1.2747],
        [-1.2458],
        [-1.2777],
        [-1.2694],
        [-1.3091],
        [-1.2723],
        [-1.2606],
        [-1.2832],
        [-1.2931],
        [-1.3033],
        [-1.2452],
        [-1.3158],
        [-1.3091],
        [-1.2770],
        [-1.2510],
        [-1.2995],
        [-1.2541],
        [-1.3037],
        [-1.2754],
        [-1.3041],
        [-1.2866],
        [-1.2451],
        [-1.3162],
        [-1.2926],
        [-1.2913],
        [-1.3146],
        [-1.3005],
        [-1.

In [72]:
epochs = 30
for epoch in range(epochs):
    with tqdm(train_loader, unit='batch') as loader:
        loader.set_description(f'Epoch {epoch+1:02d}/{epochs}')

        for x, y in loader:
            optimizer.zero_grad()

            pred = model(x)
            loss = cost(pred, y)
            loss.backward()

            loader.set_postfix(loss=loss.item())

            optimizer.step()

Epoch 01/30: 100%|██████████| 495/495 [00:04<00:00, 104.92batch/s, loss=nan]
Epoch 02/30:  15%|█▌        | 76/495 [00:00<00:04, 95.30batch/s, loss=nan]


KeyboardInterrupt: ignored

In [None]:
from sklearn.metrics import accuracy_score

with torch.no_grad():
    X_valid_t = torch.FloatTensor(X_valid).to(device)
    Y_valid_t = torch.FloatTensor(Y_valid).view(-1, 1)

    pred = model(X_valid_t).cpu().numpy().squeeze(1)
    y = Y_valid_t.numpy().squeeze(1)

    print(np.unique(pred))

    x = np.arange(len(y))
    plt.scatter(x, y, label='y', s=0.1)
    plt.scatter(x, pred, label='pred', s=0.1)
    plt.legend()
    plt.show()