# 결측치 탐색
수집된 ITS 소통정보데이터 중 2023년 1월의 5분 주기로 측정된 모든 데이터에서 결측치 탐색한다.

In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as ddf

## 데이터 로드
pandas만을 사용한다면 메모리 용량의 한계에 다다르기 때문에 대규모 용량의 데이터에 최적화된 패키지인 dask를 사용하여 데이터를 로드 하였다.

In [2]:
for i in range(31):
    # Dask DataFrame을 읽어옵니다.
    df = ddf.read_csv(f'/Volumes/Expansion/traffic-prediction/data/its-소통/1/its_c_1_{i + 1}.csv', dtype='object')

    # 열 이름 지정
    df.columns = ['생성일', '생성시분', '링크ID', '도로관리기관', '통행속도', '통행시간']
    
    # '도로관리기관' 열이 존재하는지 확인 후 드롭
    if '도로관리기관' in df.columns:
        df = df.drop('도로관리기관', axis=1)
    
    # 데이터 타입 변경
    df = df.astype({
        '생성일': 'int64',
        '생성시분': 'int64',
        '링크ID': 'object',
        '통행속도': 'float64',
        '통행시간': 'int64'
    })
    
    # CSV로 저장
    output_path = f'/Volumes/Expansion/traffic-prediction/data/its-소통/1/its_c_1_{i + 1}_m1.csv'
    df.to_csv(output_path, index=False, single_file=True)
    
    # 메모리에서 Dask DataFrame 해제
    del df

### 1일

In [14]:
its_c_1_1_dask = ddf.read_csv('/Volumes/Expansion/traffic-prediction/data/its-소통/1/its_c_1_1.csv', dtype={'1000000100': 'object', '35': 'float64'})

In [15]:
# its_c_1_1_dask.columns = ['1_c', '2_c','3_c', '4_c', '5_c', '6_c']

In [16]:
its_c_1_1_dask.head()

Unnamed: 0,20230101,0000,1000000100,Unnamed: 3,35,0
0,20230101,0,1000000200,,36.0,0
1,20230101,0,1000000300,,13.0,0
2,20230101,0,1000000301,,22.0,0
3,20230101,0,1000000302,,21.0,0
4,20230101,0,1000000303,,6.0,0


In [17]:
for partition in its_c_1_1_dask.to_delayed():
    partition_its_c_1_1 = partition.compute()

In [18]:
partition_its_c_1_1

Unnamed: 0,20230101,0000,1000000100,Unnamed: 3,35,0
0,20230101,2345,2510159608,,26.0,0
1,20230101,2345,2510159700,,25.5,0
2,20230101,2345,2510159800,,21.0,0
3,20230101,2345,2510159900,,31.0,0
4,20230101,2345,2510159901,,20.5,0
...,...,...,...,...,...,...
1616864,20230101,2355,4180383901,,4.0,0
1616865,20230101,2355,4180384001,,97.0,0
1616866,20230101,2355,4180384101,,76.0,0
1616867,20230101,2355,4180384201,,115.0,0


In [19]:
partition_its_c_1_1.columns = ['생성일', '생성시분', '링크ID', '도로관리기관', '통행속도', '통행시간']

In [20]:
partition_its_c_1_1.tail()

Unnamed: 0,생성일,생성시분,링크ID,도로관리기관,통행속도,통행시간
1616864,20230101,2355,4180383901,,4.0,0
1616865,20230101,2355,4180384001,,97.0,0
1616866,20230101,2355,4180384101,,76.0,0
1616867,20230101,2355,4180384201,,115.0,0
1616868,20230101,2355,4180384301,,79.0,0


In [21]:
null_data = partition_its_c_1_1.isnull().sum()

In [22]:
# csv로 저장
null_data.to_csv('2023_1_1_소통_null.csv', index=False)

In [35]:
partition_its_c_1_1['도로관리기관'].describe

<bound method NDFrame.describe of 0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
           ..
1616864   NaN
1616865   NaN
1616866   NaN
1616867   NaN
1616868   NaN
Name: 도로관리기관, Length: 1616869, dtype: float64>

### 2일

In [1]:
import os
for i in range(31):
    path = f'/Volumes/Expansion/traffic-prediction/data/its-소통/1/its_c_1_{i + 1}.csv'
    if os.path.exists(path):
        os.remove(path)