In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the balanced dataset from the working directory and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='balanced_dataset.csv')
df.head(n=10)

Unnamed: 0,Path,Vehicle_ID,Turn,Lane,Speed,Hazard,ISSUE_DATE,VEHICLE_ID,VEHICLE_CLASS,LONGITUDE,LATITUDE,HEADING,SPEED,BRAKE_STATUS,ACC_SEC,VEHICLE_TYPE,CURRENT_LANE,Group_ID
0,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:10,A4ED90F2,9228.0,126.857587,35.144362,264.0,7.0,0.0,28.0,bus,0,0
1,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:11,A4ED90F2,9228.0,126.857561,35.14436,264.0,7.0,0.0,0.0,bus,0,0
2,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:12,A4ED90F2,9228.0,126.857536,35.144357,262.0,7.0,0.0,28.0,bus,0,0
3,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:13,A4ED90F2,9228.0,126.85751,35.144357,266.0,7.0,0.0,0.0,bus,0,0
4,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:14,A4ED90F2,9228.0,126.857482,35.144357,274.0,9.0,0.0,28.0,bus,0,0
5,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:15,A4ED90F2,9228.0,126.857453,35.14436,281.0,11.0,0.0,56.0,bus,0,0
6,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:16,A4ED90F2,9228.0,126.857419,35.144373,303.0,13.0,0.0,56.0,bus,0,0
7,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:17,A4ED90F2,9228.0,126.857387,35.144399,325.0,12.0,0.0,-28.0,bus,0,0
8,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:18,A4ED90F2,9228.0,126.857366,35.144429,338.0,14.0,0.0,56.0,bus,0,0
9,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:19,A4ED90F2,9228.0,126.857356,35.144466,354.0,16.0,0.0,56.0,bus,0,0


In [4]:
def split_by_group_id(df, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_state=42):
    """
    Split a DataFrame into train / validation / test sets at Group_ID granularity.

    The unique Group_ID values are partitioned first, and each row then follows
    its Group_ID, so all rows sharing a Group_ID land in the same partition.
    The ratios therefore apply to the number of groups, not the number of rows.

    Parameters:
    - df (pd.DataFrame): DataFrame containing a 'Group_ID' column
    - train_ratio (float): fraction of groups assigned to the training set
    - val_ratio (float): fraction of groups assigned to the validation set
    - test_ratio (float): fraction of groups assigned to the test set
    - random_state (int): seed forwarded to both train_test_split calls;
      the default (42) reproduces the previously hard-coded behaviour

    Returns:
    - train_df, val_df, test_df: the three row subsets of df

    Raises:
    - ValueError: if any ratio is not strictly between 0 and 1, or if the
      three ratios do not sum to 1
    """
    # 0. Validate the ratios up front. Without this check, ratios that do not
    #    sum to 1 would silently yield different proportions than requested,
    #    because test_ratio is only ever used relative to val_ratio below.
    ratios = {
        "train_ratio": train_ratio,
        "val_ratio": val_ratio,
        "test_ratio": test_ratio,
    }
    for ratio_name, ratio_value in ratios.items():
        if not 0 < ratio_value < 1:
            raise ValueError(
                f"{ratio_name} must be strictly between 0 and 1, got {ratio_value!r}"
            )
    ratio_sum = train_ratio + val_ratio + test_ratio
    if abs(ratio_sum - 1.0) > 1e-9:
        raise ValueError(
            f"train_ratio + val_ratio + test_ratio must equal 1, got {ratio_sum!r}"
        )

    # 1. Collect the unique group identifiers.
    group_column = df['Group_ID']
    group_ids = group_column.unique()

    # 2. Partition the group identifiers: first train vs. the remainder ...
    train_ids, temp_ids = train_test_split(
        group_ids, test_size=(1 - train_ratio), random_state=random_state
    )
    # ... then divide the remainder between validation and test according to
    # their relative weights (val_ratio : test_ratio).
    val_ids, test_ids = train_test_split(
        temp_ids,
        test_size=(test_ratio / (test_ratio + val_ratio)),
        random_state=random_state,
    )

    print(f"Train Group_ID 개수: {len(train_ids)}")
    print(f"Validation Group_ID 개수: {len(val_ids)}")
    print(f"Test Group_ID 개수: {len(test_ids)}")

    # 3. Select the rows belonging to each partition.
    train_df = df[group_column.isin(train_ids)]
    val_df = df[group_column.isin(val_ids)]
    test_df = df[group_column.isin(test_ids)]

    return train_df, val_df, test_df

In [5]:
# Split the dataset into train / validation / test partitions by Group_ID.
train_df, val_df, test_df = split_by_group_id(
    df,
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,
)

Train Group_ID 개수: 31674
Validation Group_ID 개수: 6787
Test Group_ID 개수: 6788


In [10]:
# Preview the first twelve rows of the training partition.
train_df.head(n=12)

Unnamed: 0,Path,Vehicle_ID,Turn,Lane,Speed,Hazard,ISSUE_DATE,VEHICLE_ID,VEHICLE_CLASS,LONGITUDE,LATITUDE,HEADING,SPEED,BRAKE_STATUS,ACC_SEC,VEHICLE_TYPE,CURRENT_LANE,Group_ID
20,V_220809_C_A_59168562_011790.csv,59168562,Right,R-Side,False,True,2022-08-09 15:42:30,59168562,9228.0,126.920616,35.142475,350.0,23.0,0.0,28.0,bus,0,2
21,V_220809_C_A_59168562_011790.csv,59168562,Right,R-Side,False,True,2022-08-09 15:42:31,59168562,9228.0,126.92061,35.142538,358.0,24.0,0.0,28.0,bus,0,2
22,V_220809_C_A_59168562_011790.csv,59168562,Right,R-Side,False,True,2022-08-09 15:42:32,59168562,9228.0,126.920613,35.1426,6.0,23.0,0.0,0.0,bus,0,2
23,V_220809_C_A_59168562_011790.csv,59168562,Right,R-Side,False,True,2022-08-09 15:42:33,59168562,9228.0,126.920626,35.142662,10.0,24.0,0.0,28.0,bus,0,2
24,V_220809_C_A_59168562_011790.csv,59168562,Right,R-Side,False,True,2022-08-09 15:42:34,59168562,9228.0,126.920649,35.142725,19.0,25.0,0.0,28.0,bus,0,2
25,V_220809_C_A_59168562_011790.csv,59168562,Right,R-Side,False,True,2022-08-09 15:42:35,59168562,9228.0,126.920676,35.142792,18.0,28.0,0.0,28.0,bus,0,2
26,V_220809_C_A_59168562_011790.csv,59168562,Right,R-Side,False,True,2022-08-09 15:42:36,59168562,9228.0,126.920707,35.142862,20.0,30.0,0.0,56.0,bus,0,2
27,V_220809_C_A_59168562_011790.csv,59168562,Right,R-Side,False,True,2022-08-09 15:42:37,59168562,9228.0,126.92074,35.142933,21.0,31.0,0.0,28.0,bus,0,2
28,V_220809_C_A_59168562_011790.csv,59168562,Right,R-Side,False,True,2022-08-09 15:42:38,59168562,9228.0,126.920777,35.143011,20.0,32.0,0.0,56.0,bus,0,2
29,V_220809_C_A_59168562_011790.csv,59168562,Right,R-Side,False,True,2022-08-09 15:42:39,59168562,9228.0,126.92082,35.143099,21.0,33.0,0.0,28.0,bus,0,2


In [9]:
# Preview the first twelve rows of the validation partition.
val_df.head(n=12)

Unnamed: 0,Path,Vehicle_ID,Turn,Lane,Speed,Hazard,ISSUE_DATE,VEHICLE_ID,VEHICLE_CLASS,LONGITUDE,LATITUDE,HEADING,SPEED,BRAKE_STATUS,ACC_SEC,VEHICLE_TYPE,CURRENT_LANE,Group_ID
10,V_221105_C_F_FFE811C9_000103.csv,FFE811C9,False,False,False,True,2022-11-05 11:40:37,FFE811C9,9220.0,126.871459,35.134052,52.0,10.0,80.0,0.0,car,0,1
11,V_221105_C_F_FFE811C9_000103.csv,FFE811C9,False,False,False,True,2022-11-05 11:40:38,FFE811C9,9220.0,126.871488,35.134069,54.0,12.0,80.0,56.0,car,0,1
12,V_221105_C_F_FFE811C9_000103.csv,FFE811C9,False,False,False,True,2022-11-05 11:40:39,FFE811C9,9220.0,126.871516,35.134086,52.0,10.0,80.0,-56.0,car,0,1
13,V_221105_C_F_FFE811C9_000103.csv,FFE811C9,False,False,False,True,2022-11-05 11:40:40,FFE811C9,9220.0,126.871537,35.134098,51.0,5.0,80.0,-83.0,car,0,1
14,V_221105_C_F_FFE811C9_000103.csv,FFE811C9,False,False,False,True,2022-11-05 11:40:41,FFE811C9,9220.0,126.871548,35.134105,51.0,2.0,80.0,-111.0,car,0,1
15,V_221105_C_F_FFE811C9_000103.csv,FFE811C9,False,False,False,True,2022-11-05 11:40:42,FFE811C9,9220.0,126.871554,35.134108,51.0,2.0,80.0,0.0,car,0,1
16,V_221105_C_F_FFE811C9_000103.csv,FFE811C9,False,False,False,True,2022-11-05 11:40:43,FFE811C9,9220.0,126.87156,35.134112,51.0,3.0,80.0,28.0,car,0,1
17,V_221105_C_F_FFE811C9_000103.csv,FFE811C9,False,False,False,True,2022-11-05 11:40:44,FFE811C9,9220.0,126.87157,35.134118,51.0,4.0,80.0,28.0,car,0,1
18,V_221105_C_F_FFE811C9_000103.csv,FFE811C9,False,False,False,True,2022-11-05 11:40:45,FFE811C9,9220.0,126.871583,35.134127,55.0,4.0,80.0,0.0,car,0,1
19,V_221105_C_F_FFE811C9_000103.csv,FFE811C9,False,False,False,True,2022-11-05 11:40:46,FFE811C9,9220.0,126.871599,35.134136,51.0,6.0,80.0,28.0,car,0,1


In [11]:
# Preview the first twelve rows of the test partition.
test_df.head(n=12)

Unnamed: 0,Path,Vehicle_ID,Turn,Lane,Speed,Hazard,ISSUE_DATE,VEHICLE_ID,VEHICLE_CLASS,LONGITUDE,LATITUDE,HEADING,SPEED,BRAKE_STATUS,ACC_SEC,VEHICLE_TYPE,CURRENT_LANE,Group_ID
0,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:10,A4ED90F2,9228.0,126.857587,35.144362,264.0,7.0,0.0,28.0,bus,0,0
1,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:11,A4ED90F2,9228.0,126.857561,35.14436,264.0,7.0,0.0,0.0,bus,0,0
2,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:12,A4ED90F2,9228.0,126.857536,35.144357,262.0,7.0,0.0,28.0,bus,0,0
3,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:13,A4ED90F2,9228.0,126.85751,35.144357,266.0,7.0,0.0,0.0,bus,0,0
4,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:14,A4ED90F2,9228.0,126.857482,35.144357,274.0,9.0,0.0,28.0,bus,0,0
5,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:15,A4ED90F2,9228.0,126.857453,35.14436,281.0,11.0,0.0,56.0,bus,0,0
6,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:16,A4ED90F2,9228.0,126.857419,35.144373,303.0,13.0,0.0,56.0,bus,0,0
7,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:17,A4ED90F2,9228.0,126.857387,35.144399,325.0,12.0,0.0,-28.0,bus,0,0
8,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:18,A4ED90F2,9228.0,126.857366,35.144429,338.0,14.0,0.0,56.0,bus,0,0
9,V_220902_C_A_A4ED90F2_066801.csv,A4ED90F2,Right,R-Side,False,True,2022-09-02 13:11:19,A4ED90F2,9228.0,126.857356,35.144466,354.0,16.0,0.0,56.0,bus,0,0


In [12]:
# Write each partition to its own CSV file, leaving out the DataFrame index.
for out_name, partition in (
    ("train_dataset.csv", train_df),
    ("val_dataset.csv", val_df),
    ("test_dataset.csv", test_df),
):
    partition.to_csv(out_name, index=False)