In [1]:
import pandas as pd
import numpy as np

# Load the TSV file into a pandas dataframe
df = pd.read_csv('dem_beh_eval_label.tsv', delimiter='\t')

# Split the dataframe by label
label_0 = df[df.iloc[:, 3] == 0]
label_1 = df[df.iloc[:, 3] == 1]

# Calculate the number of samples for each set
n_0 = len(label_0)
n_1 = len(label_1)

train_size = int(0.8 * (n_0 + n_1))
val_size = int(0.1 * (n_0 + n_1))

# Shuffle the samples for each label
label_0 = label_0.sample(frac=1, random_state=42)
label_1 = label_1.sample(frac=1, random_state=42)

# Split each label into sets
train_0 = label_0[:int(train_size * n_0 / (n_0 + n_1))]
train_1 = label_1[:int(train_size * n_1 / (n_0 + n_1))]

val_0 = label_0[int(train_size * n_0 / (n_0 + n_1)):int((train_size + val_size) * n_0 / (n_0 + n_1))]
val_1 = label_1[int(train_size * n_1 / (n_0 + n_1)):int((train_size + val_size) * n_1 / (n_0 + n_1))]

test_0 = label_0[int((train_size + val_size) * n_0 / (n_0 + n_1)):]
test_1 = label_1[int((train_size + val_size) * n_1 / (n_0 + n_1)):]

# Concatenate the sets for each label
train = pd.concat([train_0, train_1])
val = pd.concat([val_0, val_1])
test = pd.concat([test_0, test_1])

# Shuffle the sets
train = train.sample(frac=1, random_state=42)
val = val.sample(frac=1, random_state=42)
test = test.sample(frac=1, random_state=42)

# Save the sets to separate TSV files
train.to_csv('train.tsv', sep='\t', index=False)
val.to_csv('val.tsv', sep='\t', index=False)
test.to_csv('test.tsv', sep='\t', index=False)

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7279 entries, 6100 to 4415
Data columns (total 4 columns):
 #   Column                                                                                                                                          Non-Null Count  Dtype 
---  ------                                                                                                                                          --------------  ----- 
 0   2000년 출생, 남자, 대한민국 국적, 대전광역시 서구 내동에 있는 공립 외국어고등학교인 대전외국어고등학교 2019년 졸업, 신입 수시1 가톨릭교회지도자추천 2019년 입학, 국제·법정경계열 국제학부 전공, 경제학과 복수전공, 국가장학금 받지 않음 수령  7279 non-null   object
 1   전공성적평균 매우 미흡, 교양성적평균 보통, 동아리활동 함, 학술활동 하지 않음, 봉사활동 하지 않음, 국가장학금 외 장학금 평균보다 매우 적게 받음                                                             7279 non-null   object
 2   강의평가 : 보통, 강의 평가 세부 내용 : 평가없음                                                                                                                   7279 non-null   object
 3   0                         

In [18]:
df_counts = df.iloc[:, 3].value_counts()
train_counts = train.iloc[:, 3].value_counts()
val_counts = val.iloc[:, 3].value_counts()
test_counts = test.iloc[:, 3].value_counts()

In [19]:
df_counts

0    8337
1     763
Name: 0, dtype: int64

In [11]:
train_counts

0    6669
1     610
Name: 0, dtype: int64

In [13]:
val_counts

0    834
1     76
Name: 0, dtype: int64

In [14]:
test_counts

0    834
1     77
Name: 0, dtype: int64

In [15]:
len(train)

7279

In [16]:
len(val)

910

In [17]:
len(test)

911