In [8]:
import random
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter
from pathlib import Path
from lightgbm import LGBMClassifier

from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier


In [9]:
warnings.filterwarnings("ignore")

In [10]:
# CSV loading helpers
def dataframe_from_csv(target):
  """Read one CSV file and return it with whitespace-trimmed column names."""
  frame = pd.read_csv(target)
  # Headers may carry stray spaces around the names; normalise them.
  return frame.rename(columns=str.strip)

def dataframe_from_csvs(targets):
  """Load several CSV files and stack them row-wise into one DataFrame.

  Args:
    targets: Iterable of CSV paths; each one is loaded with dataframe_from_csv.

  Returns:
    The concatenated DataFrame. The per-file row index is kept as-is
    (pd.concat is called without ignore_index), so index labels can repeat.

  Raises:
    ValueError: If targets is empty, e.g. because a glob matched no files.
  """
  frames = [dataframe_from_csv(path) for path in targets]
  if not frames:
    # pd.concat would raise a generic "No objects to concatenate"; name the
    # likely cause instead so a wrong data directory is obvious.
    raise ValueError('dataframe_from_csvs: no CSV files to load (targets is empty)')
  return pd.concat(frames)

In [11]:
# Root directory that holds train/, val/, test.csv and sample_submission.csv.
data_path = 'dataset'

# Sort the file lists so the row order of the concatenated frames is reproducible.
train_files = sorted(Path(data_path, 'train').glob('*.csv'))
val_files = sorted(Path(data_path, 'val').glob('*.csv'))
assert train_files and val_files, (
    f'no CSV files found under {data_path}/train or {data_path}/val'
)

train = dataframe_from_csvs(train_files)
val = dataframe_from_csvs(val_files)
# Load test through the same helper as train/val so its column names are
# stripped the same way and the feature columns match at prediction time.
test = dataframe_from_csv(f'{data_path}/test.csv')
print(f'train: {len(train)}')
print(f'validation: {len(val)}')
print(f'test: {len(test)}')

train: 62564
validation: 7820
test: 7820


In [12]:
# Identifier columns that are dropped from every split before modelling.
ID_COLUMNS = ['site', 'sid']
# Integer code assigned to each leak-type label.
LEAKTYPE_TO_ID = {'out': 0, 'in': 1, 'noise': 2, 'other': 3, 'normal': 4}

train = train.drop(ID_COLUMNS, axis=1)
val = val.drop(ID_COLUMNS, axis=1)
test = test.drop(ID_COLUMNS, axis=1)

# Assign the encoded column back explicitly: calling
# replace(..., inplace=True) on frame['leaktype'] acts on a temporary Series
# and leaves the frame unchanged when pandas Copy-on-Write is active.
# astype('int64') raises if a label outside LEAKTYPE_TO_ID was mapped to NaN,
# so unknown labels fail here instead of leaking into training.
train['leaktype'] = train['leaktype'].map(LEAKTYPE_TO_ID).astype('int64')
val['leaktype'] = val['leaktype'].map(LEAKTYPE_TO_ID).astype('int64')
# Placeholder label column for test, filled with empty strings.
test['leaktype']=""

# Split each frame into features (x) and target (y).
train_x=train.drop(['leaktype'], axis=1)
train_y=train['leaktype']
val_x=val.drop(['leaktype'], axis=1)
val_y=val['leaktype']
test_x=test.drop(['leaktype'], axis=1)
test_y=test['leaktype']

In [13]:
def print_num_of_classes(df, name):
    """Print the sample count and percentage share of every class label.

    Args:
        df: Iterable of integer class ids used to index the label names
            (e.g. the encoded 'leaktype' column).
        name: Dataset name inserted into the header line.
    """
    id_to_label = ['out','in','noise','other','normal']
    labels = list(df)
    total = len(labels)
    per_class = Counter(labels)

    print(f'----LABELS in {name}_dataset----\n')
    # Classes are reported in the order they first appear in the input.
    for class_id in per_class:
        count = per_class[class_id]
        share = count / total * 100
        print(f'{id_to_label[class_id]}({class_id}): {count} 개 ({share:0.2f}%)')

    print('\n\n')

In [14]:
# Compare the label distribution of the training and validation targets.
print_num_of_classes(train_y, 'train')
print_num_of_classes(val_y, 'val')

----LABELS in train_dataset----

out(0): 17539 개 (28.03%)
in(1): 13273 개 (21.22%)
noise(2): 5029 개 (8.04%)
other(3): 7019 개 (11.22%)
normal(4): 19704 개 (31.49%)



----LABELS in val_dataset----

out(0): 2192 개 (28.03%)
in(1): 1659 개 (21.21%)
noise(2): 629 개 (8.04%)
other(3): 878 개 (11.23%)
normal(4): 2462 개 (31.48%)





In [17]:
# Training
# NOTE(review): df_clf is not defined in any cell visible here — confirm the
# cell that constructs the classifier runs before this one.
df_clf.fit(train_x, train_y)

In [18]:
# Validation inference: score the fitted model with macro-averaged F1.
val_pred = df_clf.predict(val_x)
val_f1 = f1_score(y_true=val_y, y_pred=val_pred, average='macro')
print("Validation F1 score: ", val_f1)

Validation F1 score:  0.5691106664340266


In [19]:
# Test inference
test_pred = df_clf.predict(test_x)

In [20]:
# Build the submission file: take the sample submission and set its
# 'leaktype' column to the test predictions.
# NOTE(review): test_pred presumably holds the integer class codes used for
# training — confirm the label format expected in the submission.
submission = (
    pd.read_csv(f'{data_path}/sample_submission.csv')
    .assign(leaktype=test_pred)
)

submission.to_csv(f'{data_path}/submission1.csv', index=False)