# Dataset Split

## Imports

In [None]:
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

## Dataset

In [None]:
# Load the main dataset. SEM_No is read with object dtype so pandas does not
# infer a numeric type for it.
posi_mh_main = pd.read_csv(
    'dataset/posi_mh_main.csv',
    dtype={'SEM_No': object},
)

## Preprocessing

### Keep Only the DOT Group

In [None]:
# Keep only the rows whose `group` column equals "DOT" and renumber them
# from 0 so later positional/index-based operations start from a clean index.
dot_grp = (
    posi_mh_main.loc[posi_mh_main['group'] == 'DOT']
    .reset_index(drop=True)
)

### Label Class

```
Class -1 => Abnormal
Class 1 => Normal
```

In [None]:
# Label each row from its `Error` value: 'E' -> -1 (abnormal),
# anything else -> 1 (normal).
dot_grp['Class'] = [-1 if err == 'E' else 1 for err in dot_grp['Error']]

In [None]:
# Split the labelled rows by class; each subset gets a fresh 0-based index.
abnormal = dot_grp.query('Class == -1').reset_index(drop=True)
normal = dot_grp.query('Class == 1').reset_index(drop=True)

## Split

- split 하기 위해선 SEM_No랑 Class 열만 있으면 됨
- train test ratio를 맞춰서 split 해야함
    - Abnormal: 13개 (12%)
    - Normal: 95개 (88%)
- Normal sample 10개를 test set에 포함시키면 test set은 23개(Normal 10 + Abnormal 13), train set은 85개가 되어 test:train 비율이 대략 21:79가 됨

In [None]:
# Total number of rows (one per SEM image) in the DOT group.
total_sem_img = len(dot_grp)

In [None]:
# Fraction of the DOT group that falls in each class.
normal_ratio = len(normal) / total_sem_img
abnormal_ratio = len(abnormal) / total_sem_img

print(f'Normal Ratio: {normal_ratio}')
print(f'Abnormal Ratio: {abnormal_ratio}')

In [None]:
# Hold out 10 normal samples for the test set.
# A fixed random_state makes the draw reproducible: re-running the notebook
# regenerates the same train/test split (and the same saved CSVs) instead of
# a different random split on every run.
# sample() raises ValueError if `normal` has fewer than 10 rows.
test_normal_set = normal.sample(n=10, random_state=42)

normal set에서 뺀 10개를 test set에 추가하고 기존 normal set에서 제거

In [None]:
# The training set is every normal row that was not held out for testing.
# The held-out rows keep their labels from `normal`'s index, so dropping
# those labels removes exactly the sampled rows.
train_set = normal.drop(index=test_normal_set.index).reset_index(drop=True)

In [None]:
# The test set stacks the held-out normal rows on top of all abnormal rows,
# with a fresh 0-based index.
test_set = pd.concat(
    (test_normal_set, abnormal),
    ignore_index=True,
)

In [None]:
# Report the (rows, columns) shape of both splits, one per line.
print(
    f'Train set size: {train_set.shape}',
    f'Test set size: {test_set.shape}',
    sep='\n',
)

In [None]:
# Target labels (-1 / 1) of the test set.
test_y = test_set.loc[:, 'Class']

In [None]:
# Pie chart of the class proportions in the test set.
# value_counts(normalize=True) yields fractions indexed by class label.
percentage = test_y.value_counts(normalize=True)
plt.pie(
    x=percentage,
    labels=percentage.index,
    autopct='%.1f%%',
)
plt.title('Test Set Target Class Ratio')
# Trailing semicolon keeps the notebook from echoing the Legend object.
plt.legend();

## Save

In [None]:
# Columns written to the output files, in this order.
col = [
    'SEM_No',
    'width',
    'duty',
    'design',
    'TARGET',
    'CD',
    'Class',
]

# Write both splits without the row index.
train_set.loc[:, col].to_csv(path_or_buf='dataset/dot_train.csv', index=False)
test_set.loc[:, col].to_csv(path_or_buf='dataset/dot_test.csv', index=False)