In [1]:
# -*- coding: utf-8 -*-
import pickle
import pandas as pd
import numpy as np

import torch

from sklearn.model_selection import StratifiedKFold

from collections import defaultdict
import re

In [2]:
# Load the pickled `label_type` object; preprocessing_dataset indexes it by
# relation class name to obtain the label stored for each row.
# NOTE(review): pickle.load can execute arbitrary code - only load this file
# from a trusted location.
with open("/opt/ml/input/data/label_type.pkl", 'rb') as f:
    label_type = pickle.load(f) 

In [1]:
# Show the loaded label mapping as the cell output for a quick visual check.
label_type

# Train data 분포도 분석

In [4]:
class RE_Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    Args:
        tokenized_dataset: Mapping from feature name to an indexable
            collection (tensor or nested list) whose first axis enumerates
            the examples.
        labels: Indexable collection holding one label per example; its
            length defines the length of the dataset.
    """

    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels

    @staticmethod
    def _as_tensor_copy(value):
        """Return `value` as a fresh tensor that does not alias its source."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a UserWarning on every call;
            # clone().detach() is the recommended equivalent copy.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of per-feature tensors for example `idx`, plus 'labels'."""
        item = {
            key: self._as_tensor_copy(val[idx])
            for key, val in self.tokenized_dataset.items()
        }
        item['labels'] = self._as_tensor_copy(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

In [5]:
def preprocessing_dataset(dataset, label_type):
    """Turn the raw, positionally indexed frame into a named-column frame.

    Args:
        dataset: Frame whose columns 0..8 hold, in order: id, sentence,
            entity 1 (text, start, end), entity 2 (text, start, end) and the
            relation class name.
        label_type: Mapping from relation class name to its label.

    Returns:
        A new DataFrame with descriptive column names plus a 'label' column.
        Rows whose class is 'blind' receive the placeholder label 100; every
        other class is looked up in `label_type`.
    """
    encoded_labels = [
        100 if relation == 'blind' else label_type[relation]
        for relation in dataset[8]
    ]

    # Position in this tuple == source column number in `dataset`.
    column_names = (
        'id',
        'sentence',
        'entity_01',
        'entity_01_start',
        'entity_01_end',
        'entity_02',
        'entity_02_start',
        'entity_02_end',
        'class',
    )
    frame_data = {
        name: dataset[position] for position, name in enumerate(column_names)
    }
    frame_data['label'] = encoded_labels
    return pd.DataFrame(frame_data)

In [6]:
def load_data(dataset_dir, label_type_path='/opt/ml/input/data/label_type.pkl'):
    """Read a headerless TSV dataset and convert it with preprocessing_dataset.

    Args:
        dataset_dir: Path of the tab-separated file to read (no header row).
        label_type_path: Path of the pickled class-name -> label mapping.
            Defaults to the location this notebook has always used, so
            existing single-argument calls behave as before.

    Returns:
        The DataFrame produced by preprocessing_dataset.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # point `label_type_path` at a file from a trusted source.
    with open(label_type_path, 'rb') as f:
        label_type = pickle.load(f)

    # The raw file has no header, so columns come back numbered 0..N-1, which
    # is the layout preprocessing_dataset indexes into.
    dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)

    return preprocessing_dataset(dataset, label_type)

In [7]:
def _mark_entity_spans(sentence, e1s, e1e, e2s, e2e):
    """Wrap both entity spans of `sentence` in '[ENT]' ... '[/ENT]' markers.

    Start/end offsets are character positions with an inclusive end. The span
    with the smaller start offset is wrapped first; on a tie the second
    entity's span is treated as the earlier one.
    """
    if e1s < e2s:
        first_start, first_end, second_start, second_end = e1s, e1e, e2s, e2e
    else:
        first_start, first_end, second_start, second_end = e2s, e2e, e1s, e1e

    return (
        sentence[:first_start]
        + '[ENT]' + sentence[first_start:first_end + 1] + '[/ENT]'
        + sentence[first_end + 1:second_start]
        + '[ENT]' + sentence[second_start:second_end + 1] + '[/ENT]'
        + sentence[second_end + 1:]
    )


def tokenized_dataset(dataset, tokenizer, ent_token = False, max_length = 100):
    """Tokenize "entity1[SEP]entity2" paired with each sentence.

    Args:
        dataset: Column-indexable collection providing 'entity_01',
            'entity_02', their '*_start' / '*_end' offsets and 'sentence'.
        tokenizer: Callable taking two lists of strings plus the keyword
            arguments used below (presumably a Hugging Face tokenizer -
            confirm against the caller).
        ent_token: When true, both entity spans inside each sentence are
            wrapped in '[ENT]' / '[/ENT]' before tokenizing; otherwise the
            sentences are passed through unchanged.
        max_length: Truncation length forwarded to the tokenizer. Defaults to
            100, the value that was previously hard-coded.

    Returns:
        Whatever `tokenizer` returns for the (entity pair, sentence) batches.
    """
    concat_entity = []
    concat_sentence = []
    for e01, e02, e1s, e1e, e2s, e2e, sentence in zip(
        dataset['entity_01'],
        dataset['entity_02'],
        dataset['entity_01_start'],
        dataset['entity_01_end'],
        dataset['entity_02_start'],
        dataset['entity_02_end'],
        dataset['sentence'],
    ):
        concat_entity.append(e01 + '[SEP]' + e02)
        if ent_token:
            concat_sentence.append(_mark_entity_spans(sentence, e1s, e1e, e2s, e2e))

    # Without entity markers the raw sentences form the second segment.
    if not ent_token:
        concat_sentence = list(dataset['sentence'])

    return tokenizer(
        concat_entity,
        concat_sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
        add_special_tokens=True,
    )

In [11]:
# Read the augmented training TSV; it has no header row, so the columns are
# addressed by position (0, 1, 2, ...) in the cells that follow.
df = pd.read_csv(
    "/opt/ml/input/data/train/EDA/aug_EDA_train.tsv",
    header=None,
    delimiter='\t',
)

In [24]:
# `math` is no longer needed by this cell; the import is retained in case
# other cells rely on it.
import math

# Position of the first row whose column 4 is missing. Rows before this
# position are kept out of the stratified split below.
# isna() also copes with non-float cells (math.isnan raises TypeError on
# strings), and the search is vectorized instead of a Python loop.
missing_in_col4 = df[4].isna().to_numpy()

# As before, fall back to 0 when no value in column 4 is missing.
start_idx = int(missing_in_col4.argmax()) if missing_in_col4.any() else 0

In [2]:
# Show the row position found above (first row with a missing value in column 4).
start_idx

In [55]:
# 10-fold stratified splitter; only its first fold is consumed further down.
skf = StratifiedKFold(n_splits=10)

# Rows from `start_idx` onward: column 0 supplies the samples and column 3
# the values used for stratification.
df_sentence = list(df[0][start_idx:])
df_labels = list(df[3][start_idx:])

# Array forms handed to skf.split (the plain label list stays in `df_labels`).
df_sentence = np.array(df_sentence)
df_label = np.array(df_labels)

In [3]:
# Sanity check: number of samples that will go through the stratified split.
print(len(df_sentence))

In [4]:
# Sanity check: number of stratification labels (should match the sample count).
print(len(df_label))

In [5]:
# Peek at the first sample, i.e. column 0 of the row at position `start_idx`.
df_sentence[0]

In [6]:
# Peek at the first stratification label, i.e. column 3 of the row at position `start_idx`.
df_label[0]

In [7]:
# Use only the first of the stratified folds. The positions returned are
# relative to df_sentence / df_label, not to the full `df`.
train_index, test_index = next(skf.split(df_sentence, df_label), (None, None))

df_train_index = train_index
df_test_index = test_index

In [10]:
print(len(df_train_index))
print(len(df_test_index))

# Translate fold positions (relative to the rows from `start_idx` onward) into
# row positions of the full `df`.
# The shifted arrays are derived from the raw fold output `train_index` /
# `test_index` left bound by the split cell, instead of being incremented in
# place, so re-running this cell no longer shifts the indices a second time.
df_train_index = train_index + start_idx
df_test_index = test_index + start_idx

In [62]:
# Training portion: every row before `start_idx`, followed by the rows the
# first fold assigned to training.
df_train_split = pd.concat(
    [
        df.iloc[:start_idx],
        df.iloc[df_train_index],
    ]
)

# Held-out portion: the rows the first fold assigned to testing.
df_test_split = df.iloc[df_test_index]

In [63]:
# Write the training portion as a tab-separated file without index or header.
df_train_split.to_csv(
    '/opt/ml/input/data/train/EDA/aug_train_EDA(train).tsv',
    sep="\t",
    header=None,
    index=False,
)

In [8]:
# Display the training portion that was just written to disk.
df_train_split 

In [65]:
# Write the held-out portion as a tab-separated file without index or header.
# NOTE(review): this file name has no `_EDA` infix, unlike the train file
# 'aug_train_EDA(train).tsv' - confirm the mismatch is intended.
df_test_split.to_csv(
    '/opt/ml/input/data/train/EDA/aug_train(test).tsv',
    sep="\t",
    header=None,
    index=False,
)

In [9]:
# Display the held-out portion that was just written to disk.
df_test_split