# Preparing datasets

In [1]:
# %config InlineBackend.figure_format='retina'
from ekorpkit import eKonf

# Set the ekorpkit log level to INFO.
eKonf.setLogger("INFO")
# Report the installed ekorpkit version and the runtime it detects.
print("version:", eKonf.__version__)
print("is notebook?", eKonf.is_notebook())
print("is colab?", eKonf.is_colab())
# Show the settings exposed by eKonf.env() as a dict.
print("evironment varialbles:")
eKonf.print(eKonf.env().dict())

INFO:ekorpkit.base:IPython version: (6, 9, 0), client: jupyter_client
INFO:ekorpkit.base:Google Colab not detected.


version: 0.1.35+6.g44b548b.dirty
is notebook? True
is colab? False
evironment varialbles:
{'CUDA_DEVICE_ORDER': None,
 'CUDA_VISIBLE_DEVICES': None,
 'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',
 'EKORPKIT_DATA_DIR': None,
 'EKORPKIT_LOG_LEVEL': 'INFO',
 'EKORPKIT_PROJECT': 'ekorpkit-book',
 'EKORPKIT_WORKSPACE_ROOT': '/workspace',
 'KMP_DUPLICATE_LIB_OK': 'TRUE',
 'NUM_WORKERS': 230}


In [2]:
# Directory used later both to save the label-model output and as the dataset builder's data_dir.
data_dir = "../data/esg"

## Fetch the labeled dataset from the labelstudio server

In [116]:
# Compose the config selected by `io/fetcher=labelstudio`.
cfg = eKonf.compose("io/fetcher=labelstudio")
# Name given to the fetched data; the cached parquet path in the log output uses it.
cfg.name = "esg_polarity_labels"
# NOTE(review): presumably the Label Studio project to export from — confirm the id.
cfg.project_id = 2
# Instantiate the fetcher; its `.data` frame is what the following cells work on.
ls = eKonf.instantiate(cfg)

INFO:ekorpkit.io.fetch.base:/workspace/.tmp/fetcher/esg_polarity_labels/esg_polarity_labels.parquet already exists. skipping..
INFO:ekorpkit.io.file:Processing [1] files from ['/workspace/.tmp/fetcher/esg_polarity_labels/esg_polarity_labels.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/.tmp/fetcher/esg_polarity_labels/esg_polarity_labels.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/.tmp/fetcher/esg_polarity_labels/esg_polarity_labels.parquet


In [151]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Copy the fetched frame so the column added below does not touch `ls.data`.
esg_polarity_data = ls.data.copy()

# Encode the string polarity labels as integer class ids in a single pass.
le = preprocessing.LabelEncoder()
esg_polarity_data["classes"] = le.fit_transform(esg_polarity_data.labels)
classes = le.classes_

# Split parameters, kept as names so later cells can reuse them.
test_size, random_state, shuffle = 0.2, 12345, True

df_train, df_test = train_test_split(
    esg_polarity_data,
    test_size=test_size,
    random_state=random_state,
    shuffle=shuffle,
)
print(len(df_train), len(df_test))
df_train.tail()

13936 3485


Unnamed: 0,id,text,annot_id,annotator,origin,labels,classes
13441,20564,"군집분석을 통해 네 개의 군집으로 분류한 결과, '1차 금속 제조업', '고무제품 ...",6833,6,manual,Neutral,1
3613,45534,세아네트웍스 통신부품 판매 및 공장자동화 및 사무자동화 구축 용역을 주 사업부문으로...,5076,5,manual,Neutral,1
10522,23616,"이에 따라, 양로연금의 지급 방법은 기업이 퇴직자에게 직접 지급하던 종래의 지급방식...",10232,6,manual,Neutral,1
1728,51904,"이에 따라 에너지 절약에 대한 정부의 정책적 지원 강화가 예상되는 가운데,제도 점차...",11431,5,manual,Neutral,1
10559,23579,법규 및 제도\n정의화 국회의장이 11월 27일 지정한 세입 예산안 부수 법률안 1...,6456,8,manual,Positive,2


## Writing Labeling Functions

Each crowdworker can be thought of as a single labeling function, as each worker labels a subset of data points, and may have errors or conflicting labels with other workers / labeling functions. Labeling functions will simply return the label the worker submitted for a given text, and abstain if they didn't submit a label for it.

### Crowdworker labeling functions

In [152]:
# Group the annotation rows by the crowdworker who produced them.
labels_by_annotator = esg_polarity_data.groupby("annotator")

# worker id -> {text id: encoded class} for the texts that worker labeled.
worker_dicts = {}
for worker_id in labels_by_annotator.groups:
    worker_df = labels_by_annotator.get_group(worker_id)
    id_class_pairs = zip(worker_df["id"], worker_df["classes"])
    worker_dicts[worker_id] = dict(id_class_pairs)

print("Number of workers:", len(worker_dicts))

Number of workers: 9


In [153]:
from ekorpkit.models.snorkel.labeling import LabelingFunction

# Value a labeling function returns when it has no label for a data point.
ABSTAIN = -1

def worker_lf(x, worker_dict):
    """Return the class this worker recorded for ``x``, or ABSTAIN.

    Args:
        x: a data point exposing an ``id`` attribute.
        worker_dict: mapping from text id to the class the worker submitted.

    Returns:
        The stored class for ``x.id`` if the worker labeled it, else ABSTAIN.
    """
    if x.id in worker_dict:
        return worker_dict[x.id]
    return ABSTAIN


def make_worker_lf(worker_id):
    """Build the labeling function that replays one crowdworker's labels.

    The function is named ``worker_<id>`` and receives that worker's
    ``{text id: class}`` mapping as its ``worker_dict`` resource.
    """
    return LabelingFunction(
        f"worker_{worker_id}",
        f=worker_lf,
        resources={"worker_dict": worker_dicts[worker_id]},
    )


# One labeling function per crowdworker, in the iteration order of worker_dicts.
worker_lfs = list(map(make_worker_lf, worker_dicts))

In [154]:
from ekorpkit.models.snorkel.labeling import PandasLFApplier

# Run every worker labeling function over the train split, the test split and
# the full frame, in that order, keeping one result per frame.
applier = PandasLFApplier(worker_lfs)
L_train, L_test, L_data = (
    applier.apply(frame) for frame in (df_train, df_test, esg_polarity_data)
)

100%|██████████| 13936/13936 [00:00<00:00, 15669.46it/s]
100%|██████████| 3485/3485 [00:00<00:00, 15570.77it/s]
100%|██████████| 17421/17421 [00:01<00:00, 15763.39it/s]


In [155]:
from ekorpkit.models.snorkel.labeling import LFAnalysis

# Encoded class ids of the training rows, passed to lf_summary as the reference labels.
Y_train = df_train.classes.values
# NOTE(review): `.sample(5)` has no random_state, so the rows displayed change between runs.
LFAnalysis(L_train, worker_lfs).lf_summary(Y_train).sample(5)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
worker_8,7,"[0, 1, 2]",0.379879,0.340413,0.00775,5239,55,0.989611
worker_1,0,"[0, 1, 2]",0.000574,0.000359,0.0,8,0,1.0
worker_4,3,"[0, 1, 2]",0.091131,0.077425,0.001005,1262,8,0.993701
worker_7,6,"[0, 1, 2]",0.176521,0.122489,0.003875,2432,28,0.988618
worker_3,2,"[0, 1, 2]",0.083525,0.03064,0.000287,1162,2,0.998282


In [156]:
# Percentage reported by label_coverage() for the training label matrix.
# NOTE(review): the format spec `: 0.1f` carries a space sign flag, hence the double space in the printed line.
print(f"Training set coverage: {100 * LFAnalysis(L_train).label_coverage(): 0.1f}%")

Training set coverage:  100.0%


## Train LabelModel And Generate Probabilistic Labels

In [157]:
from ekorpkit.models.snorkel.labeling.model import LabelModel

# Train LabelModel.
# The number of classes comes from the fitted LabelEncoder (`classes`) rather
# than a literal, so it cannot drift from the label set actually encoded.
label_model = LabelModel(cardinality=len(classes), verbose=False)
# Fixed seed so repeated fits use the same random initialisation.
label_model.fit(L_train, n_epochs=100, seed=12345, log_freq=20, l2=0.1, lr=0.01)

100%|██████████| 100/100 [00:00<00:00, 1010.20epoch/s]


In [158]:
from ekorpkit.models.snorkel.analysis import metric_score

# Encoded class ids of the test rows, the reference for the test accuracy below.
Y_test = df_test.classes.values

# Predictions on the training label matrix, requesting the "abstain" tie-break policy.
preds_train = label_model.predict(L_train, tie_break_policy="abstain")
acc = metric_score(Y_train, preds_train, probs=None, metric="accuracy")
print(f"LabelModel Accuracy for train: {acc:.3f}")

# Same evaluation on the held-out split; `acc` is reused and ends up holding the test score.
preds_test = label_model.predict(L_test, tie_break_policy="abstain")
acc = metric_score(Y_test, preds_test, probs=None, metric="accuracy")
print(f"LabelModel Accuracy for test: {acc:.3f}")

LabelModel Accuracy for train: 0.995
LabelModel Accuracy for test: 0.994


In [159]:
# Predict a class for every annotated row with the trained label model.
esg_polarity_data["snorkel_classes"] = label_model.predict(L_data, tie_break_policy="abstain")
# Drop rows the model abstained on. The explicit copy makes the result an
# independent frame, so the column assignment below does not write to a slice
# of the previous frame (which raises pandas' SettingWithCopyWarning).
esg_polarity_data = esg_polarity_data[esg_polarity_data.snorkel_classes != ABSTAIN].copy()

# Overwrite the annotators' string labels with the label-model classes decoded back to strings.
esg_polarity_data["labels"] = le.inverse_transform(esg_polarity_data["snorkel_classes"])
# Show the rows where the annotator's class differs from the label model's.
esg_polarity_data.query("classes != snorkel_classes")

Unnamed: 0,id,text,annot_id,annotator,origin,labels,classes,snorkel_classes
38,29214,재판부가 선고를 미루며 학부모에게 용서를 빌 기회를 준 것은 이번 사건이 사회적 파...,15298,6,manual,Positive,1,2
55,25177,"삼성정밀화학, 폴리실리콘 공장 건설 시동\n삼성정밀화학과 미국 memc사가 합작으로...",15035,7,manual,Positive,1,2
70,24388,그렇게 하기 위해서는 가령 녹색유통채널을 구축하고 녹색소비자에 대한 소비실적에 대해...,13725,6,manual,Positive,1,2
71,24368,이렇게 중소기업의 범위는 비록 복잡하지만 뚜렷하게 정의되어 있으나 중견기업의 범위는...,13711,6,manual,Positive,1,2
78,24266,사회적기업 육성을 위한 자본시장 지원방안\n취약계층에게 사회서비스 또는 일자리를 제...,14983,8,manual,Positive,0,2
...,...,...,...,...,...,...,...,...
13281,20730,국내 온실가스 관련 환경법률 개정 동향 안 세 환 연구원\n환경부는 배출권거래제 시...,16226,6,manual,Negative,1,0
13315,20696,따라서 현재 정책의 한계와 문제 점을 적극적으로 확인하고 사회 전체적으로 온실가스 ...,218,2,manual,Neutral,0,1
13553,20447,"우리나라는 온실가스 감축 비의무국가중 처음으로'저탄소 녹색성장 기본법'제정하였으며,...",1460,4,manual,Positive,1,2
13559,20441,"환경부는 2011년 12월 환경정보 공개제도 운영규정 을 고시하고, 녹색기업, 공공...",1457,4,manual,Positive,1,2


In [162]:
# Keep only id, text and label, collapse identical rows, and save as parquet.
cols = ["id", "text", "labels"]
esg_polarity_snorkel_data = (
    esg_polarity_data.loc[:, cols]
    .drop_duplicates()
    .reset_index(drop=True)
)
eKonf.save_data(esg_polarity_snorkel_data, "esg_polarity_snorkel_data.parquet", data_dir)

INFO:ekorpkit.io.file:Saving dataframe to ../data/esg/esg_polarity_snorkel_data.parquet


## Build a dataset using the data generated by the label model


In [8]:
# Compose the `dataset=dataset_build` config and point it at the parquet file saved earlier in this notebook.
cfg = eKonf.compose("dataset=dataset_build")
cfg.name = "esg_polarity_kr"
cfg.data_dir = data_dir
cfg.data_file = "esg_polarity_snorkel_data.parquet"
# NOTE(review): presumably forces a rebuild even when built files already exist — confirm.
cfg.force.build = True
# Split settings: stratify on the label column; test_size and dev_size are both 0.2.
cfg.pipeline.split_sampling.stratify_on = "labels"
# NOTE(review): this seed is 123, while the earlier split and LabelModel fit use 12345 — confirm this is intended.
cfg.pipeline.split_sampling.random_state = 123
cfg.pipeline.split_sampling.test_size = 0.2
cfg.pipeline.split_sampling.dev_size = 0.2
cfg.pipeline.reset_index.drop_index = True
cfg.verbose = False
# Instantiate the dataset from the config, then call persist() on it.
esg_polarity_ds = eKonf.instantiate(cfg)
esg_polarity_ds.persist()

INFO:ekorpkit.pipelines.pipe:Applying pipeline: OrderedDict([('load_dataframe', 'load_dataframe'), ('reset_index', 'reset_index'), ('split_sampling', 'split_sampling')])
INFO:ekorpkit.base:Applying pipe: functools.partial(<function load_dataframe at 0x7f553803bf70>)
INFO:ekorpkit.io.file:Processing [1] files from ['esg_polarity_snorkel_data.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['../data/esg/esg_polarity_snorkel_data.parquet']
INFO:ekorpkit.io.file:Loading data from ../data/esg/esg_polarity_snorkel_data.parquet
INFO:ekorpkit.base:Applying pipe: functools.partial(<function reset_index at 0x7f553803b1f0>)
INFO:ekorpkit.base:Applying pipe: functools.partial(<function split_sampling at 0x7f5538032dc0>)
INFO:ekorpkit.io.file:Saving dataframe to ../data/esg/esg_polarity_kr/esg_polarity_kr-train.parquet
INFO:ekorpkit.io.file:Saving dataframe to ../data/esg/esg_polarity_kr/esg_polarity_kr-test.parquet
INFO:ekorpkit.io.file:Saving dataframe to ../data/esg/esg_polarity_kr/esg

apply len_bytes to num_bytes:   0%|          | 0/230 [00:00<?, ?it/s]

INFO:ekorpkit.info.stat: >> elapsed time to calculate statistics: 0:00:00.276040
INFO:ekorpkit.base:Using batcher with minibatch size: 10
INFO:ekorpkit.utils.batch.batcher: backend: joblib  minibatch_size: 10  procs: 230  input_split: False  merge_output: True  len(data): 2179 len(args): 5


apply len_bytes to num_bytes:   0%|          | 0/218 [00:00<?, ?it/s]

INFO:ekorpkit.info.stat: >> elapsed time to calculate statistics: 0:00:00.250046
INFO:ekorpkit.base:Using batcher with minibatch size: 12
INFO:ekorpkit.utils.batch.batcher: backend: joblib  minibatch_size: 12  procs: 230  input_split: False  merge_output: True  len(data): 2724 len(args): 5


apply len_bytes to num_bytes:   0%|          | 0/227 [00:00<?, ?it/s]

INFO:ekorpkit.info.stat: >> elapsed time to calculate statistics: 0:00:00.253649
INFO:ekorpkit.io.file:Saving dataframe to ../data/esg/esg_polarity_kr/esg_polarity_kr-train.parquet
INFO:ekorpkit.io.file:Saving dataframe to ../data/esg/esg_polarity_kr/esg_polarity_kr-dev.parquet
INFO:ekorpkit.io.file:Saving dataframe to ../data/esg/esg_polarity_kr/esg_polarity_kr-test.parquet
