## Prepare annotation data for the full task
* This notebook demonstrates the workflow for preparing input dataframes to upload to AMT when launching annotation tasks.
* Note that the steps shown here are for demonstration purposes only; the actual steps performed during the real annotation phase differed due to adjustments and revisions.

### Imports

In [None]:
import pandas as pd
import os
import random
import pprint
import ujson

# Seed Python's built-in `random` module
random.seed(5)

# Let pandas display up to 100 columns when rendering a dataframe
pd.set_option('display.max_columns', 100)

import emoji

### Functions

In [None]:
def give_emoji_free_text(text):
    """Return ``text`` with every emoji removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which each emoji is replaced by the empty
        string.
    """
    # `emoji.get_emoji_regexp()` was removed in emoji 2.0. Use
    # `emoji.replace_emoji()` when the installed version provides it and
    # fall back to the regex helper on older releases.
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub(r'', text)

### Step 1. Data EDA
* Note that we use a sample of the original GDELT data here for demonstration; the full, unsampled dataset is too big to reside in this repo.

In [None]:
# Load the sampled GDELT data and print an overview of the dataframe
gdelt_csv_path = '../dataframes/sm_gdelt_data.csv'
df = pd.read_csv(gdelt_csv_path)
df.info()

In [None]:
# Display five randomly chosen rows
df.sample(n=5)

### Step 2. Column sampling
* Only keep relevant columns
* Remove NA values

In [None]:
# Keep only the columns used for annotation, then drop rows with a missing value
annotation_columns = ['GLOBALEVENTID', 'text', 'title', 'publish_date']
anno_df = df[annotation_columns].dropna()

### Step 3. Dedupe
* Remove samples that already appear in earlier batches to avoid duplicates
* Here we use the 5th batch as an example; skip this step when preparing the first batch

In [None]:
# Load each earlier batch file and collect the event ids it contains
batch_1_df = pd.read_csv('../dataframes/amt_full_task_batch_1.csv')
batch_1_ids = set(batch_1_df['GLOBALEVENTID'].to_list())

batch_2_df = pd.read_csv('../dataframes/amt_full_task_batch_2.csv')
batch_2_ids = set(batch_2_df['GLOBALEVENTID'].to_list())

batch_3_df = pd.read_csv('../dataframes/amt_full_task_batch_3.csv')
batch_3_ids = set(batch_3_df['GLOBALEVENTID'].to_list())

batch_4_df = pd.read_csv('../dataframes/amt_full_task_batch_4.csv')
batch_4_ids = set(batch_4_df['GLOBALEVENTID'].to_list())

# Every event id that occurs in any of the earlier batches
old_batch_ids = batch_1_ids | batch_2_ids | batch_3_ids | batch_4_ids

In [None]:
# Drop every row whose event id already occurs in an earlier batch
already_used = anno_df['GLOBALEVENTID'].isin(old_batch_ids)
anno_df = anno_df[~already_used]
len(anno_df)

### Step 4. Keyword filtering & stratified sampling
* Here we look for certain samples to balance the following classes: trade unionist, human rights defenders, torture, and kidnapping
* Then apply stratified sampling across all groups

In [None]:
# do keywords filtering before sampling
# 1st filtering: trade unionist
# 2nd filtering: human rights defenders
# 3rd filtering: torture
# 4th filtering: kidnapping

In [None]:
# Rows whose text matches "trade union" (the (?i) flag ignores case)
mentions_trade_union = anno_df['text'].str.contains('(?i)trade union')
anno_df_1 = anno_df[mentions_trade_union]
len(anno_df_1)

In [None]:
# Rows whose text matches "human right" (the (?i) flag ignores case)
mentions_human_right = anno_df['text'].str.contains('(?i)human right')
anno_df_2 = anno_df[mentions_human_right]
len(anno_df_2)

In [None]:
# Rows whose text matches "torture" (the (?i) flag ignores case)
mentions_torture = anno_df['text'].str.contains('(?i)torture')
anno_df_3 = anno_df[mentions_torture]
len(anno_df_3)

In [None]:
# Rows whose text matches "kidnapping" (the (?i) flag ignores case)
mentions_kidnapping = anno_df['text'].str.contains('(?i)kidnapping')
anno_df_4 = anno_df[mentions_kidnapping]
len(anno_df_4)

In [None]:
# sample 2 from each
# we use 2 as example here, change this number for your specific case
# for example, if total HITs = 100, then sample_n = 25
sample_n = 2

# `DataFrame.sample` draws from NumPy's random state, which a call to
# Python's `random.seed(...)` does not control. Pass an explicit seed so
# that re-running this cell on the same data selects the same articles.
sample_seed = 5

# Group 1: trade union
sample_1 = anno_df_1.sample(sample_n, random_state=sample_seed)
sample_1_ids = set(sample_1.GLOBALEVENTID.to_list())

# Group 2: human right. Ids drawn so far are excluded first, so that an
# article matching several keywords is not sampled more than once.
sample_2 = anno_df_2[~(anno_df_2.GLOBALEVENTID.isin(sample_1_ids))].sample(sample_n, random_state=sample_seed)
sample_2_ids = set(sample_2.GLOBALEVENTID.to_list()).union(sample_1_ids)

# Group 3: torture
sample_3 = anno_df_3[~(anno_df_3.GLOBALEVENTID.isin(sample_2_ids))].sample(sample_n, random_state=sample_seed)
sample_3_ids = set(sample_3.GLOBALEVENTID.to_list()).union(sample_2_ids)

# Group 4: kidnapping
sample_4 = anno_df_4[~(anno_df_4.GLOBALEVENTID.isin(sample_3_ids))].sample(sample_n, random_state=sample_seed)

# Stack the four groups into one dataframe with a fresh 0..N-1 index
samples = pd.concat([sample_1, sample_2, sample_3, sample_4], ignore_index=True)
len(samples)

### Step 5. Text Preprocessing
* Removing emoji
* Clean up publish date
* Format news article text for HTML display

In [None]:
import html

# Strip emoji from the article body and from the title
samples['text'] = samples.text.apply(give_emoji_free_text)
samples['title'] = samples.title.apply(give_emoji_free_text)

# Remove the '+00:00' suffix from the publish date when it is present
samples['publish_date'] = samples.publish_date.apply(lambda x: x.replace('+00:00', '') if x and '+00:00' in x else x)

# Wrap every non-blank line of the article in its own <p> element.
# The article text is escaped first: a raw '<', '>' or '&' in the text would
# otherwise be interpreted as markup once the string is placed in an HTML page.
samples['article_interface'] = samples.text.apply(
    lambda x: ' '.join(
        ['<p>' + html.escape(i, quote=False) + '</p>' for i in x.split('\n') if i.strip()]
    )
)

In [None]:
# Tag every row with batch number 5 and keep only the columns that are saved
output_columns = ['GLOBALEVENTID', 'title', 'publish_date', 'article_interface', 'batch_idx']
samples = samples.assign(batch_idx=5)[output_columns]

In [None]:
# Count the distinct formatted articles; with no duplicated articles this
# equals the number of rows in `samples`
distinct_articles = set(samples['article_interface'].to_list())
len(distinct_articles)

In [None]:
# Write the batch to CSV without the dataframe index
batch_csv_path = '../dataframes/amt_full_task_batch_5.csv'
samples.to_csv(batch_csv_path, index=False)