# Training data organization

Library imports.

In [2]:
import json, random, pickle
import pandas as pd
from numpyencoder import NumpyEncoder
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
tqdm.pandas()

These paths can be changed to match the location and/or name of your data.

In [3]:
DATA_DIR = "data"
DATASET = "synthetic_dataset.json"

# Parse the JSON dataset found at DATA_DIR/DATASET into `data`.
dataset_path = f"{DATA_DIR}/{DATASET}"
with open(dataset_path, mode="r", encoding="utf-8") as dataset_file:
    data = json.load(dataset_file)

We organize the data into a pandas DataFrame to facilitate some operations.

In [4]:
# Build a DataFrame from the loaded JSON data and preview its first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,original_words,modified_words,labels
0,"[@@PADDING@@, Source, Wikisource, librodot, co...","[@@PADDING@@, Source, Wikisource, librodot, co...","[$KEEP, $KEEP, $KEEP, $KEEP, $KEEP, $KEEP, $KE..."
1,"[@@PADDING@@, La, familia, Dashwood, llevaba, ...","[@@PADDING@@, La, familia, Dashwoodllevaba, la...","[$KEEP, $KEEP, $KEEP, $SPLIT_8, $KEEP, $KEEP, ..."
2,"[@@PADDING@@, Su, propiedad, era, de, buen, ta...","[@@PADDING@@, Su, propiedad, era, CON, buen, t...","[$KEEP, $KEEP, $KEEP, $KEEP, $REPLACE_de, $KEE..."
3,"[@@PADDING@@, El, último, dueño, de, esta, pro...","[@@PADDING@@, El, último, dueño, De, esta, pro...","[$KEEP, $KEEP, $KEEP, $KEEP, $REPLACE_de, $KEE..."
4,"[@@PADDING@@, Pero, la, muerte, de, ella, ocur...","[@@PADDING@@, Pero, LOS, muerte, de, ocurrida,...","[$KEEP, $KEEP, $REPLACE_la, $KEEP, $APPEND_ell..."


We won't be using the `original_words` column, so we can drop it.

In [5]:
# `original_words` is not used by any later step. Reassigning (instead of
# dropping in place) with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError for the already-removed column.
df = df.drop(columns=['original_words'], errors='ignore')

We add a sentence identifier so that we do not lose track of which sentence each token belongs to, and then place each word with its respective label in its own row. This will make label encoding easier.

In [7]:
# Tag every sentence with its row index so tokens can be regrouped later.
df['sentence_id'] = df.index

# The list-valued columns are exploded together (one token and its label per
# row) and then renamed to their singular, per-token names.
token_column_names = {
    'modified_words': 'word',
    'labels': 'label',
}
df_exploded = (
    df
    .explode(list(token_column_names))
    .rename(columns=token_column_names)
)

df_exploded

Unnamed: 0,word,label,sentence_id
0,@@PADDING@@,$KEEP,0
0,Source,$KEEP,0
0,Wikisource,$KEEP,0
0,librodot,$KEEP,0
0,com,$KEEP,0
...,...,...,...
1171188,titulado,$KEEP,1171188
1171188,La,$KEEP,1171188
1171188,lucha,$KEEP,1171188
1171188,por,$KEEP,1171188


With `LabelEncoder` we will transform labels into `ner_tags`, an integer representation.

In [9]:
# Fit the encoder on every token label and store the resulting integer codes
# in a new `ner_tag` column.
le = LabelEncoder()
encoded_labels = le.fit_transform(df_exploded['label'])
df_exploded['ner_tag'] = encoded_labels

# Persist the fitted encoder inside DATA_DIR.
encoder_path = f"{DATA_DIR}/labelencoder.pkl"
with open(encoder_path, "wb") as encoder_file:
    pickle.dump(le, encoder_file)

In [10]:
# Display the exploded frame, which now also carries the `ner_tag` column.
df_exploded

Unnamed: 0,word,label,sentence_id,ner_tag
0,@@PADDING@@,$KEEP,0,1407
0,Source,$KEEP,0,1407
0,Wikisource,$KEEP,0,1407
0,librodot,$KEEP,0,1407
0,com,$KEEP,0,1407
...,...,...,...,...
1171188,titulado,$KEEP,1171188,1407
1171188,La,$KEEP,1171188,1407
1171188,lucha,$KEEP,1171188,1407
1171188,por,$KEEP,1171188,1407


Finally we put the sentences back together.

In [11]:
# Collapse the per-token rows back into one row per sentence: words and
# integer tags are gathered into lists, renamed to their plural forms, and
# the `sentence_id` grouping key is discarded in favour of a fresh index.
df_grouped = (
    df_exploded
    .groupby('sentence_id')
    .agg({'word': list, 'ner_tag': list})
    .rename(columns={'word': 'modified_words', 'ner_tag': 'ner_tags'})
    .reset_index(drop=True)
)

df_grouped

Unnamed: 0,modified_words,ner_tags
0,"[@@PADDING@@, Source, Wikisource, librodot, co...","[1407, 1407, 1407, 1407, 1407, 1407, 1407, 498..."
1,"[@@PADDING@@, La, familia, Dashwoodllevaba, la...","[1407, 1407, 1407, 4998, 1407, 1407, 997, 1407]"
2,"[@@PADDING@@, Su, propiedad, era, CON, buen, t...","[1407, 1407, 1407, 1407, 2383, 1407, 1407, 140..."
3,"[@@PADDING@@, El, último, dueño, De, esta, pro...","[1407, 1407, 1407, 1407, 2383, 1407, 4999, 140..."
4,"[@@PADDING@@, Pero, LOS, muerte, de, ocurrida,...","[1407, 1407, 3344, 1407, 991, 1407, 1407, 1407..."
...,...,...
1171184,"[@@PADDING@@, Esta, mío, entrevista, se, reali...","[1407, 1407, 1406, 1407, 1407, 1407, 2670, 140..."
1171185,"[@@PADDING@@, DIGITO, DIGITO]","[1407, 1407, 1407]"
1171186,"[@@PADDING@@, conmotivo, de, conferencia, esta...","[1407, 4993, 1105, 1407, 1406, 1407, 997, 1407..."
1171187,"[@@PADDING@@, En, Tu, primer, libro, hablaba, ...","[1407, 1407, 4533, 1407, 1345, 1407, 1407, 140..."


Now we shuffle the data randomly and separate it into training, validation, and testing in 70-20-10 proportions.

In [12]:
# Convert the grouped frame into a list of dicts, one record per row.
data_transformed = df_grouped.to_dict(orient='records')

In [13]:
# Fixed seed so the train/validation/test split is identical on every run.
SPLIT_SEED = 42

# Shuffle a copy with a dedicated, seeded generator: the split is reproducible
# and re-running this cell does not reshuffle an already shuffled list.
shuffled_records = list(data_transformed)
random.Random(SPLIT_SEED).shuffle(shuffled_records)

# 70 % training and 20 % validation; the remainder (~10 %) goes to testing,
# so no record is lost to integer truncation.
train_n = int(0.7 * len(shuffled_records))
val_n = int(0.2 * len(shuffled_records))

training = shuffled_records[:train_n]
validation = shuffled_records[train_n:train_n + val_n]
testing = shuffled_records[train_n + val_n:]

# Sanity check: the three splits partition the dataset.
assert len(training) + len(validation) + len(testing) == len(shuffled_records)

In [14]:
# Write each split to its own JSON file inside DATA_DIR, passing NumpyEncoder
# as the JSON encoder class for every dump.
split_files = (
    ("train_data.json", training),
    ("val_data.json", validation),
    ("test_data.json", testing),
)
for file_name, records in split_files:
    with open(f"{DATA_DIR}/{file_name}", "w") as output_data:
        json.dump(records, output_data, cls=NumpyEncoder)