## Stage 4: Data Preprocessing

### 1. Objective
We prepare the data to be fed to the training stage.

### 2. Approach
Using the TensorFlow library, we preprocess the dataset.

### 3. Implementation

In [2]:
import os
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf

from src.features.padding import padding_func

As mentioned in the previous stage, we use the 'Observaciones' column for the sentences and the 'Especialidad' column for the labels.

In [3]:
# Load dataset
# The CSV location is resolved relative to the kernel's working directory, and
# only the two columns used in this stage are read: 'Observaciones' (text)
# and 'Especialidad' (label).
current_dir = os.getcwd()
dataset_path = os.path.join(current_dir, '..', 'data/raw', 'close.csv')
data_raw = pd.read_csv(
    dataset_path,
    low_memory=False,
    usecols=['Observaciones', 'Especialidad'],
)
# Basic statistics
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75866 entries, 0 to 75865
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Observaciones  75866 non-null  object
 1   Especialidad   75866 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB
None


Labels with at least 300 entries are kept as they are; the rest are grouped under 'OTHERS'.

In [4]:
# Labels with at least MIN_LABEL_COUNT rows keep their own class; every other
# label is merged into the single 'OTHERS' class.
MIN_LABEL_COUNT = 300
especialidad_counts = data_raw['Especialidad'].value_counts()
# Boolean filtering keeps only the frequent labels. Index.where would instead
# leave NaN placeholders for the infrequent ones, and those placeholders would
# make isin() accept rows whose label is missing.
top_label = especialidad_counts[especialidad_counts >= MIN_LABEL_COUNT].index
data_raw['Especialidad_groped'] = data_raw['Especialidad'].where(
    data_raw['Especialidad'].isin(top_label), 'OTHERS'
)
print(data_raw['Especialidad_groped'].value_counts())

Especialidad_groped
8 - MECANICA                       24477
1 - ELECTRICIDAD                   16674
12 - ELECTROMEDICINA                9048
5 - FONTANERIA                      8910
4 - CARPINTERIA                     5318
3 - CALEFACCIÓN Y CLIMATIZACIÓN     3801
19 - APOYO NO ESPECIALIZADO         3089
7 - ALBAÑILERIA                     1704
17 - VARIAS ESPECIALIDADES          1140
6 - PINTURA Y REVESTIMIENTOS         891
OTHERS                               435
13 - RED DE VOZ Y DATOS              379
Name: count, dtype: int64


Split the data into training (80%) and test (20%) sets.

In [5]:
# Random 80/20 split; the fixed random_state makes the split reproducible.
training_data = data_raw.sample(frac=0.8, random_state=42)
# Every row that was not sampled for training becomes test data.
test_data = data_raw.drop(training_data.index)
training_text, training_label = training_data['Observaciones'], training_data['Especialidad_groped']
test_text, test_label = test_data['Observaciones'], test_data['Especialidad_groped']

# Persist each split as its own CSV. The output folder is created first so the
# cell also works on a checkout where data/processed does not exist yet.
processed_dir = os.path.join(current_dir, '..', 'data/processed')
os.makedirs(processed_dir, exist_ok=True)
split_exports = (
    ('training_text.csv', training_text),
    ('test_text.csv', test_text),
    ('training_label.csv', training_label),
    ('test_label.csv', test_label),
)
for export_name, export_series in split_exports:
    export_series.to_csv(os.path.join(processed_dir, export_name), index=False)

print(f"Training data size: {len(training_data)}")
print(f"Test data size: {len(test_data)}")

Training data size: 60693
Test data size: 15173


Check how many entries each label has in the training and test sets.

In [7]:
# Count the rows per label in each split and show both counts side by side.
training_counts = training_label.value_counts()
test_counts = test_label.value_counts()
# fillna(0) covers a label that appears in only one of the two splits.
label_counts = pd.DataFrame({"Training": training_counts, "Test": test_counts}).fillna(0)

print(label_counts)

                                 Training  Test
Especialidad_groped                            
8 - MECANICA                        19607  4870
1 - ELECTRICIDAD                    13332  3342
12 - ELECTROMEDICINA                 7195  1853
5 - FONTANERIA                       7129  1781
4 - CARPINTERIA                      4233  1085
3 - CALEFACCIÓN Y CLIMATIZACIÓN      3039   762
19 - APOYO NO ESPECIALIZADO          2480   609
7 - ALBAÑILERIA                      1370   334
17 - VARIAS ESPECIALIDADES            915   225
6 - PINTURA Y REVESTIMIENTOS          733   158
OTHERS                                356    79
13 - RED DE VOZ Y DATOS               304    75


Let's start vectorization with the training dataset.

In [60]:
# Fit a text vectorizer on the training sentences only, without any cap on the
# vocabulary, to find out how many distinct tokens the corpus contains.
vectorize_layer = tf.keras.layers.TextVectorization(ragged=True)
vectorize_layer.adapt(training_text)
vocabulary = vectorize_layer.get_vocabulary()
# Show the first 10 vocabulary entries next to their integer index
for position, token in enumerate(vocabulary[:10]):
    print(position, token)
print(f'Vocabulary size: {len(vocabulary)}')

0 
1 [UNK]
2 de
3 la
4 en
5 gracias
6 y
7 el
8 del
9 no
Vocabulary size: 22386


The vocabulary contains 22386 distinct entries. To simplify the model and its training, we limit the vocabulary to 20000 entries.

In [61]:
# Rebuild the vectorizer with a capped vocabulary and fit it again on the
# training sentences only.
MAX_TOKENS = 20000
vectorize_layer = tf.keras.layers.TextVectorization(max_tokens=MAX_TOKENS, ragged=True)
vectorize_layer.adapt(training_text)
vocabulary = vectorize_layer.get_vocabulary()
# Confirm that the vocabulary size now matches the cap
print(f'Vocabulary size: {len(vocabulary)}')

Vocabulary size: 20000


In [62]:
# Turn both splits into integer token sequences with the vectorizer fitted on
# the training text, then pass them through the project's padding helper.
train_sequences = padding_func(vectorize_layer(training_text))
test_sequences = padding_func(vectorize_layer(test_text))
# NOTE(review): the recorded output shows padding_func returning a
# tf.data.Dataset, so the element spec is printed under a label that matches
# it, instead of the whole object under the label "shape". Confirm against
# src.features.padding.
print(f'Train sequences element spec: {train_sequences.element_spec}')


Train sequences shape: <_TensorSliceDataset element_spec=TensorSpec(shape=(120,), dtype=tf.int32, name=None)>


Let's vectorize the labels too. One-hot encoding will be useful later for the F1Score metric.

In [None]:
# Preprocess the label dataset: convert string labels to integer indices and one-hot encode

# Indices follow the order in which labels first appear in the training split.
label_names = training_label.unique().tolist()
label_to_index = {name: idx for idx, name in enumerate(label_names)}
# Save the label mapping
label_mapping_path = os.path.join(current_dir, '..', 'data/processed', 'label_mapping.csv')
pd.DataFrame(list(label_to_index.items()), columns=['label', 'index']).to_csv(label_mapping_path, index=False)

# A test label that never occurs in the training split has no index: map()
# would turn it into NaN and the one-hot encoding below would be wrong.
# Fail loudly instead of encoding garbage.
unseen_labels = sorted(set(test_label.unique()) - set(label_to_index), key=str)
if unseen_labels:
    raise ValueError(f"Test labels missing from the training split: {unseen_labels}")

y_train_int = training_label.map(label_to_index).values
y_test_int = test_label.map(label_to_index).values

# One-hot encode the integer labels
y_train = tf.keras.utils.to_categorical(y_train_int, num_classes=len(label_names))
y_test = tf.keras.utils.to_categorical(y_test_int, num_classes=len(label_names))

print(f"Example label mapping: {list(label_to_index.items())}")
print(f"First 5 y_train (one-hot): {y_train[:5]}")

Example label mapping: [('12 - ELECTROMEDICINA', 0), ('1 - ELECTRICIDAD', 1), ('8 - MECANICA', 2), ('3 - CALEFACCIÓN Y CLIMATIZACIÓN', 3), ('7 - ALBAÑILERIA', 4), ('5 - FONTANERIA', 5), ('19 - APOYO NO ESPECIALIZADO', 6), ('4 - CARPINTERIA', 7), ('17 - VARIAS ESPECIALIDADES', 8), ('OTHERS', 9), ('13 - RED DE VOZ Y DATOS', 10), ('6 - PINTURA Y REVESTIMIENTOS', 11)]
First 5 y_train (one-hot): [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


Combine the text vectors with their one-hot labels.

In [64]:
# Wrap the one-hot label arrays in datasets, then pair every sequence with its
# label so each element is a (sequence, label) tuple.
train_label_dataset = tf.data.Dataset.from_tensor_slices(y_train)
test_label_dataset = tf.data.Dataset.from_tensor_slices(y_test)
train_dataset_vectorized = tf.data.Dataset.zip((train_sequences, train_label_dataset))
test_dataset_vectorized = tf.data.Dataset.zip((test_sequences, test_label_dataset))

print(train_dataset_vectorized)

<_ZipDataset element_spec=(TensorSpec(shape=(120,), dtype=tf.int32, name=None), TensorSpec(shape=(12,), dtype=tf.float64, name=None))>


In [65]:
# Input-pipeline settings shared by the training and test datasets
SHUFFLE_BUFFER_SIZE = 1000
PREFETCH_SIZE = tf.data.AUTOTUNE
BATCH_SIZE = 32

# Training pipeline: cache -> shuffle -> batch -> prefetch
train_dataset = train_dataset_vectorized.cache().shuffle(SHUFFLE_BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(PREFETCH_SIZE)

# Test pipeline: same steps without shuffling, so the element order is kept
test_dataset = test_dataset_vectorized.cache().batch(BATCH_SIZE).prefetch(PREFETCH_SIZE)

Save the batched TensorFlow datasets, which contain both the sequences and their labels.

In [66]:
# Target folders for the exported datasets
train_dataset_path = os.path.join(current_dir, '..', 'data/processed', 'train_dataset')
test_dataset_path = os.path.join(current_dir, '..', 'data/processed', 'test_dataset')

# For each split: remove the folder left by a previous run (if any), then save
# the tf.data.Dataset into it. shutil is imported in the notebook's import cell.
dataset_exports = (
    (train_dataset, train_dataset_path),
    (test_dataset, test_dataset_path),
)
for split_dataset, export_path in dataset_exports:
    if os.path.exists(export_path):
        shutil.rmtree(export_path)
    split_dataset.save(export_path)

print("Datasets and label arrays saved successfully.")

Datasets and label arrays saved successfully.


### 4. Results

### 5. Challenges

### 6. Next Steps