In [150]:
import json

with open('./chall_ml4sci_1.0.json') as file:
    data = json.load(file)

for record in data["records"]:
    for word in record["words"]:
        sttWordString = word["stt_word"]
        label = 1 if "@g" in word["human_word"] else 0
        word["stt_word"] = {sttWordString: label}

# Print or process the modified data
for record in data["records"]:
    print(record)


{'human_transcript': 'hello my name is casper what is your name', 'stt_transcript': 'how my name is casper what is your name', 'words': [{'human_word': 'hello', 'stt_word': {'how': 0}}, {'human_word': 'my', 'stt_word': {'my': 0}}, {'human_word': 'name', 'stt_word': {'name': 0}}, {'human_word': 'is', 'stt_word': {'is': 0}}, {'human_word': 'casper', 'stt_word': {'casper': 0}}, {'human_word': 'what', 'stt_word': {'what': 0}}, {'human_word': 'is', 'stt_word': {'is': 0}}, {'human_word': 'your', 'stt_word': {'your': 0}}, {'human_word': 'name', 'stt_word': {'name': 0}}]}
{'human_transcript': 'hello my name is pi pikachu', 'stt_transcript': 'lo i name is  picacho', 'words': [{'human_word': 'hello', 'stt_word': {'lo': 0}}, {'human_word': 'my', 'stt_word': {'i': 0}}, {'human_word': 'name', 'stt_word': {'name': 0}}, {'human_word': 'is', 'stt_word': {'is': 0}}, {'human_word': 'pi', 'stt_word': {'': 0}}, {'human_word': 'pikachu', 'stt_word': {'picacho': 0}}]}
{'human_transcript': 'a very nice name 

## Split the data

In [151]:
total_records = len(data["records"])
train_size = int(0.50 * total_records)  # 50% of the total records
valid_size = int(0.20 * total_records)  # 20% of the total records
# Remaining 30% will be for test

train_data = data["records"][:train_size]
valid_data = data["records"][train_size:train_size + valid_size]
test_data = data["records"][train_size + valid_size:]


In [152]:
from transformers import BertTokenizer, BertModel
import torch

# 1. Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased')




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Transforming words of each sentence into tokens

In [153]:
def process_transcript(record):
    # Tokenize the sentence from one STT transcript
    sentence = record["stt_transcript"]
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    
    # Get embeddings from BERT
    with torch.no_grad():
        outputs = model_bert(**inputs)
    embeddings = outputs.last_hidden_state[0]

    # Iterate through each word in the record and initialize with zero tensor
    for word in record["words"]:
        zero_tensor = torch.zeros(embeddings.shape[1])  # Shape 1 of embeddings is the size of each token embedding
        word["embedding"] = zero_tensor

    # Convert tokenized input IDs back to tokens
    tokenized_text = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Iterate through each token to find matching words
    word_index = 0
    for i, token in enumerate(tokenized_text):
        if token in ["[CLS]", "[SEP]", "[PAD]"]:
            continue

        # Check if the token matches the next word in the STT transcript
        if word_index < len(record["words"]):
            stt_word = list(record["words"][word_index]["stt_word"].keys())[0]
            if token.startswith(stt_word) or stt_word.startswith(token):  # Modify this condition as needed
                record["words"][word_index]["embedding"] = embeddings[i]
                word_index += 1

    return record



In [161]:
# Slicing the first 10 records from train_data
first_record = train_data[:10]
#print(first_record)

processed_first_record = []
for proc in first_record:
    processed_first_record.append(process_transcript(proc))

print(processed_first_record)

[{'human_transcript': 'hello my name is casper what is your name', 'stt_transcript': 'how my name is casper what is your name', 'words': [{'human_word': 'hello', 'stt_word': {'how': 0}, 'embedding': tensor([-2.3495e-01,  1.2772e+00,  4.3811e-04, -7.5532e-02,  4.0551e-01,
        -2.8312e-01,  1.5424e-01,  5.4650e-01,  1.7593e-01, -1.0740e+00,
        -7.5207e-02, -7.0743e-02,  4.4512e-01,  9.1760e-01, -2.4461e-01,
         3.6412e-01,  1.2853e-01,  4.2035e-01,  6.1335e-01,  4.1836e-01,
         4.8903e-01,  9.5272e-01,  2.8364e-01,  1.1263e-01,  7.1641e-02,
        -1.6273e-01, -6.9284e-01, -4.4212e-01,  8.3198e-01, -3.6214e-01,
         2.0518e-01, -3.1297e-01,  8.0007e-01,  1.7058e-01, -5.7679e-01,
        -6.9593e-01, -7.2059e-02, -1.6588e-01, -1.9250e-01,  1.0275e+00,
        -5.2162e-02, -8.5247e-01,  2.5862e-01, -3.8983e-01,  1.0031e+00,
        -3.3594e-01, -6.6188e-01, -4.3080e-01,  7.1443e-01, -9.5657e-01,
        -1.1790e-01, -6.8044e-01, -4.6892e-02,  2.1364e-02,  3.3837e-01

## MLP part

## Binary classifier with flexible parameters

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class FlexibleBinaryClassifier(nn.Module):
    def __init__(self, input_features, hidden_layers, neurons_per_layer):
        super(FlexibleBinaryClassifier, self).__init__()
        

        layers = []
        
        
        layers.append(nn.Linear(input_features, neurons_per_layer))
        layers.append(nn.ReLU())

       
        for _ in range(hidden_layers - 1):
            layers.append(nn.Linear(neurons_per_layer, neurons_per_layer))
            layers.append(nn.ReLU())

        
        layers.append(nn.Linear(neurons_per_layer, 1))
        layers.append(nn.Sigmoid())

       
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)

## Function that trains the model and returns the loss for each epoch

In [None]:
def train_model(model, criterion, optimizer, train_loader, epochs=10):
    model.train()  

    for epoch in range(epochs):
        total_loss = 0

        for inputs, labels in train_loader:
            
            optimizer.zero_grad()

            
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

    return avg_loss


## Create a train loader

In [None]:
from torch.utils.data import Dataset, DataLoader

embeddings = []
labels = []

# Assuming processed_first_record is a list of records
for record in processed_first_record:
    for word in record['words']:
        if 'embedding' in word:
            embeddings.append(word['embedding'])
            labels.append(list(word['stt_word'].values())[0])  # Assuming each stt_word dictionary has one key-value pair

# Convert to tensors
embeddings_tensor = torch.stack(embeddings)
labels_tensor = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)  # Reshape for BCELoss




class CustomDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]


# Convert to tensors
embeddings_tensor = torch.stack(embeddings)
labels_tensor = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)

# Create an instance of the CustomDataset
dataset = CustomDataset(embeddings_tensor, labels_tensor)




# Define batch size
batch_size = 32  # You can adjust this based on your specific needs and hardware capabilities

# Create the DataLoader
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)




## Create a grid to see what are the bests parameters

In [None]:
number_epochs=10
input_features_options = [256, 512, 768]  # Example values
hidden_layers_options = [1, 2, 3]
neurons_per_layer_options = [64, 128, 256]
learning_rate_options = [0.001, 0.01, 0.1]

## Iterate over the grid to find best Parameters, with smallest loss

In [None]:
import itertools


grid = itertools.product(input_features_options, hidden_layers_options, neurons_per_layer_options, learning_rate_options)


parameter_combinations = list(grid)



best_loss = float('inf')
best_params = None

for combination in parameter_combinations:
    input_features, hidden_layers, neurons_per_layer, learning_rate = combination
    
    # Initialize model, criterion, optimizer, and data loader
    model = FlexibleBinaryClassifier(input_features, hidden_layers, neurons_per_layer)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)


    # Train the model
    loss = train_model(model, criterion, optimizer, train_loader, epochs=number_epochs)

    
    if loss < best_loss:
        best_loss = loss
        best_params = combination


print(f"Best Loss: {best_loss}")
print(f"Best Hyperparameters: Input Features: {best_params[0]}, Hidden Layers: {best_params[1]}, Neurons per Layer: {best_params[2]}, Learning Rate: {best_params[3]}")

## Use Best parameters on TEST data

## Dont look after...

## 

In [155]:

# Initialize the model
model = FlexibleBinaryClassifier(input_features=768, hidden_layers=2, neurons_per_layer=128)


# Loss and optimizer,
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

#^^^^^^^^^^^^^^

# Prepare data (embeddings and labels)
embeddings = []
labels = []

# Assuming processed_first_record is a list of records
for record in processed_first_record:
    for word in record['words']:
        if 'embedding' in word:
            embeddings.append(word['embedding'])
            labels.append(list(word['stt_word'].values())[0])  # Assuming each stt_word dictionary has one key-value pair

# Convert to tensors
embeddings_tensor = torch.stack(embeddings)
labels_tensor = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)  # Reshape for BCELoss

#^^^^^^^^^^^^^^^

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    # Forward pass
    outputs = model_binary(embeddings_tensor)
    loss = criterion(outputs, labels_tensor)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 2 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')



Epoch [2/20], Loss: 0.5992600917816162
Epoch [4/20], Loss: 0.3117891848087311
Epoch [6/20], Loss: 0.16402478516101837
Epoch [8/20], Loss: 0.09160909056663513
Epoch [10/20], Loss: 0.05627832189202309
Epoch [12/20], Loss: 0.03814607486128807
Epoch [14/20], Loss: 0.02823498658835888
Epoch [16/20], Loss: 0.022496888414025307
Epoch [18/20], Loss: 0.019002173095941544
Epoch [20/20], Loss: 0.016778459772467613


## Tests

In [157]:
processed_test_records = [process_transcript(record) for record in test_data[:10]]

test_embeddings = []
test_labels = []

for record in processed_test_records:
    for word in record['words']:
        if 'embedding' in word:
            test_embeddings.append(word['embedding'])
            test_labels.append(list(word['stt_word'].values())[0])

test_embeddings_tensor = torch.stack(test_embeddings)
test_labels_tensor = torch.tensor(test_labels, dtype=torch.float32).unsqueeze(1)


In [159]:
model_binary.eval()  # Set the model to evaluation mode

with torch.no_grad():
    test_outputs = model(test_embeddings_tensor)
    test_predictions = (test_outputs > 0.5).float()  # Convert probabilities to binary predictions


In [160]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert tensors to numpy arrays for metric calculation
test_predictions_np = test_predictions.numpy().flatten()
test_labels_np = test_labels_tensor.numpy().flatten()

accuracy = accuracy_score(test_labels_np, test_predictions_np)
precision = precision_score(test_labels_np, test_predictions_np)
recall = recall_score(test_labels_np, test_predictions_np)
f1 = f1_score(test_labels_np, test_predictions_np)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.9024390243902439
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

def get_bert_embeddings(sentence):
    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)

    # Get embeddings from BERT
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state
    return embeddings, inputs


In [None]:
def associate_embeddings_to_words(sentence, embeddings, inputs):
    tokenized_text = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    word_embeddings = []
    for word in sentence.split():
        word_tokens = tokenizer.tokenize(word)
        word_embedding = torch.zeros_like(embeddings[0][0])
        for token in word_tokens:
            idx = tokenized_text.index(token)
            word_embedding += embeddings[0][idx]
        word_embeddings.append(word_embedding / len(word_tokens))
    return word_embeddings


In [None]:
# Assuming `stt_transcripts` is a list of sentences and `labels` is a list of labels for each word
features = []
targets = []

for sentence, label_list in zip(stt_transcripts, labels):
    embeddings, inputs = get_bert_embeddings(sentence)
    word_embeddings = associate_embeddings_to_words(sentence, embeddings, inputs)
    features.extend(word_embeddings)
    targets.extend(label_list)


NameError: name 'stt_transcripts' is not defined

In [None]:
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_size):
        super(MLP, self).__init__()
        # Define layers
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.sigmoid(x)
        return self.fc2(x)

# Instantiate the model
input_size = 768  # Size of BERT embeddings
model = MLP(input_size)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
# Convert features and targets to torch tensors
features_tensor = torch.stack(features)
targets_tensor = torch.tensor(targets, dtype=torch.float32)

for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(features_tensor)
    loss = criterion(outputs.squeeze(), targets_tensor)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch}, Loss: {loss.item()}")


RuntimeError: stack expects a non-empty TensorList