 # Audio Transcript Analysis

## Install Necessary Packages

In [15]:
!pip install --quiet jiwer levenshtein torch transformers scikit-learn

You should consider upgrading via the '/Users/shilpakjose/Desktop/manu/OpenSource/AudioTranscriptAnalysis/venv/bin/python -m pip install --upgrade pip' command.[0m


## Import Necessary Packages

In [39]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import jaccard_score
import Levenshtein
import jiwer

## Set up a helper function to create string vector based on Bert Embedding

In [24]:
def get_bert_embedding(sentence, tokenizer, model):
    """
    Generate BERT embeddings for a sentence.
    """
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the CLS token's embedding as the sentence embedding
    return  outputs.last_hidden_state[:, 0, :].squeeze().numpy()

def generate_binary_vec(cls_emb, threshold=0):
    return (cls_emb > threshold).astype(int)

    

## Initialise Model and Tokenizer

In [26]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

## Calculate Cosine Similarity

In [29]:
# Input strings
string1 = "Artificial intelligence is transforming the world."
string2 = "The world is being transformed by artificial intelligence."

# Generate embeddings
embedding1 = get_bert_embedding(string1, tokenizer, model)
embedding2 = get_bert_embedding(string2, tokenizer, model)

# Calculate JCosine Similarity
cos_sim = cosine_similarity([embedding1], [embedding2])
print(f"Cosine Similarity: {cos_sim[0][0]}")

Cosine Similarity: 0.8974617719650269


## Calculate Jaccard Score

In [30]:
# Input strings
string1 = "Artificial intelligence is transforming the world."
string2 = "The world is being transformed by artificial intelligence."

# Generate embeddings
embedding1 = get_bert_embedding(string1, tokenizer, model)
embedding2 = get_bert_embedding(string2, tokenizer, model)

# Calculate Jaccard Index
bin_embedding1 = generate_binary_vec(embedding1)
bin_embedding2 = generate_binary_vec(embedding2)
jaccard = jaccard_score(bin_embedding1, bin_embedding2)
print(f"Jaccard Index: {jaccard}")

Jaccard Index: 0.7207207207207207


## Calculate Levenshtein Distance

In [34]:
# Input strings
string1 = "Artificial intelligence is transforming the world."
string2 = "The world is being transformed by artificial intelligence."

# Calculate Levenshtein
lev_distance = Levenshtein.distance(string1, string2)
print(f"Levenshtein Distance: {lev_distance}")

Levenshtein Distance: 47


## Calculate Euclidean Distance

In [37]:
# Input strings
string1 = "Artificial intelligence is transforming the world."
string2 = "The world is being transformed by artificial intelligence."

# Generate embeddings
embedding1 = get_bert_embedding(string1, tokenizer, model)
embedding2 = get_bert_embedding(string2, tokenizer, model)

# Calculate JCosine Similarity
ed = euclidean_distances([embedding1], [embedding2])
print(f"Euclidean Distance: {ed[0][0]}")

Euclidean Distance: 7.1788649559021


## Calculate Word Error Rate

In [41]:
# Input strings
string1 = "Artificial intelligence is transforming the world."
string2 = "The world is being transformed by artificial intelligence."

# Generate embeddings
wer_val = jiwer.wer(string1, string2)
print(f"WER: {wer_val}")

WER: 1.1666666666666667


## Calculate Character Error Rate

In [42]:
# Input strings
string1 = "Artificial intelligence is transforming the world."
string2 = "The world is being transformed by artificial intelligence."

# Generate embeddings
cer_val = jiwer.cer(string1, string2)
print(f"CER: {wer_val}")

CER: 1.1666666666666667
