# Automatic Speech Recognition

In [1]:
# Mount Google Drive at /content/drive; later cells read from and write to paths under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Installing the required libraries

In [2]:
# Install third-party packages into the runtime (shell escapes, one per package).
# NOTE(review): gensim is not imported in any of the visible cells - confirm it is still needed.
!pip install pydub
!pip install SpeechRecognition
!pip install transformers
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Importing the libraries and the audio file from the directory

In [3]:
from pydub import AudioSegment

In [4]:
# Location of the recorded sales call on the mounted Drive
audio_path = '/content/drive/MyDrive/Datasets/Dimensionless/sales_call_telephone_marketers.wav'

# Decode the WAV file into a pydub AudioSegment
sales_call_audio = AudioSegment.from_wav(file=audio_path)

## Performing ASR using the SpeechRecognition module (Task-1)

In [5]:
from pydub.silence import split_on_silence

import os

# Minimum duration of silence, in milliseconds, that triggers a split.
# The variable name is kept for compatibility, but the value is passed as
# `min_silence_len`; it is NOT a minimum segment length.
min_segment_length = 1000

# Audio quieter than this level (in dBFS) is treated as silence
min_silence_threshold = -39

# Split the call into one segment per stretch of speech between silences
segments = split_on_silence(
    sales_call_audio,
    min_silence_len=min_segment_length,
    silence_thresh=min_silence_threshold,
)

# Make sure the target folder exists so the export below does not fail
# on a fresh Drive.
output_dir = "/content/drive/MyDrive/Datasets/Dimensionless/Alternate/"
os.makedirs(output_dir, exist_ok=True)

# Save every segment as line_1.wav, line_2.wav, ... (1-based numbering)
for i, segment in enumerate(segments):
    output_file = output_dir + f'line_{i+1}.wav'
    segment.export(output_file, format='wav')


In [6]:
import speech_recognition as sr
# Recognizer instance used by the transcription cell to read and transcribe each segment file.
recognizer = sr.Recognizer()

In [7]:
from pydub.effects import normalize

# Folder that holds the per-segment WAV files written by the splitting cell
segments_dir = "/content/drive/MyDrive/Datasets/Dimensionless/Alternate"


def _recognize_wav(wav_path):
    """Return the Google Web Speech transcription of the WAV file at `wav_path`.

    Raises whatever `recognizer.recognize_google` raises, e.g.
    `sr.UnknownValueError` when the speech is unintelligible or
    `sr.RequestError` when the service cannot be reached.
    """
    with sr.AudioFile(wav_path) as source:
        return recognizer.recognize_google(audio_data=recognizer.record(source))


transcript = []
# Cover every segment produced by the split instead of a hard-coded count.
for i in range(1, len(segments) + 1):
    raw_path = f"{segments_dir}/line_{i}.wav"
    try:
        text = _recognize_wav(raw_path)
    except (sr.UnknownValueError, sr.RequestError):
        # Recognition failed: retry once on a volume-normalized copy of the
        # segment. Only recognizer failures are caught, so interrupts and
        # missing files are no longer silently swallowed. A failure of the
        # retry itself still propagates.
        normalized_path = f"{segments_dir}/line{i}.wav"
        normalize(AudioSegment.from_file(raw_path)).export(normalized_path, format='wav')
        text = _recognize_wav(normalized_path)

    transcript.append(text)

In [8]:
# Display the list of per-segment transcriptions
transcript

['hello hi Nancy this is Mike from AT&T incorpor',
 'yes how can I help',
 'Nancy you have been using our prepaid connection for a couple of years now right',
 "yeah that's",
 'how would you like a postpaid connection that allows you to make free unlimited voice calls to three AT&T numbers',
 "I would love that but what's the",
 "there's no catch there will be a monthly rental which you will have to pay like any other post-paid",
 'fantastic sign']

In [9]:
# Stitch the per-segment transcriptions into a single string, using '. '
# between consecutive segments, then display it.
task_1_output = '. '.join(line for line in transcript)
task_1_output

"hello hi Nancy this is Mike from AT&T incorpor. yes how can I help. Nancy you have been using our prepaid connection for a couple of years now right. yeah that's. how would you like a postpaid connection that allows you to make free unlimited voice calls to three AT&T numbers. I would love that but what's the. there's no catch there will be a monthly rental which you will have to pay like any other post-paid. fantastic sign"

## Performing Entity and Intent Extraction (Task-2)

### Creating a Dataset of custom intents 

In [10]:
import torch
# Hand-labelled (sentence, intent) pairs used to fine-tune the intent classifier.
training_examples = [
    ('My name is Jeff and I am calling from Amazon', 'intro'),
    ("What's the time", 'ask'),
    ('This is a call regarding your Google Cloud Platform account.', 'purpose'),
    ("Amazing, count me in", 'positive'),
    ('I am calling about your Microsoft Azure subscription.', 'purpose'),
    ('I am calling from Tesla and my name is Elon.', 'intro'),
    ('I wanted to talk to you about your Spotify plan.', 'purpose'),
    ("Have you been satisfied with our subscription", 'ask'),
    ("The connection allows you to use your internet  unlimited for the full month", 'inform'),
    ('I am calling from Microsoft and my name is Satya.', 'intro'),
    ('I would like to talk about your Amazon Web Services account.', 'purpose'),
    ("Wow, That's great", 'positive'),
    ("How do you like your steak", 'ask'),
    ('I am Sundar and this is call from Google.', 'intro'),
    ("Your cloud account has an option of an upgrade", 'inform'),
    ("there will be a monthly rental which you will have to pay like any other post-paid", 'inform'),
]

# Separate the sentences from their labels (both come back as tuples)
texts, labels = zip(*training_examples)

# Sort the distinct labels so the label -> class-id mapping is identical on
# every run; iterating a plain set of strings can change order between
# kernel sessions, which silently reshuffles the class ids.
intent_labels = sorted(set(labels))
num_classes = len(intent_labels)
num_classes

5

### Downloading the model and its tokenizer

In [11]:
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
# Tokenizer matching the 'bert-base-uncased' checkpoint
intent_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Pretrained BERT encoder with a classification head sized to the
# number of intent classes
intent_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_classes,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### Tokenizing the input texts and building a PyTorch dataset for the model

In [12]:
# Encode all training sentences as one padded batch of PyTorch tensors
inputs = intent_tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt',
)

# Map every label string to its position in intent_labels
label_to_id = {label: idx for idx, label in enumerate(intent_labels)}
classes = torch.tensor([label_to_id[label] for label in labels])

# Bundle token ids, attention masks and class ids into one dataset
dataset = torch.utils.data.TensorDataset(
    inputs['input_ids'],
    inputs['attention_mask'],
    classes,
)

### Training the model

In [13]:
# Fine-tune the intent classifier on the hand-labelled examples.
#
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# weight_decay=0.0 is passed explicitly because the PyTorch default (0.01)
# differs from the default of the transformers implementation used before.
optimizer = torch.optim.AdamW(intent_model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)

# Training mode (dropout enabled); switch to eval mode before inference
intent_model.train()
for epoch in range(3):
    # Unpack into `batch_labels` rather than `classes` so the notebook-level
    # `classes` tensor built in the dataset cell is not overwritten.
    for input_ids, attention_mask, batch_labels in loader:
        optimizer.zero_grad()
        outputs = intent_model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        # With labels supplied, the model output carries the loss
        loss = outputs.loss
        loss.backward()
        optimizer.step()



In [14]:
import spacy
nlp = spacy.load('en_core_web_sm')

# The training cell leaves the model in train mode; switch to eval mode so
# dropout is disabled and the predicted intents are deterministic.
intent_model.eval()

task_3_output = []
for text in transcript:
    # Extract the entities as {label: text}.
    # NOTE(review): keyed by label, so when a sentence has several entities
    # with the same label only the last one is kept. Left as-is to preserve
    # the output schema - confirm whether that is intended.
    entities_by_label = {ent.label_: ent.text for ent in nlp(text).ents}

    # Predict the intent; no gradients are needed for inference
    intent_input = intent_tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')
    with torch.no_grad():
        intent_logits = intent_model(**intent_input).logits
    intent_id = torch.argmax(intent_logits, dim=1).item()

    # One record per transcript line; 'entities' stays a one-element list
    # wrapping the label -> text mapping.
    task_3_output.append({
        'sentence': text,
        'intent': intent_labels[intent_id],
        'entities': [entities_by_label],
    })

In [15]:
# Display the per-sentence intent and entity records
task_3_output

[{'sentence': 'hello hi Nancy this is Mike from AT&T incorpor',
  'intent': 'intro',
  'entities': [{'PERSON': 'Mike', 'ORG': 'AT&T'}]},
 {'sentence': 'yes how can I help', 'intent': 'ask', 'entities': [{}]},
 {'sentence': 'Nancy you have been using our prepaid connection for a couple of years now right',
  'intent': 'intro',
  'entities': [{'PERSON': 'Nancy', 'DATE': 'a couple of years'}]},
 {'sentence': "yeah that's", 'intent': 'intro', 'entities': [{}]},
 {'sentence': 'how would you like a postpaid connection that allows you to make free unlimited voice calls to three AT&T numbers',
  'intent': 'ask',
  'entities': [{'CARDINAL': 'three', 'ORG': 'AT&T'}]},
 {'sentence': "I would love that but what's the",
  'intent': 'inform',
  'entities': [{}]},
 {'sentence': "there's no catch there will be a monthly rental which you will have to pay like any other post-paid",
  'intent': 'intro',
  'entities': [{'DATE': 'monthly'}]},
 {'sentence': 'fantastic sign', 'intent': 'purpose', 'entities':

## Generating the JSON File (Task-3)

In [16]:
import json

# Collect the results of the earlier tasks under their output keys
final_output = {
    'task_1_output': task_1_output,
    'task_3_output': task_3_output,
}

# Serialize the combined results to a JSON file on Drive
with open('/content/drive/MyDrive/Datasets/Dimensionless/final_output.json', 'w') as json_file:
    json.dump(final_output, json_file)

## Generating the summary report (Task-4)

In [17]:
# Install sumy, which supplies the parser, tokenizer and Luhn summarizer imported in the next cell.
!pip install sumy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.nlp.tokenizers import Tokenizer
from sumy.utils import get_stop_words

# Number of sentences to keep in the extractive summary
SUMMARY_SENTENCE_COUNT = 4

# Luhn extractive summarizer; English stop words are ignored when the
# summarizer decides which words are significant.
# The former `summarizer.reduction_ratio = 0.5` line was dropped: it set an
# attribute the summarizer does not read, so it had no effect on the result.
summarizer = LuhnSummarizer()
summarizer.stop_words = get_stop_words('english')

# Parse the full transcript into a sumy document using the English tokenizer
parser = PlaintextParser.from_string(task_1_output, Tokenizer('english'))

# Pick the top-rated sentences for the summary
summary = summarizer(parser.document, SUMMARY_SENTENCE_COUNT)

# Print the summary, one sentence per line
for sentence in summary:
    print(sentence)

hello hi Nancy this is Mike from AT&T incorpor.
yes how can I help.
Nancy you have been using our prepaid connection for a couple of years now right.
yeah that's.
