In [None]:
!pip install deepgram-sdk==3.*
!pip install dataclasses typing-extensions verboselogs
!pip install python-dotenv
!pip install num2words
!pip install vaderSentiment

Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Collecting num2words
  Using cached num2words-0.5.13-py3-none-any.whl (143 kB)
Collecting docopt>=0.6.2 (from num2words)
  Using cached docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=3d153150df24f274c4d6f1b0940a61203b2663509f2bdb5c5d90f5adfed2495d
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Installing collected packages: docopt, num2words
Successfully installed docopt-0.6.2 num2words-0.5.13
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━

In [None]:
# Copyright 2023-2024 Deepgram SDK contributors. All Rights Reserved.
# Use of this source code is governed by a MIT license that can be found in the LICENSE file.
# SPDX-License-Identifier: MIT


import os
import httpx
import json
import re
import logging, verboselogs
import pandas as pd
import nltk
from dotenv import load_dotenv
from datetime import datetime
from num2words import num2words
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from deepgram import (
    DeepgramClient,
    DeepgramClientOptions,
    PrerecordedOptions,
    FileSource,
    )


# --- One-time NLTK resource downloads (each call is a no-op if the data is already present) ---
# The list of common words such as "a", "an", etc.
nltk.download('stopwords')
# Download a collection of popular resources from the NLTK library
nltk.download('popular', quiet=True)
# Sentence tokenization (splitting a text into individual sentences)
nltk.download('punkt')
# English vocabulary database
nltk.download('wordnet')
# Used for training language models or evaluating NLP algorithms
nltk.download('brown')
# Used for performing sentiment analysis on text
nltk.download('vader_lexicon')


# --- Shared, module-level NLP helpers used by the functions defined in later cells ---
lemmatizer = WordNetLemmatizer()
# NOTE: SentimentIntensityAnalyzer is imported twice above (nltk, then vaderSentiment);
# the later import wins, so this is the vaderSentiment implementation.
analyzer = SentimentIntensityAnalyzer()
# NOTE(review): stemmer is created here but not used by any cell shown in this notebook.
stemmer = SnowballStemmer("english")
stop_words = stopwords.words("english")


# Load variables from a .env file (if any) into the process environment.
# As the last expression of the cell, its boolean return value is shown as the cell output.
load_dotenv()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


False

In [None]:
def extract_transcript(AUDIO_FILE):
    """Transcribe an audio file with Deepgram and save the response as JSON.

    The full Deepgram response is printed and written to ``<AUDIO_FILE>.json``,
    i.e. right next to the audio file.

    The API key is read from the ``DEEPGRAM_API_KEY`` environment variable
    (set it in the environment or in a ``.env`` file picked up by
    ``load_dotenv()``).

    Args:
        AUDIO_FILE: Path to the audio file to transcribe.

    Returns:
        The path of the JSON file that was written, or ``None`` if any step
        failed (the error is printed instead of being raised).
    """
    try:
        # STEP 1 Create a Deepgram client using the API key in the environment variables.
        # SECURITY: never hard-code the key in the notebook. Any key that was ever
        # committed in source must be considered leaked and should be rotated.
        api_key = os.getenv("DEEPGRAM_API_KEY")
        if not api_key:
            raise ValueError(
                "DEEPGRAM_API_KEY is not set; export it or add it to your .env file."
            )

        config: DeepgramClientOptions = DeepgramClientOptions(
            verbose=logging.SPAM,
        )
        deepgram: DeepgramClient = DeepgramClient(api_key, config)

        # STEP 2 Call the transcribe_file method on the prerecorded class
        with open(AUDIO_FILE, "rb") as file:
            buffer_data = file.read()

        payload: FileSource = {
            "buffer": buffer_data,
        }

        options: PrerecordedOptions = PrerecordedOptions(
            model="nova",
            smart_format=True,
            utterances=True,
            punctuate=True,
            diarize=False,
            numerals=False,
        )

        # Long recordings take a while to process: 300 s general timeout, 10 s to connect.
        response = deepgram.listen.prerecorded.v("1").transcribe_file(
            payload, options, timeout=httpx.Timeout(300.0, connect=10.0)
        )

        # Serialise once and reuse for both the console output and the file.
        response_json = response.to_json(indent=4)
        print(response_json)

        # Store the JSON next to the audio file. AUDIO_FILE already carries its
        # directory, so appending the suffix is enough; joining it with its own
        # dirname would duplicate the directory for relative paths
        # (e.g. "data/a.mp3" -> "data/data/a.mp3.json").
        json_filename = f"{AUDIO_FILE}.json"

        # Write the JSON data to the file in the same directory as the audio file
        with open(json_filename, "w", encoding="utf-8") as json_file:
            json_file.write(response_json)

        print(f"Transcript JSON file '{json_filename}' created successfully.")
        return json_filename

    except Exception as e:
        # Best-effort by design: report the problem and signal failure with None.
        print(f"Exception: {e}")
        return None

In [None]:
# Extracting the transcript from the json file
# and saving it into a list
def json_to_list(transcription_file):
  """Load a saved Deepgram response and return its transcript as a list of fragments.

  Only the first alternative of the first channel is used. The transcript is
  split on every '.' character, so fragments keep their leading whitespace,
  a transcript ending in '.' yields a trailing empty string, and decimals or
  abbreviations are split as well.

  Args:
    transcription_file: Path to the JSON file holding the Deepgram response.

  Returns:
    A list of '.'-separated transcript fragments.
  """
  with open(transcription_file, "r") as handle:
    payload = json.load(handle)

  first_alternative = payload['results']['channels'][0]['alternatives'][0]
  return first_alternative['transcript'].split('.')


In [None]:
def clean(list):
  """Normalise raw transcript fragments into lemmatised, stop-word-free sentences.

  Every fragment is split into sentences; each sentence is lowercased,
  stripped of punctuation and tokenised. Digit tokens are spelled out with
  ``num2words``, English stop words are dropped and the remaining tokens are
  lemmatised. Sentences that end up with no tokens are skipped so that no
  empty strings reach the dataset.

  Args:
    list: Iterable of raw text fragments. The name shadows the built-in
      ``list``; it is kept for backward compatibility with existing callers.

  Returns:
    A list of cleaned sentences (space-joined tokens), in input order.
  """
  # Set membership is O(1); the module-level stop_words is a plain list.
  stop_word_set = set(stop_words)
  cleaned = []
  for fragment in list:
    for sentence in sent_tokenize(fragment):
      # Convert to lowercase, then remove punctuation
      normalized = re.sub(r"[^\w\s]", "", sentence.lower())
      # Spell out numbers, remove stopwords and lemmatize the rest
      new_tokens = []
      for word in word_tokenize(normalized):
        if word.isdecimal():
          # isdecimal() instead of isnumeric(): characters such as "½" or "²"
          # are "numeric" and survive the \w filter, but cannot be parsed as a
          # number and would make num2words raise.
          new_tokens.append(lemmatizer.lemmatize(num2words(word)))
        elif word not in stop_word_set:
          new_tokens.append(lemmatizer.lemmatize(word))
      # Skip sentences that consisted solely of stop words / punctuation
      if new_tokens:
        # Join tokens back into a string
        cleaned.append(" ".join(new_tokens))
  return cleaned

In [None]:
# Use Vader library to get the polarity of the sentence
# Sentences with polarity more than zero are positive, whilst sentences with polarity less than zero are negative
# Sentences with zero polarity are neutral

def getPolarity(text):
    """Classify ``text`` as positive, negative or neutral using VADER.

    Args:
        text: The sentence to score.

    Returns:
        A ``(sentiment, polarity)`` tuple. ``sentiment`` is ``'positive'``,
        ``'negative'`` or ``'neutral'`` depending on whether the compound
        score is above, below or equal to zero; ``polarity`` is the full
        score dict returned by ``polarity_scores``.
    """
    # Reuse the shared module-level analyzer instead of building a new
    # SentimentIntensityAnalyzer (and re-reading its lexicon) on every call.
    polarity = analyzer.polarity_scores(text)
    compound = polarity['compound']
    if compound > 0:
        sentiment = 'positive'  # was misspelled 'postive', which leaked into the exported labels
    elif compound < 0:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    return sentiment, polarity

In [None]:
# Podcast 1 - source video: https://www.youtube.com/watch?v=uvqDTbusdUU

# Absolute path of the downloaded audio file
audio_file1 = "/content/Artificial_intelligence_in_healthcare_opportunities_and_uvqDTbusdUU.mp3"

# Transcribe the audio; yields the path of the saved transcript JSON
# (None if transcription failed, in which case the next step will raise)
json_transcript1 = extract_transcript(audio_file1)

# Split the transcript text into '.'-separated fragments
pod1 = json_to_list(json_transcript1)

# Normalise the fragments (lowercase, strip punctuation, drop stop words, lemmatize)
cleaned1 = clean(pod1)

# Show the cleaned sentences as the cell output
cleaned1

In [None]:
# Podcast 2 - source video: https://www.youtube.com/watch?v=0xSSonMIqBk

# Absolute path of the downloaded audio file
audio_file2 = "/content/How_AI_can_make_health_care_better_0xSSonMIqBk_139.m4a"

# Transcribe the audio; yields the path of the saved transcript JSON
# (None if transcription failed, in which case the next step will raise)
json_transcript2 = extract_transcript(audio_file2)

# Split the transcript text into '.'-separated fragments
pod2 = json_to_list(json_transcript2)

# Normalise the fragments (lowercase, strip punctuation, drop stop words, lemmatize)
cleaned2 = clean(pod2)

# Show the cleaned sentences as the cell output
cleaned2

In [None]:
# Podcast 3 - source video: https://www.youtube.com/watch?v=H3MeGvtiwKc

# Absolute path of the downloaded audio file
audio_file3 = "/content/AI_in_Healthcare_Miracles_You_Won_t_Believe__H3MeGvtiwKc_140.m4a"

# Transcribe the audio; yields the path of the saved transcript JSON
# (None if transcription failed, in which case the next step will raise)
json_transcript3 = extract_transcript(audio_file3)

# Split the transcript text into '.'-separated fragments
pod3 = json_to_list(json_transcript3)

# Normalise the fragments (lowercase, strip punctuation, drop stop words, lemmatize)
cleaned3 = clean(pod3)

# Show the cleaned sentences as the cell output
cleaned3

In [None]:
# Podcast 4 - source video: https://www.youtube.com/watch?v=p92P5x-WfQg

# Absolute path of the downloaded audio file
audio_file4 = "/content/The_Truth_About_AI_and_the_Healthcare_Industry_ft_Cold_p92P5x_WfQg.m4a"

# Transcribe the audio; yields the path of the saved transcript JSON
# (None if transcription failed, in which case the next step will raise)
json_transcript4 = extract_transcript(audio_file4)

# Split the transcript text into '.'-separated fragments
pod4 = json_to_list(json_transcript4)

# Normalise the fragments (lowercase, strip punctuation, drop stop words, lemmatize)
cleaned4 = clean(pod4)

# Show the cleaned sentences as the cell output
cleaned4

In [None]:
# Podcast 5 - source video: https://www.youtube.com/watch?v=v_336X798aU

# Absolute path of the downloaded audio file
audio_file5 = "/content/10_Benefits_of_Artificial_intelligence_in_Healthcare_v_336X798aU.m4a"

# Transcribe the audio; yields the path of the saved transcript JSON
# (None if transcription failed, in which case the next step will raise)
json_transcript5 = extract_transcript(audio_file5)

# Split the transcript text into '.'-separated fragments
pod5 = json_to_list(json_transcript5)

# Normalise the fragments (lowercase, strip punctuation, drop stop words, lemmatize)
cleaned5 = clean(pod5)

# Show the cleaned sentences as the cell output
cleaned5

In [None]:
# Merge the cleaned sentences of all five podcasts into one flat list,
# keeping the podcasts in order.
result = [*cleaned1, *cleaned2, *cleaned3, *cleaned4, *cleaned5]

print(result)


['artificial intelligence often depicted villain robot ready take world im tell ai actually save life improve health care million patient around world', 'ai helping u personalize delivery care make hospital efficient improve access health care providing accurate decision making tool', 'ai process educating computer model using complex large data set', 'model learns data training process build ability make decision predict outcome presented new data', 'talking access computer model know based experience thousand patient whether treatment likely work work best patient based individual condition', 'two room fact anywhere world alike', 'ai model helping doctor learn patient similar condition even similar genetic information', 'make highly informed decision diagnosis treatment option', 'want talk starting use ai delivering care cancer patient', 'cancer diagnosis immensely complicated doctor making decision diagnosing primary secondary cancer well patient understanding risk success rate trea

In [None]:
# One record per cleaned sentence: the text, its VADER sentiment label and
# the topic category of this batch of podcasts (change the category here).
data = [
  {'sentence': sentence, 'sentiment': getPolarity(sentence)[0], 'category': 'Healthcare'}
  for sentence in result
]

df = pd.DataFrame(data)

In [None]:
# Persist the labelled sentences to CSV (the DataFrame index is not written).
# NOTE: the file name embeds the category - update it whenever the 'category' value changes.
df.to_csv('Healthcare data (podcast-scraped).csv', index=False)
