In [None]:
!pip install deepgram-sdk==3.*
!pip install dataclasses typing-extensions verboselogs
!pip install python-dotenv
!pip install num2words
!pip install vaderSentiment

Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Collecting num2words
  Using cached num2words-0.5.13-py3-none-any.whl (143 kB)
Collecting docopt>=0.6.2 (from num2words)
  Using cached docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=70e9ab69041341fd91fe047ed98c14f9e603e8e75bf03c50416c00957763a890
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Installing collected packages: docopt, num2words
Successfully installed docopt-0.6.2 num2words-0.5.13
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━

In [None]:
# Copyright 2023-2024 Deepgram SDK contributors. All Rights Reserved.
# Use of this source code is governed by a MIT license that can be found in the LICENSE file.
# SPDX-License-Identifier: MIT


import os
import httpx
import json
import re
import logging, verboselogs
import pandas as pd
import nltk
from dotenv import load_dotenv
from datetime import datetime
from num2words import num2words
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from deepgram import (
    DeepgramClient,
    DeepgramClientOptions,
    PrerecordedOptions,
    FileSource,
    )


# --- NLTK resources -------------------------------------------------------
# Stop-word lists (common words such as "a", "an", etc.)
nltk.download('stopwords')
# The "popular" collection of NLTK resources (downloaded quietly)
nltk.download('popular', quiet=True)
# Punkt models for sentence tokenization (splitting a text into individual sentences)
nltk.download('punkt')
# WordNet, the English lexical database used by WordNetLemmatizer
nltk.download('wordnet')
# Brown corpus (used for training language models or evaluating NLP algorithms)
nltk.download('brown')
# VADER lexicon, used for performing sentiment analysis on text
nltk.download('vader_lexicon')


# --- Shared, module-level NLP helpers --------------------------------------
lemmatizer = WordNetLemmatizer()
# NOTE: SentimentIntensityAnalyzer is imported twice above (nltk.sentiment.vader,
# then vaderSentiment); the later import wins, so this is the vaderSentiment class.
analyzer = SentimentIntensityAnalyzer()
# NOTE(review): `stemmer` is not referenced by any cell visible in this notebook.
stemmer = SnowballStemmer("english")
stop_words = stopwords.words("english")


# Load environment variables from a .env file, if one is present.
# As the last expression of the cell, its return value is displayed as the cell output.
load_dotenv()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


False

In [None]:
def extract_transcript(AUDIO_FILE):
    """Transcribe a local audio file with Deepgram and save the response as JSON.

    The full Deepgram response is printed and written to ``<AUDIO_FILE>.json``,
    i.e. next to the audio file itself.

    The API key is read from the ``DEEPGRAM_API_KEY`` environment variable
    (for example from a ``.env`` file loaded with ``load_dotenv()``); it must
    never be embedded in the notebook.

    Args:
        AUDIO_FILE: Path of the audio file to transcribe.

    Returns:
        The path of the JSON file that was written, or ``None`` when any step
        fails (the exception is printed instead of being raised).
    """
    try:
        # STEP 1 Create a Deepgram client using the API key in the environment variables
        api_key = os.getenv("DEEPGRAM_API_KEY")
        if not api_key:
            raise RuntimeError(
                "DEEPGRAM_API_KEY is not set; export it or add it to a .env file"
            )
        config: DeepgramClientOptions = DeepgramClientOptions(
            verbose=logging.SPAM,
        )
        deepgram: DeepgramClient = DeepgramClient(api_key, config)

        # STEP 2 Call the transcribe_file method on the prerecorded class
        with open(AUDIO_FILE, "rb") as file:
            buffer_data = file.read()

        payload: FileSource = {
            "buffer": buffer_data,
        }

        options: PrerecordedOptions = PrerecordedOptions(
            model="nova",
            smart_format=True,
            utterances=True,
            punctuate=True,
            diarize=False,
            numerals=False,
        )

        # Long recordings take a while: 300 s default timeout, 10 s to connect.
        response = deepgram.listen.prerecorded.v("1").transcribe_file(
            payload, options, timeout=httpx.Timeout(300.0, connect=10.0)
        )

        # Serialize once and reuse the string for both the printout and the file.
        response_json = response.to_json(indent=4)
        print(response_json)

        # Store the JSON next to the audio file. Appending the suffix to the
        # given path keeps the directory intact; joining dirname(AUDIO_FILE)
        # with AUDIO_FILE again would duplicate the directory for relative
        # paths such as "data/talk.m4a".
        json_filename = f"{AUDIO_FILE}.json"

        with open(json_filename, "w") as json_file:
            json_file.write(response_json)

        print(f"Transcript JSON file '{json_filename}' created successfully.")
        return json_filename

    except Exception as e:
        # Best-effort: report the failure and signal it to the caller with None.
        print(f"Exception: {e}")
        return None

In [None]:
# Extracting the transcript from the json file
# and saving it into a list
def json_to_list(transcription_file):
    """Read a transcription JSON file and split its transcript on periods.

    The transcript is taken from the first alternative of the first channel
    (``results.channels[0].alternatives[0].transcript``).

    Args:
        transcription_file: Path of the JSON file to read.

    Returns:
        A list with the pieces of the transcript obtained by splitting on
        ``'.'``. Pieces are returned as-is: they are not stripped, and a
        transcript ending in a period yields a trailing empty string.
    """
    with open(transcription_file, "r") as handle:
        payload = json.load(handle)

    first_alternative = payload['results']['channels'][0]['alternatives'][0]
    transcript = first_alternative['transcript']
    return transcript.split('.')


In [None]:
def clean(list):
    """Normalize raw transcript fragments into cleaned sentences.

    Every fragment is split into sentences; each sentence is lowercased,
    stripped of punctuation, tokenized, and then rebuilt from its tokens:
    decimal-digit tokens are spelled out with ``num2words``, stop words are
    dropped, and every kept token is lemmatized.

    Args:
        list: Iterable of text fragments. (The name shadows the built-in and is
            kept only for backward compatibility with existing callers.)

    Returns:
        A list with one cleaned, space-joined string per sentence found.
    """
    # Set membership is O(1); the module-level stop_words is a plain list.
    stop_word_set = set(stop_words)

    cleaned = []
    for fragment in list:
        for sentence in sent_tokenize(fragment):
            # Lowercase, then remove everything that is not a word char or whitespace
            text = re.sub(r"[^\w\s]", "", sentence.lower())

            kept_tokens = []
            for word in word_tokenize(text):
                # isdecimal(), not isnumeric(): the latter is also True for
                # characters such as "½" or "²", which num2words cannot parse.
                if word.isdecimal():
                    word = num2words(word)
                elif word in stop_word_set:
                    continue
                # Lemmatize (not stem) every token that is kept
                kept_tokens.append(lemmatizer.lemmatize(word))

            # Join tokens back into a string
            cleaned.append(" ".join(kept_tokens))
    return cleaned

In [None]:
# Use Vader library to get the polarity of the sentence
# Sentences with polarity more than zero are positive, whilst sentences with polarity less than zero are negative
# Sentences with zero polarity are neutral

def getPolarity(text):
    """Classify the sentiment of a piece of text with VADER.

    Args:
        text: The text to score.

    Returns:
        A ``(sentiment, polarity)`` tuple where ``sentiment`` is ``'positive'``,
        ``'negative'`` or ``'neutral'`` depending on whether the compound score
        is above, below or equal to zero, and ``polarity`` is the full score
        dictionary returned by ``polarity_scores``.
    """
    # Reuse the module-level analyzer instead of constructing a new
    # SentimentIntensityAnalyzer for every sentence.
    polarity = analyzer.polarity_scores(text)
    compound = polarity['compound']
    if compound > 0:
        sentiment = 'positive'
    elif compound < 0:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    return sentiment, polarity

In [None]:
# Podcast 1 — source video:
# https://www.youtube.com/watch?v=w7xJ1kypQxs

# Path of the audio file to transcribe
audio_file1 = "/content/AI_is_Transforming_Airports_aviation_industry_Discoveri_w7xJ1kypQxs.m4a"

# Transcribe the audio; yields the path of the saved transcript JSON
json_transcript1 = extract_transcript(audio_file1)

# Split the transcript text into '.'-delimited fragments
pod1 = json_to_list(json_transcript1)

# Normalize the fragments into cleaned sentences
cleaned1 = clean(pod1)

# Show the cleaned sentences as the cell output
cleaned1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                                "start": 144.85999,
                                "end": 145.35999,
                                "confidence": 0.9975586,
                                "punctuated_word": "social"
                            },
                            {
                                "word": "ills",
                                "start": 145.5,
                                "end": 146.0,
                                "confidence": 0.9758301,
                                "punctuated_word": "ills."
                            },
                            {
                                "word": "bringing",
                                "start": 147.185,
                                "end": 147.665,
                                "confidence": 0.7949219,
                                "punctuated_word": "Bringing"
                            },
                            {
      

In [None]:
# Podcast 2 — source video:
# https://www.youtube.com/watch?v=mXriW9m8IUM

# Path of the audio file to transcribe
audio_file2 = "/content/The_Power_of_AI_in_Tourism_and_Travel_️_mXriW9m8IUM_139.m4a"

# Transcribe the audio; yields the path of the saved transcript JSON
json_transcript2 = extract_transcript(audio_file2)

# Split the transcript text into '.'-delimited fragments
pod2 = json_to_list(json_transcript2)

# Normalize the fragments into cleaned sentences
cleaned2 = clean(pod2)

# Show the cleaned sentences as the cell output
cleaned2

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                                "end": 24.21,
                                "confidence": 0.9970703,
                                "punctuated_word": "This"
                            },
                            {
                                "word": "is",
                                "start": 24.21,
                                "end": 24.45,
                                "confidence": 1.0,
                                "punctuated_word": "is"
                            },
                            {
                                "word": "where",
                                "start": 24.45,
                                "end": 24.85,
                                "confidence": 0.99902344,
                                "punctuated_word": "where"
                            },
                            {
                                "word": "ai",
                                "start

In [None]:
# Podcast 3 — source video:
# https://www.youtube.com/watch?v=usojobvpLx4

# Path of the audio file to transcribe
audio_file3 = "/content/AI_now_a_popular_tool_in_the_tourism_industry_usojobvpLx4_140.m4a"

# Transcribe the audio; yields the path of the saved transcript JSON
json_transcript3 = extract_transcript(audio_file3)

# Split the transcript text into '.'-delimited fragments
pod3 = json_to_list(json_transcript3)

# Normalize the fragments into cleaned sentences
cleaned3 = clean(pod3)

# Show the cleaned sentences as the cell output
cleaned3

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                        "punctuated_word": "off"
                    },
                    {
                        "word": "on",
                        "start": 127.18,
                        "end": 127.34,
                        "confidence": 0.9980469,
                        "punctuated_word": "on"
                    },
                    {
                        "word": "new",
                        "start": 127.34,
                        "end": 127.58,
                        "confidence": 0.98779297,
                        "punctuated_word": "new"
                    },
                    {
                        "word": "adventures",
                        "start": 127.58,
                        "end": 128.08,
                        "confidence": 0.99365234,
                        "punctuated_word": "adventures"
                    }
                ],
                "id": "a4d262fd-c8ea-4347-991

In [None]:
# Podcast 4 — source video:
# https://www.youtube.com/watch?v=G09xrHQeMgA

# Path of the audio file to transcribe
audio_file4 = "/content/_53_20_AI_s_Impact_on_Transportation_Travel_and_Touris_G09xrHQeMgA.m4a"

# Transcribe the audio; yields the path of the saved transcript JSON
json_transcript4 = extract_transcript(audio_file4)

# Split the transcript text into '.'-delimited fragments
pod4 = json_to_list(json_transcript4)

# Normalize the fragments into cleaned sentences
cleaned4 = clean(pod4)

# Show the cleaned sentences as the cell output
cleaned4

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                    {
                        "word": "its",
                        "start": 238.845,
                        "end": 239.11,
                        "confidence": 0.9902344,
                        "punctuated_word": "its"
                    },
                    {
                        "word": "customer",
                        "start": 239.11,
                        "end": 239.61,
                        "confidence": 0.98339844,
                        "punctuated_word": "customer"
                    },
                    {
                        "word": "service",
                        "start": 239.67,
                        "end": 240.15,
                        "confidence": 0.96118164,
                        "punctuated_word": "service,"
                    },
                    {
                        "word": "but",
                        "start": 240.15,
                        "

In [None]:
# Podcast 5 — source video:
# https://www.youtube.com/watch?v=W6jmzd_5jSw

# Path of the audio file to transcribe
audio_file5 = "/content/AI_in_TRAVEL_and_TOURISM_W6jmzd_5jSw_139.m4a"

# Transcribe the audio; yields the path of the saved transcript JSON
json_transcript5 = extract_transcript(audio_file5)

# Split the transcript text into '.'-delimited fragments
pod5 = json_to_list(json_transcript5)

# Normalize the fragments into cleaned sentences
cleaned5 = clean(pod5)

# Show the cleaned sentences as the cell output
cleaned5

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                                "confidence": 1.0,
                                "punctuated_word": "provide"
                            },
                            {
                                "word": "real",
                                "start": 47.955,
                                "end": 48.195,
                                "confidence": 0.9995117,
                                "punctuated_word": "real"
                            },
                            {
                                "word": "time",
                                "start": 48.195,
                                "end": 48.515003,
                                "confidence": 0.99609375,
                                "punctuated_word": "time"
                            },
                            {
                                "word": "information",
                                "start": 48.515003,
           

In [None]:
# Podcast 6 — source video:
# https://www.youtube.com/watch?v=TGFna9GvAp4

# Path of the audio file to transcribe
audio_file6 = "/content/Artificial_Intelligence_and_how_it_impacts_the_Travel_I_TGFna9GvAp4 (1).m4a"

# Transcribe the audio; yields the path of the saved transcript JSON
json_transcript6 = extract_transcript(audio_file6)

# Split the transcript text into '.'-delimited fragments
pod6 = json_to_list(json_transcript6)

# Normalize the fragments into cleaned sentences
cleaned6 = clean(pod6)

# Show the cleaned sentences as the cell output
cleaned6

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                        "start": 432.935,
                        "end": 433.255,
                        "confidence": 0.89501953,
                        "punctuated_word": "because"
                    },
                    {
                        "word": "it's",
                        "start": 433.255,
                        "end": 433.415,
                        "confidence": 0.99975586,
                        "punctuated_word": "it's"
                    },
                    {
                        "word": "not",
                        "start": 433.415,
                        "end": 433.575,
                        "confidence": 1.0,
                        "punctuated_word": "not"
                    },
                    {
                        "word": "in",
                        "start": 433.575,
                        "end": 433.735,
                        "confidence": 1.0,
                 

Transcript JSON file '/content/Artificial_Intelligence_and_how_it_impacts_the_Travel_I_TGFna9GvAp4 (1).m4a.json' created successfully.


['good morning',
 'im proud happy',
 'thank christian second time invitation',
 'whats minute',
 'yes',
 'ten minute could go whats exactly technically speaking ai would like two topic next ten minute',
 'first one regarding sustainability done stand regarding trouble right',
 'could solution resolving travel industry issue regarding facing sustainability globally global emission front u',
 'ronenberger world war travel last year worked summary study white book',
 'delivered could commitment industry shared built designed',
 'regarding address sustainability topic',
 'white book could download qr code',
 'could find website',
 'hope',
 'designed five strong commitment action',
 'first one cooperation within industry within industry also outside fund private equity invest transformation industry',
 'also course stake thats regulation could support investment sustainable aviation fuel transition relay state hospitality long list lever need transformation travel',
 'second one offer commi

In [None]:
# Merge the cleaned sentences of all six podcasts into one flat list,
# keeping the podcasts in order (1 through 6).
result = [
    *cleaned1,
    *cleaned2,
    *cleaned3,
    *cleaned4,
    *cleaned5,
    *cleaned6,
]

print(result)


['delta collaborated u custom border protection', 'transportation security administration hartsfield jackson atlanta international airport create 1st biometric airport terminal already operational', 'dubai international airport also implemented similar system 1st business class passenger', 'system powered ai represent early stage transformation airport airline operation', 'eventual foray ai throughout passenger experience driven technological advancement need improvement compete autonomous vehicle', 'ai potential solve long standing issue air travel long line bad behavior', 'however also highlight concern privacy violation unintended consequence could arise implementation ai air travel', 'replacing glacial dehumanizing boarding processor', 'ai ability see understand complex data applied create better aircraft boarding process replacing current inefficient time consuming method like google use ai satellite data fishing fleet prevent illegal fishing computer vision powered ai could help 

In [None]:
# One record per cleaned sentence: the sentence itself, its sentiment label
# (first element of getPolarity's return value) and the topic category.
data = [
    {
        'sentence': sentence,
        'sentiment': getPolarity(sentence)[0],
        'category': 'Travel',    # add category here
    }
    for sentence in result
]

# Tabular view of the labelled sentences
df = pd.DataFrame(data)

In [None]:
# Export the labelled sentences to CSV, without the DataFrame index column.
# When reusing this notebook for another topic, change the 'category' value
# assigned to each record and the category name in this file name to match.
df.to_csv('Travel data (podcast-scraped).csv', index=False)
