<a href="https://colab.research.google.com/github/seimwiwa/playground_python/blob/main/20231216_podcast_analysis_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Install
%%capture

# Install the audio, embedding, clustering and topic-modelling packages used
# further down. %%capture hides pip's output. No versions are pinned, so a
# fresh runtime installs whatever is current at that time.
!pip install pydub pyannote.audio umap-learn datashader hdbscan openai bertopic

In [3]:
#@title Import
%%capture

import torch, json, os, requests, openai, nltk
from google.colab import drive

import pandas as pd
import numpy as np

from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from pyannote.audio import Model, Inference

import umap, umap.plot
from hdbscan import HDBSCAN
import plotly.express as px

from nltk.corpus import stopwords
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import PartOfSpeech, OpenAI
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
#@title Connect Google Drive
# Mount Drive at /content/drive; the episode audio and the split clips are
# stored under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
# Read the OpenAI key from the environment so it is never saved in the
# notebook; falls back to the previous empty string when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [6]:
#@title Download Podcast Audio

def download_file(url, path, timeout = 60, chunk_size = 1 << 20):
    """Download `url` and save the response body to `path`.

    Args:
        url: HTTP(S) address of the file to fetch.
        path: Destination file path; an existing file is overwritten.
        timeout: Seconds to wait on the server before giving up.
        chunk_size: Number of bytes requested per write while streaming.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (for example when a signed link has expired).
    """
    with requests.get(url, stream = True, timeout = timeout) as response:
        # Fail loudly instead of saving an error page as if it were audio.
        # The destination is only opened after this check, so a failed
        # request leaves any existing file untouched.
        response.raise_for_status()
        with open(path, "wb") as file:
            # Stream to disk so the whole episode is never held in memory.
            for chunk in response.iter_content(chunk_size = chunk_size):
                file.write(chunk)

In [7]:
# Source of the episode audio.
# NOTE(review): the query string carries Expires / Signature / Key-Pair-Id
# parameters, so this looks like a time-limited signed link — it will probably
# need to be refreshed before the download cell is re-run. TODO confirm.
url = "https://content.libsyn.com/p/1/1/2/1122415982dc17f7/CIS5E1.mp3?c_id=162441116&response-content-disposition=attachment&cs_id=162441116&destination_id=1431812&response-content-type=audio%2Fmpeg&Expires=1702783225&Signature=GFJFKBKPvetBVrERXxz~5EGaCv210o~lOV4hll5tp8N87UeEDbHsTTqmSco3eJ5s1EgtaVX704IIvMYH7Im0V4gga-WIIcGruGMClwWhzS~4B1pLV-PM6DqaVvPTcxNAaXRsKCc-ZZWNmVcvSmaz34KFQQJjNf5XAa57U2RNsI83ozlV0FzgkxoHhvW2UNZj9wWtt1x-pmKfUiCvW3~Ejb0vu9bLGsr3reMJjx7l0npbb92GV5FwsKPctHcqyB0nYPG~69vwQN-ewRWJZYYDsJ0C8KnbrfPMGtBIwUO9G~xubdj5EMJ04vr3Gc2Gf~Jf6H7P5Y~hlgcdDJMHXYc9XQ__&Key-Pair-Id=K1YS7LZGUP96OI"
# Where the MP3 is stored on the mounted Drive; later cells read it from here.
path = "/content/drive/MyDrive/(07) Colab/20231216_podcast_analysis/CIS5E1.mp3"

In [None]:
# Fetch the episode and write it to `path` on Drive.
download_file(url, path)

In [8]:
#@title Prepare ASR Model
%%capture

model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3", torch_dtype = torch.float16)
model.to("cuda:0")

processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")

pipe = pipeline("automatic-speech-recognition", model = model, tokenizer = processor.tokenizer,
                feature_extractor = processor.feature_extractor, max_new_tokens = 128,
                ##Critial: chunk_length_s, stride_length_s
                chunk_length_s = 10, stride_length_s=(4, 2), batch_size = 16,
                return_timestamps = True, torch_dtype = torch.float16, device = "cuda:0")

In [9]:
#@title Get Scripts
%%capture
# Transcribe the whole episode. The "chunks" entry of the result is what the
# timing table below is built from.
scripts = pipe(path)

In [10]:
# Decode the MP3 and resample it to 16 kHz in a single expression.
audio = AudioSegment.from_mp3(path).set_frame_rate(16000)

In [11]:
# One row per transcribed chunk, with timings converted from seconds to milliseconds.
script_info = pd.DataFrame(scripts["chunks"])
script_info["id"] = script_info.index
script_info["started_at"] = script_info["timestamp"].apply(lambda x: x[0] * 1000)
# A chunk can come back without an end timestamp (None). Treat it as running to
# the end of the recording instead of failing on `None * 1000`.
script_info["ended_at"] = script_info["timestamp"].apply(
    lambda x: x[1] * 1000 if x[1] is not None else float(len(audio)))
script_info["duration"] = script_info["ended_at"] - script_info["started_at"]
# Clip window for each chunk: from the end of the previous chunk to the start
# of the next one (the first and last rows fall back to the overall min / max).
script_info["start"] = script_info["ended_at"].shift(periods = 1, fill_value = script_info["started_at"].min())
script_info["end"] = script_info["started_at"].shift(periods = -1, fill_value = script_info["ended_at"].max())
# Chunks shorter than 500 ms get an extra 500 ms of context on each side.
script_info["start"] = np.where(script_info["duration"]<500, script_info["start"]-500, script_info["start"])
script_info["end"] = np.where(script_info["duration"]<500, script_info["end"]+500, script_info["end"])
# The padding must not move a window before the beginning of the audio;
# a negative offset is not a valid clip start.
script_info["start"] = script_info["start"].clip(lower = 0)
script_info = script_info.drop(["timestamp"], axis = 1)

In [12]:
# Directory on Drive that receives one WAV clip per transcript chunk.
folder = "/content/drive/MyDrive/(07) Colab/20231216_podcast_analysis/splited_audio/"

In [13]:
# Create the output folder on first use; exporting into a missing directory fails.
os.makedirs(folder, exist_ok = True)

# Cut one WAV clip per transcript chunk, named after its zero-padded row index.
for i in script_info.index:
  # .at reads a single cell directly instead of chained column/row indexing.
  start = script_info.at[i, "start"]
  end = script_info.at[i, "end"]
  audio[start:end].export(folder + "CIS5E1_" + str(i).zfill(4) + ".wav", format = "wav")

In [14]:
#@title Get Embedding Models
%%capture
embedding_model = Inference(Model.from_pretrained("pyannote/embedding", use_auth_token="XXXXXX"), window = "whole")

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--embedding/snapshots/c6335d8f1cd77b30084387468a6cf26fea90009b/pytorch_model.bin`
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--embedding/snapshots/c6335d8f1cd77b30084387468a6cf26fea90009b/pytorch_model.bin`


In [15]:
# Build each row's clip path from its index using the same naming rule as the
# export loop. Listing the directory instead would pick up stale or extra clips
# from earlier runs and either fail on a length mismatch or pair rows with the
# wrong audio.
files = [folder + "CIS5E1_" + str(i).zfill(4) + ".wav" for i in script_info.index]

missing = [f for f in files if not os.path.exists(f)]
if missing:
  raise FileNotFoundError(str(len(missing)) + " expected clip(s) not found, first: " + missing[0])

script_info["path"] = files
# One speaker embedding per clip.
script_info["embedding"] = script_info["path"].apply(embedding_model)

In [16]:
#@title Detect Speakers
# Project the voice embeddings to 2-D. UMAP is stochastic, so the seed is fixed
# to make the resulting speaker labels repeatable from run to run.
umap_reducer = umap.UMAP(n_neighbors = 10, min_dist = 0.1, n_components = 2, metric="euclidean", random_state = 42)
# NOTE: despite its name, `reduced_embeddings` is the fitted reducer; the 2-D
# coordinates are read from its `.embedding_` attribute below.
reduced_embeddings = umap_reducer.fit(np.array(script_info["embedding"].tolist()))

hdbscan = HDBSCAN(min_cluster_size = 20, gen_min_span_tree = True)

# The cluster label of each clip is used as its speaker id.
script_info["speaker"] = hdbscan.fit(reduced_embeddings.embedding_).labels_

In [17]:
# Components handed to BERTopic in the next cell.
# The UMAP seed is fixed so that topic assignments are repeatable across runs.
umap_model = umap.UMAP(n_neighbors = 10, n_components = 5, min_dist=0.0, metric = "cosine", random_state = 42)
hdbscan_model = HDBSCAN(min_cluster_size = 5, min_samples = 10, metric = "euclidean", cluster_selection_method = "eom", prediction_data = True)
# The English stop-word list has to be downloaded before stopwords.words() can read it.
nltk.download("stopwords")
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words = stopwords.words("english"))
ctfidf_model = ClassTfidfTransformer(bm25_weighting = True, reduce_frequent_words=True)

In [None]:
# The transcript chunks, in row order, are the documents to model.
text = script_info["text"].tolist()

# Assemble BERTopic from the components configured above and fit it in one go.
topic_model = BERTopic(
    language = "english",
    embedding_model = "all-mpnet-base-v2",
    umap_model = umap_model,
    hdbscan_model = hdbscan_model,
    vectorizer_model = vectorizer_model,
    ctfidf_model = ctfidf_model,
).fit(text)

# Per-document topic information for the same documents.
topic_info = topic_model.get_document_info(text)

In [19]:
# Show the per-topic summary table (topic id, count, name, representation, example docs).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,378,-1_two_dad_study_work,"[two, dad, study, work, years, today, two year...","[ Yeah, that's great. It's so neat to hear abo..."
1,0,81,0_data_databases_algorithm_communicate,"[data, databases, algorithm, communicate, quan...","[ more in the last 10 or plus years, you know,..."
2,1,40,1_better_got tremendously_tremendously good_go...,"[better, got tremendously, tremendously good, ...",[ are going to be stronger and better as time ...
3,2,36,2_methods_using_use methods_correctly,"[methods, using, use methods, correctly, using...","[ And then one day I woke up and saw, oh my go..."
4,3,29,3_lucy_ralph_sr_ralph jr,"[lucy, ralph, sr, ralph jr, ralph sr, jr, elli...",[ Ralph Jr.'s paper it's like you know cited p...
5,4,28,4_heart disease_disease_coronary heart disease...,"[heart disease, disease, coronary heart diseas...",[ new guidelines for a particular medication o...
6,5,27,5_audience_larger_applied audience_means,"[audience, larger, applied audience, means, ap...","[ you know, interaction of love, like understa..."
7,6,27,6_career_three_submitting_submitting program,"[career, three, submitting, submitting program...","[ bring, um, beginning of my career, as much a..."
8,7,26,7_computers_hide_dashboards_hide data,"[computers, hide, dashboards, hide data, quite...","[ Well, we should have real time dashboards fo..."
9,8,26,8_science_research_forward_version,"[science, research, forward, version, meta, ve...","[ and I think this could happen, but it's, I'm..."


In [20]:
# Topic hierarchy table; reused by the dendrogram, the tree printout and the
# parent-topic grouping in the cells below.
hierarchical_topics = topic_model.hierarchical_topics(text)

100%|██████████| 24/24 [00:00<00:00, 181.52it/s]


In [21]:
# Dendrogram of the topic hierarchy, drawn left to right.
topic_model.visualize_hierarchy(
    hierarchical_topics = hierarchical_topics,
    width = 800,
    height = 600,
    orientation = "left",
    top_n_topics = 30,
    title = "Topics in This Episode",
)

In [22]:
# Text rendering of the same hierarchy; print() is used so the tree's line
# breaks are rendered rather than shown as an escaped string.
print(topic_model.get_topic_tree(hierarchical_topics))

.
├─framingham_heart_heart disease_functions_disease
│    ├─heart_heart disease_disease_coronary heart_coronary heart disease
│    │    ├─■──heart study_heart study dad_thinking something_study even_thinking something like ── Topic: 22
│    │    └─■──heart disease_disease_coronary heart disease_coronary heart_coronary ── Topic: 4
│    └─framingham_functions_framingham functions_framingham study_developed
│         ├─■──framingham_framingham study_studies_contribution_function ── Topic: 17
│         └─■──functions_developed_framingham functions_framingham_called framingham ── Topic: 19
└─data_methods_lucy_propensity_inference
     ├─statistics medicine_statistics_professor_medicine_university
     │    ├─■──statistics medicine_statistics_medicine_epidemiology_biostatistics ── Topic: 15
     │    └─■──forest_got_wake_united_involved cancer ── Topic: 14
     └─data_methods_lucy_propensity_inference
          ├─data_databases_see_results_computers
          │    ├─data_databases_results_co

In [189]:
# Map every topic to a parent: skip the first four rows of the hierarchy table,
# expand each merge into one row per member topic, and keep, for each topic,
# the parent whose merge has the largest distance.
parent_topics = (
    hierarchical_topics
    .iloc[4:]
    .filter(["Parent_ID", "Topics", "Distance"])
    .explode("Topics")
    .sort_values(["Distance"], ascending = False)
    .groupby(["Topics"], as_index = False)
    .agg(parent_topic = ("Parent_ID", "first"))
    .rename(columns = {"Topics": "topic"})
)

In [195]:
# Build the paragraph-level timeline in one chain.
timeline = (
    script_info
    .filter(["id", "text", "start", "end", "duration", "speaker"])
    .sort_values(["id"], ascending = True)
    # Attach each chunk's topic, then its parent topic; topics without a
    # parent keep their own id as the parent.
    .assign(topic = topic_info["Topic"])
    .merge(parent_topics, on = "topic", how = "left")
    .assign(parent_topic = lambda df: np.where(df["parent_topic"].isnull(), df["topic"], df["parent_topic"]))
    # Chunks with topic -1 are left out.
    .query("topic != -1")
    # A new paragraph begins whenever the parent topic or the speaker differs
    # from the previous row.
    .assign(paragraph = lambda df: (
        df["parent_topic"].ne(df["parent_topic"].shift())
        | df["speaker"].ne(df["speaker"].shift())
    ).cumsum())
    # Collapse every paragraph into a single row.
    .sort_values(["paragraph"], ascending = True)
    .groupby(["paragraph", "speaker", "parent_topic"], as_index = False)
    .agg(text = ("text", lambda parts: " ".join(parts)),
         started_at = ("start", "first"),
         ended_at = ("end", "last"),
         duration = ("duration", "sum"))
    .sort_values(["paragraph"], ascending = True)
    # Millisecond offsets become timestamps; speaker ids become strings.
    .assign(started_at = lambda df: pd.to_datetime(df["started_at"], utc = True, unit = "ms"),
            ended_at = lambda df: pd.to_datetime(df["ended_at"], utc = True, unit = "ms"),
            speaker = lambda df: df["speaker"].astype(str))
)

In [192]:
# Template sent to the completion model; {text} is replaced by the content to title.
prompt = "Given the following contents: \n{text}\n, the simple and short with less than 10 word title of this is:"

def get_representation(x):
  """Return the model's short title for the text `x`.

  Fills `prompt` with `x`, sends it to the OpenAI completions endpoint with
  temperature 0, and returns the text of the first choice unmodified.
  """
  completion = openai.completions.create(
      model = "gpt-3.5-turbo-instruct-0914",
      prompt = prompt.format(text = x),
      max_tokens = 128,
      temperature = 0,
  )
  return completion.choices[0].text

# One bullet list of paragraph texts per parent topic.
summary = (
    timeline
    .groupby(["parent_topic"], as_index = False)
    .agg(text = ("text", lambda x: " - " + "\n - ".join(x)))
)

# Ask the model for a short title for each parent topic.
summary["summary"] = summary["text"].apply(get_representation)
# Strip newlines, double quotes and a leading space from the titles. The pattern
# is a regular expression, so regex = True is passed explicitly: pandas 2.x
# treats the pattern as a literal string by default, which would leave the
# titles uncleaned.
summary["summary"] = summary["summary"].str.replace('\n|"|^ ', "", regex = True)

In [196]:
# Attach the generated title to every paragraph through its parent topic.
timeline = timeline.merge(
    summary.filter(items = ["parent_topic", "summary"]),
    on = "parent_topic",
    how = "left",
)

In [197]:
# Let every paragraph run until the next one starts (the last keeps the latest
# end time), then order the rows by speaker with a fresh index.
timeline = (
    timeline
    .assign(ended_at = lambda df: df["started_at"].shift(-1, fill_value = df["ended_at"].max()))
    .sort_values(["speaker"])
    .reset_index(drop = True)
)

In [200]:
# Chart title: fixed heading, a model-written one-line description built from
# all topic titles, and a pointer to the podcast site.
title = "".join([
    "Key Topics in Casual Inference S5 E1",
    "<br><sup>This Episode is about: ",
    get_representation(" ".join(summary["summary"].tolist())),
    "<br>Please visit: https://casualinfer.libsyn.com/ for more information</sup>",
])

In [201]:
# Gantt-style chart: one lane per speaker, bars coloured by topic title.
(
    px.timeline(
        timeline,
        x_start = "started_at",
        x_end = "ended_at",
        y = "speaker",
        color = "summary",
        template = "plotly_white",
        color_discrete_sequence = px.colors.qualitative.Set2,
        width = 1200,
        height = 400,
        title = title,
    )
    .update_layout(
        xaxis_title = "Time",
        yaxis_title = "Speaker",
        xaxis_tickformat = "%H:%M:%S",
        hovermode = "x unified",
        legend_title = "",
        legend = {"x": -0.1, "y": -1},
    )
    .update_traces(hovertemplate = "Speaker: %{y}")
)