## Setup

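Install the dependencies (openai-whisper, feedparser, sentence-transformers and the ffmpeg system package), then import them, load the transcription and sentence-embedding models, and define the helper functions.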

In [None]:
!pip install --quiet -U pip
!pip install --quiet -U openai-whisper
!pip install feedparser
!pip install --quiet -U sentence-transformers
!apt-get update && apt install -y ffmpeg

In [None]:
!pip uninstall whisper
!pip install --force-reinstall openai-whisper==20230124

In [None]:
import re
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.signal import argrelextrema
import math

import whisper

import feedparser
import urllib.request

### transcription model ###
# Name of the Whisper checkpoint handed to whisper.load_model below.
model_size = "large-v2"

print('Loading transcription model...')
# No device argument is passed, so whisper picks the device itself; to force
# the GPU use whisper.load_model(model_size, device="cuda").
# NOTE(review): confirm the default is the GPU when one is available.
model = whisper.load_model(model_size)
# The alternatives below use a `WhisperModel` class that this notebook never
# imports (it looks like the faster-whisper API — TODO confirm). They will not
# run as-is; that package would have to be installed and imported first.
# run on GPU with INT8:
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU with INT8:
# model = WhisperModel(model_size, device="cpu", compute_type="int8")
print('Done.')

### paragraph model ###
# Sentence-embedding model; paragraphise() encodes each sentence with it.
print('Loading word embedding model...')
sentencemodel = SentenceTransformer('all-mpnet-base-v2')
print('Done.')

def rev_sigmoid(x:float)->float:
    """Decreasing sigmoid: 1 / (1 + exp(0.5 * x)).

    Args:
        x (float): input value.
    Returns:
        float: a value in [0, 1]; 0.5 at x == 0, approaching 1 for very
        negative x and 0 for very positive x.
    """
    try:
        return (1 / (1 + math.exp(0.5*x)))
    except OverflowError:
        # math.exp overflows for large positive x; the limit of the expression is 0.
        return 0.0
    
def activate_similarities(similarities:np.array, p_size=10)->np.array:
    """Return, for every sentence, a weighted sum of its similarities to the sentences that follow it.

    Args:
        similarities (numpy array): square matrix in which entry (i, j) is the
            cosine similarity between sentence i and sentence j.
        p_size (int): number of sentences (the sentence itself plus the ones
            after it) that contribute to each weighted sum. If the matrix has
            fewer rows than this, only the leading weights are used.
    Returns:
        numpy array: one weighted sum per sentence; empty for an empty matrix.
    """
    n_sentences = similarities.shape[0]
    if n_sentences == 0:
        # Nothing to stack below; an empty input yields an empty result.
        return np.zeros(0)

    # Sample the decreasing sigmoid 1 / (1 + exp(0.5 * x)) (the same formula as
    # rev_sigmoid) on p_size points of [-10, 10]: weight k applies to the
    # sentence k positions ahead, so nearer sentences count more.
    x = np.linspace(-10, 10, p_size)
    weights = 1 / (1 + np.exp(0.5 * x))
    # With fewer sentences than p_size there are only n_sentences diagonals, so
    # keep the leading weights instead of asking np.pad for a negative width.
    weights = weights[:n_sentences]
    # Sentences further away than p_size get weight zero.
    activation_weights = np.pad(weights, (0, n_sentences - len(weights)))

    # Diagonal k of the matrix holds the similarity of every sentence to the
    # one k positions after it. Diagonals get shorter as k grows, so pad each
    # with zeros at the end to a common length before stacking them as rows.
    diagonals = [similarities.diagonal(offset) for offset in range(n_sentences)]
    diagonals = np.stack([np.pad(d, (0, n_sentences - len(d))) for d in diagonals])

    # Scale row k by weight k, then sum down the columns: column i becomes the
    # weighted sum for sentence i.
    diagonals = diagonals * activation_weights.reshape(-1, 1)
    activated_similarities = np.sum(diagonals, axis=0)
    return activated_similarities

def paragraphise(bigstring):
    """Insert paragraph breaks into a block of text where the topic appears to change.

    The text is split into sentences on '. ', each sentence is embedded with
    `sentencemodel`, and a break is placed before every sentence at which the
    activated similarity to the following sentences has a local minimum.

    Args:
        bigstring (str): the text to split, e.g. a raw transcript.
    Returns:
        str: the same sentences joined by '. ', with '\\n\\n ' inserted before
        each sentence that starts a new paragraph. Text with fewer sentences
        than the similarity window is returned unchanged.
    """
    window = 5  # number of sentences that contribute to each similarity score
    sentences = bigstring.split('. ')

    # Too few sentences to fill one similarity window: nothing to split, and
    # skipping here avoids running the embedding model for no benefit.
    if len(sentences) < window:
        return bigstring

    embeddings = sentencemodel.encode(sentences, show_progress_bar=False)
    similarities = cosine_similarity(embeddings)
    activated_similarities = activate_similarities(similarities, p_size=window)
    # `order` sets how many neighbours on each side a point must undercut to
    # count as a minimum, i.e. how frequent the splits are. Changing it is not
    # recommended.
    minimas = argrelextrema(activated_similarities, np.less, order=2)
    split_points = set(minimas[0].tolist())

    last = len(sentences) - 1
    pieces = []
    for num, each in enumerate(sentences):
        if num in split_points:
            pieces.append('\n\n ')
        pieces.append(each)
        # split('. ') removed the separator between sentences only, so restore
        # it between sentences only; the last sentence keeps its own ending
        # rather than gaining an extra '. '.
        if num != last:
            pieces.append('. ')
    return ''.join(pieces)

### other functions ### 
def zfill_alternative(x,l=2):
    """Left-pad the string `x` with '0' characters up to length `l`.

    Strings that are already `l` characters or longer are returned unchanged.
    Unlike str.zfill, a leading sign gets no special treatment: the zeros
    always go in front of the whole string.
    """
    missing = l - len(x)
    if missing <= 0:
        return x
    return '0' * missing + x

## Transcribe and Prettify Podcast from RSS feed

The very clever idea for making paragraphs was taken from [this notebook](https://github.com/poloniki/quint/blob/master/notebooks/Chunking%20text%20into%20paragraphs.ipynb).

The example podcast used in the notebook is a Russian book podcast [Knizhnyy Bazar](https://tehnikarechi.studio/podcasts/knizhnyy-bazar).



In [None]:
import feedparser
import urllib.request

# Fetch the podcast's RSS feed and show which fields its first entry carries,
# to see what metadata the download loop further down can use.
NewsFeed = feedparser.parse("https://tehnikarechi.studio/api/rss/podcasts/knizhnyy-bazar")

# feedparser reports fetch/parse problems on the result object instead of
# raising, so a failed download would otherwise show up only as a bare
# IndexError on entries[0].
if not NewsFeed.entries:
    raise RuntimeError(
        "The feed returned no entries; feedparser reported: "
        f"{NewsFeed.get('bozo_exception')!r}"
    )
entry = NewsFeed.entries[0]

print(entry.keys())

In [None]:
!mkdir knizhnyy-bazar

In [None]:
# Download every episode listed in the RSS feed, transcribe it, split the
# transcript into paragraphs and save it as <dl_dir>/<title>.txt with a small
# metadata header. Episodes that already have a transcript are skipped, so the
# cell can be re-run to resume an interrupted session.
NewsFeed = feedparser.parse("https://tehnikarechi.studio/api/rss/podcasts/knizhnyy-bazar")

dl_dir = 'knizhnyy-bazar'
max_attempts = 3  # tries per episode before it is given up and recorded as failed

# Titles of the episodes that still failed after max_attempts tries.
failsforsomereason = []

# Transcripts already on disk, read from the same relative directory the
# results are written to. Titles contain no '.', so dropping the extension
# recovers the title.
os.makedirs(dl_dir, exist_ok=True)
arr = os.listdir(dl_dir)
done_titles = {os.path.splitext(x)[0] for x in arr}

# Send a browser-like User-agent with every download. install_opener is global,
# so it only needs to be set up once, not once per episode.
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

for entry in tqdm(NewsFeed.entries):
    # Build a file-system-safe title: non-breaking spaces become spaces, then
    # everything except Latin/Cyrillic letters, digits, em dashes and spaces is
    # dropped and the result is capped at 130 characters.
    vid_title = re.sub(r'[^A-Za-zЁёА-я0-9— ]+', '', entry.get('title', '').replace(u'\xa0', u' '))[:130]

    if not vid_title:
        print('Skipping an entry without a usable title')
        continue

    # Discard if already done (or already given up on).
    if vid_title in done_titles or vid_title in failsforsomereason:
        continue

    for attempt in range(1, max_attempts + 1):
        try:
            # Assemble the metadata first: a malformed entry then fails before
            # the expensive download/transcription and before any file exists.
            tags = [a['term'] for a in entry.get('tags', [])]
            header = ('---'
                      + '\ntitle: ' + vid_title
                      + '\nauthor: ' + entry.get('author', '')
                      + '\npublished: ' + entry.get('published', '')
                      + '\ntags: ' + str(tags)
                      + '\n---\n\n')

            # Download audio file.
            # NOTE(review): assumes the second link of an entry is the audio
            # enclosure — confirm against the feed.
            urllib.request.urlretrieve(entry['links'][1]['href'], "audio.mp3")
            print(vid_title)

            # Transcribe.
            transcription = model.transcribe("audio.mp3",  language="ru")

            # Paragraphise the string.
            prettytext = paragraphise(transcription['text'])

            # Finally, save along with the metadata.
            with open(os.path.join(dl_dir, vid_title + '.txt'), 'w', encoding='utf-8') as myfile:
                myfile.write(header)
                myfile.write(prettytext)

            done_titles.add(vid_title)
            print('Done')
            break
        except Exception as e:
            # Report the problem rather than retrying silently; the number of
            # attempts is bounded so a persistent error cannot loop forever.
            print(f'Attempt {attempt}/{max_attempts} failed for {vid_title!r}: {e!r}')
    else:
        # Every attempt failed: remember the episode and move on to the next.
        failsforsomereason.append(vid_title)

if failsforsomereason:
    print('Episodes that could not be processed:', failsforsomereason)

In [None]:
# Bundle every .txt transcript in knizhnyy-bazar/ into a single gzip-compressed
# archive (z = gzip, c = create, v = list files as they are added, f = archive name).
!tar -zcvf knizhnyy-bazar.tar.gz knizhnyy-bazar/*.txt

In [None]:
# Show a link to the archive in the cell output; as the last expression of the
# cell, the FileLink object is displayed automatically.
from IPython.display import FileLink
FileLink(r'knizhnyy-bazar.tar.gz')