In [1]:
# Environment setup. %pip (rather than !pip) installs into the environment of the running kernel.
# openai-whisper and feedparser are pinned to the versions shown in this notebook's recorded
# install output, so a fresh kernel reproduces the stack the transcripts were produced with.
%pip install --quiet -U pip
%pip install --quiet openai-whisper==20230124
%pip install --quiet feedparser==6.0.10
%pip install --quiet -U sentence-transformers
# ffmpeg is needed to decode the downloaded audio. apt-get is used for both steps because
# the `apt` front-end warns that its CLI is not stable for scripted use.
!apt-get update && apt-get install -y ffmpeg

[0mCollecting feedparser
  Downloading feedparser-6.0.10-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.1/81.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25ldone
[?25h  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6066 sha256=0a5a9714d2f66fee59e1b3c09368bd7aca20db1037dabc6d8ac4d135a0734320
  Stored in directory: /root/.cache/pip/wheels/73/ad/a4/0dff4a6ef231fc0dfa12ffbac2a36cebfdddfe059f50e019aa
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.10 sgmllib3k-1.0.0
Get:1 http://packages.cloud.google.com/apt gcsfuse-focal InRelease [5002 B]
Get:2 https://packages.cloud.google.com/apt cloud-sdk InRelease

In [4]:
# Remove any distribution literally named `whisper` (presumably an unrelated PyPI package that
# would shadow `import whisper` - confirm), then force the pinned openai-whisper build.
# -y answers pip's confirmation prompt, which cannot be typed into a notebook shell escape.
# If whisper was already imported in this kernel, restart the kernel after this cell.
%pip uninstall -y whisper
%pip install --force-reinstall openai-whisper==20230124

[0mCollecting openai-whisper==20230124
  Downloading openai-whisper-20230124.tar.gz (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting numpy (from openai-whisper==20230124)
  Downloading numpy-1.21.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torch (from openai-whisper==20230124)
  Downloading torch-1.13.1-cp37-cp37m-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tqdm (from openai-whisper==20230124)
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [6]:
import re
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.signal import argrelextrema
import math

import whisper

import feedparser
import urllib.request

### transcription model ###
# Name of the Whisper checkpoint handed to whisper.load_model() below.
model_size = "large-v2"

print('Loading transcription model...')
# The explicit device argument is commented out, so device selection is left to
# whatever whisper.load_model() does by default.
model = whisper.load_model(model_size) #, device="cuda")
# NOTE(review): the disabled alternatives below use `WhisperModel`, which is not imported
# in the visible cells (presumably the faster-whisper API - confirm before enabling).
# or run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")
print('Done.')

### paragraph model ###
# Sentence-embedding model; paragraphise() encodes each sentence with it.
print('Loading word embedding model...')
sentencemodel = SentenceTransformer('all-mpnet-base-v2')
print('Done.')

def rev_sigmoid(x:float)->float:
    """Mirrored logistic curve: 0.5 at ``x == 0``, tending to 1 for very
    negative ``x`` and to 0 for very positive ``x``."""
    growth = math.exp(0.5*x)
    return 1 / (1 + growth)
    
def activate_similarities(similarities:np.array, p_size=10)->np.array:
    """Collapse a sentence-similarity matrix into one weighted score per sentence.

    The score of sentence ``i`` is the weighted sum of its similarities to
    sentences ``i, i+1, ..., i+p_size-1`` (diagonals ``0 .. p_size-1`` of the
    matrix). The weights are ``rev_sigmoid`` evaluated on ``p_size`` evenly
    spaced points in ``[-10, 10]``, so nearer sentences weigh more.

    Args:
        similarities (numpy array): square matrix in which entry ``[i, j]`` is
            the cosine similarity between sentence ``i`` and sentence ``j``.
        p_size (int): number of following sentences (diagonals) that take part
            in each weighted sum.
    Returns:
        numpy array: one weighted sum per sentence, length ``similarities.shape[0]``.
        An empty matrix yields an empty array.
    """
    n_sentences = similarities.shape[0]
    if n_sentences == 0:
        # Nothing to score; np.stack below would raise on an empty list.
        return np.zeros(0)

    # Evenly spaced points that are turned into the activation weights.
    x = np.linspace(-10, 10, p_size)
    weights = np.vectorize(rev_sigmoid, otypes=[float])(x)

    # Only the first p_size diagonals carry a non-zero weight, and a matrix with
    # fewer than p_size sentences has fewer diagonals than weights. Truncating to
    # the smaller of the two covers both cases; the previous zero-padding raised
    # on a negative pad width when there were fewer sentences than p_size.
    n_used = min(len(weights), n_sentences)
    weights = weights[:n_used]

    ### 1. Take each weighted diagonal to the right of (and including) the main diagonal
    diagonals = [similarities.diagonal(offset) for offset in range(n_used)]
    ### 2. Diagonals get shorter as the offset grows; pad them with zeros at the end
    diagonals = [np.pad(each, (0, n_sentences - len(each))) for each in diagonals]
    ### 3. Stack those diagonals into a new matrix (one row per offset)
    diagonals = np.stack(diagonals)
    ### 4. Apply the activation weight of each offset to its row
    diagonals = diagonals * weights.reshape(-1, 1)
    ### 5. Sum over offsets to get the weighted sum for every sentence
    activated_similarities = np.sum(diagonals, axis=0)
    return activated_similarities

def paragraphise(bigstring):
    """Insert paragraph breaks into a transcript at points where the topic shifts.

    The text is split into sentences on ``'. '``, each sentence is embedded with
    ``sentencemodel``, and a break (``'\\n\\n '``) is placed before every sentence
    whose activated similarity score is a local minimum.

    Args:
        bigstring (str): the transcript as one string.
    Returns:
        str: the same text with paragraph breaks inserted. Apart from the
        inserted breaks the text is returned unchanged; earlier versions appended
        an extra ``'. '`` after the final sentence, producing ``'.. '`` at the end.
        Text with fewer sentences than the scoring window is returned as is.
    """
    window = 5  # number of following sentences weighed per score (p_size)
    sentences = bigstring.split('. ')

    # Too short to score meaningfully (and the scorer used to fail on it).
    if len(sentences) < window:
        return bigstring

    embeddings = sentencemodel.encode(sentences, show_progress_bar=False)
    similarities = cosine_similarity(embeddings)
    activated_similarities = activate_similarities(similarities, p_size=window)
    # order controls how frequent splits are: a minimum must be lower than its
    # two neighbours on each side. Changing it is not recommended.
    minimas = argrelextrema(activated_similarities, np.less, order=2)

    # Set gives O(1) membership tests in the loop below.
    split_points = set(minimas[0].tolist())

    pieces = []
    for num, each in enumerate(sentences):
        if num in split_points:
            pieces.append('\n\n ' + each)
        else:
            pieces.append(each)
    # Re-join with the separator used for splitting so nothing is added after
    # the last sentence.
    return '. '.join(pieces)

### other functions ### 
def zfill_alternative(x,l=2):
    """Left-pad ``x`` with ``'0'`` characters up to length ``l``; values that are
    already at least ``l`` long are returned untouched."""
    missing = l - len(x)
    if missing <= 0:
        return x
    return '0' * missing + x

Loading transcription model...
Done.
Loading word embedding model...
Done.


In [7]:
import feedparser
import urllib.request
# Exploratory cell: fetch the podcast feed once and look at which fields an entry exposes.
NewsFeed = feedparser.parse("https://tehnikarechi.studio/api/rss/podcasts/knizhnyy-bazar")
# First entry of the parsed feed; this line fails if the feed came back with no entries.
entry = NewsFeed.entries[0]

print(entry.keys())

dict_keys(['title', 'title_detail', 'id', 'guidislink', 'links', 'link', 'summary', 'summary_detail', 'content', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'image', 'itunes_duration', 'subtitle', 'subtitle_detail', 'itunes_episodetype', 'tags', 'itunes_explicit'])


In [7]:
# -p keeps this cell re-runnable: no error when the directory already exists.
!mkdir -p knizhnyy-bazar

In [8]:
# Download every episode of the feed, transcribe it and store the transcript as
# <dl_dir>/<title>.txt with a small front-matter header. Episodes whose transcript
# already exists are skipped, so the cell can be interrupted and re-run.
NewsFeed = feedparser.parse("https://tehnikarechi.studio/api/rss/podcasts/knizhnyy-bazar")

dl_dir = 'knizhnyy-bazar'
max_attempts = 3  # give up on an episode after this many failed tries

# Titles of episodes that still failed after max_attempts tries; inspect after the run.
failsforsomereason = []

# Listing uses the same relative path the transcripts are written to (instead of a
# hardcoded absolute working directory), and the directory is created if missing.
os.makedirs(dl_dir, exist_ok=True)
arr = os.listdir(dl_dir)
done_titles = {os.path.splitext(name)[0] for name in arr}

# Downloads are made with a browser-like User-agent; install the opener once.
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

for entry in tqdm(NewsFeed.entries):
    # File-system friendly title: keep Latin/Cyrillic letters, digits, em dash and spaces.
    vid_title = re.sub(r'[^A-Za-zЁёА-я0-9— ]+', '', entry['title'].replace(u'\xa0', u' '))[:130]

    if vid_title in done_titles or vid_title in failsforsomereason:
        continue

    for attempt in range(1, max_attempts + 1):
        try:
            # Prefer the link explicitly marked as the enclosure; fall back to the
            # second link, which is where this feed was observed to put the audio.
            enclosures = [link['href'] for link in entry.get('links', [])
                          if link.get('rel') == 'enclosure' and 'href' in link]
            audio_url = enclosures[0] if enclosures else entry['links'][1]['href']

            urllib.request.urlretrieve(audio_url, "audio.mp3")
            print(vid_title)

            transcription = model.transcribe("audio.mp3",  language="ru")

            prettytext = paragraphise(transcription['text'])

            # Optional metadata: a missing field must not fail the whole episode.
            tags = [a['term'] for a in entry.get('tags', [])]

            with open(os.path.join(dl_dir, vid_title+'.txt'),'w',encoding='utf-8') as myfile:
                myfile.write('---')
                myfile.write('\ntitle: '+vid_title)
                myfile.write('\nauthor: '+entry.get('author', ''))
                myfile.write('\npublished: '+entry.get('published', ''))
                myfile.write('\ntags: '+str(tags))
                myfile.write('\n---\n\n')
                myfile.write(prettytext)

            print('Done')
            done_titles.add(vid_title)
            break
        except Exception as e:
            # Report instead of silently retrying forever; KeyboardInterrupt is not
            # an Exception subclass and still stops the run.
            print(f'Attempt {attempt}/{max_attempts} failed for "{vid_title}": {e!r}')
    else:
        # Every attempt failed: remember the episode and move on to the next one.
        failsforsomereason.append(vid_title)

  0%|          | 0/106 [00:00<?, ?it/s]

Глава в которой Галя и Настя отвечают на ваши вопросы и спойлерят следующий сезон


 54%|█████▍    | 57/106 [2:40:50<2:18:15, 169.30s/it]


KeyboardInterrupt: 

In [12]:
# Pack every transcript into one gzip-compressed archive, listing each file as it is added.
!tar --create --gzip --verbose --file=knizhnyy-bazar.tar.gz knizhnyy-bazar/*.txt

knigtok/5 февраля начнется новый сезон Книжного базара — о книгах на которых мы все выросли Что перечитать Мы составили список.txt
knigtok/Глава в которой Аэлиту съедает Чужой Пелевин обнуляет космос но всех спасает Илон Маск.txt
knigtok/Глава в которой Бондарчук обыгрывает Толстого на Бородинском поле а Кубрик снимает лучший костюмный фильм в истории.txt
knigtok/Глава в которой Властелин колец становится экоманифестом а Толкин оказывается Гретой Тунберг 1950х.txt
knigtok/Глава в которой Галина Юзефович и Антон Долин возвращаются с сезоном об экранизациях — теперь с видеоверсией И начинают с Шерлока .txt
knigtok/Глава в которой Галина Юзефович советует читать на каникулах детективы а Антон Долин отговаривает смотреть Реальную любовь.txt
knigtok/Глава в которой Гамлет Фотоувеличение и даже Царь Эдип больше похожи на детектив чем сериал Шерлок.txt
knigtok/Глава в которой Гарри Поттер становится Христом и навсегда входит в наш культурный код хотите вы этого или нет.txt
knigtok/Глава в кот

In [13]:
# FileLink is the cell's last expression, so its rich representation (a link to the
# archive written by the tar cell) becomes the cell output.
from IPython.display import FileLink
FileLink(r'knizhnyy-bazar.tar.gz')