# Предобработка

In [1]:
import os
import random
import csv
import string
import numpy as np

import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import torch
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import seaborn as sns

import math
from scipy.signal import argrelextrema

from openai import OpenAI

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Credentials must not live in the notebook: read the key from the environment
# (e.g. `export OPENAI_API_KEY=...` before starting Jupyter).
# The key that used to be hard-coded on this line must be treated as leaked and revoked.
API_KEY = os.environ.get("OPENAI_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")
MODEL = "text-embedding-ada-002"

transcript_files = [
    "2024 Rolls-Royce Spectre Review.csv",
    "Apple Vision Pro Impressions.csv",
    "George Hotz.csv",
    "The END of Sam Bankman Fried.csv",
    "Why is LinkedIn so weird.csv"
]

folder_path = "data/transcripts"

file_path = os.path.join(folder_path, transcript_files[4])
# The CSV's 'length' column is renamed to 'time'; a later cell adds a token-count
# column that reuses the name 'length'.
raw_df = pd.read_csv(file_path).rename(columns={'length': 'time'})

print(raw_df['sentence'])

0       In some ways, the point of LinkedIn is obvious.
1     It's not like Instagram, where you're supposed...
2     It's not like Twitter, where you're supposed t...
3     And it's not like Facebook, where you're suppo...
4        LinkedIn, however, is where you go to network.
                            ...                        
81    You know, Dan, I have to say I've been so incr...
82                   I think you do have what it takes.
83                                    Julie, thank you.
84                           Thank you for saying that.
85    I'll pay you the $15 I promised you for saying...
Name: sentence, Length: 86, dtype: object


In [None]:
def clean_tokenize(text):
    """Lowercase ``text``, delete every ``string.punctuation`` character and split on whitespace."""
    punctuation_remover = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(punctuation_remover).split()

# Tokenise every raw sentence and store its word count next to it.
raw_df['tokens'] = raw_df['sentence'].map(clean_tokenize)
raw_df['length'] = raw_df['tokens'].map(len)
print(raw_df.head(10))

In [None]:
# todo https://github.com/openai/openai-python/discussions/742 os.environ['OPENAI_API_KEY']
client = OpenAI(api_key=API_KEY)


def get_embeddings(text):
    """Request an embedding of ``text`` from the OpenAI embeddings endpoint using ``MODEL``.

    Returns the ``embedding`` attribute of the first item in the response data.
    """
    first_item = client.embeddings.create(model=MODEL, input=text).data[0]
    return first_item.embedding


# One API request is issued per sentence.
raw_df['embedding'] = raw_df['sentence'].map(get_embeddings)
print(raw_df['embedding'])

In [None]:
def cosine_distance(embeddings):
    """Return the cosine *similarity* of every embedding with its predecessor.

    Despite the function name, the values are similarities (1.0 means the
    vectors point in the same direction), not distances.

    Args:
        embeddings: sequence of equal-length numeric vectors.

    Returns:
        A list as long as ``embeddings`` (``[None]`` for an empty input). The
        first entry is ``None`` because the first vector has no predecessor;
        entry ``i`` is the similarity between vectors ``i - 1`` and ``i``.
    """
    cos_distances = [None]
    if len(embeddings) < 2:
        return cos_distances

    vectors = np.asarray(embeddings, dtype=float)
    norms = np.linalg.norm(vectors, axis=1)
    # A zero vector has no direction; dividing by 1 keeps its similarity at 0
    # instead of producing NaN.
    norms[norms == 0] = 1.0
    unit_vectors = vectors / norms[:, None]

    # Row-wise dot product of each unit vector with the next one.
    consecutive = (unit_vectors[:-1] * unit_vectors[1:]).sum(axis=1)
    cos_distances.extend(consecutive.tolist())
    return cos_distances

# NOTE: the 'cos_dist' column holds cosine *similarity* to the previous sentence
# (that is what cosine_distance() returns); the plot is labelled accordingly.
raw_df['cos_dist'] = cosine_distance(raw_df['embedding'].tolist())

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(raw_df['cos_dist'], marker='o', linestyle='-')
ax.set_xlabel('Sentence Index')
ax.set_ylabel('Cosine Similarity')
ax.set_title('Cosine Similarity Between Consecutive Sentences')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()

In [None]:
long = 35  # not referenced anywhere else in the visible notebook
short = 5  # sentences with at most this many tokens are candidates for merging into the previous sentence

In [None]:
# Rows whose similarity to the previous sentence is above the 80th percentile.
quantile = raw_df['cos_dist'].quantile(0.8)
close_indices = raw_df.index[raw_df['cos_dist'] > quantile].tolist()
close_lookup = set(close_indices)  # constant-time membership test inside the loop

sentences = [raw_df.loc[0, 'sentence']]
times = [raw_df.loc[0, 'time']]

for i in range(1, len(raw_df)):
    current = raw_df.loc[i, 'sentence']
    current_t = raw_df.loc[i, 'time']
    length = raw_df.loc[i, 'length']

    previous = sentences[-1]
    previous_t = times[-1]

    if i in close_lookup and length <= short:
        # Short and very similar to its predecessor: glue it onto the previous sentence.
        sentences[-1] = previous + " " + current
        times[-1] = previous_t + current_t
    elif previous.endswith('...') and current.startswith('...'):
        # A sentence split by an ellipsis: drop both "..." markers and join the halves.
        sentences[-1] = previous[:-3] + " " + current[3:]
        times[-1] = previous_t + current_t
    else:
        # Otherwise keep it as a sentence of its own.
        sentences.append(current)
        times.append(current_t)

df = pd.DataFrame({'sentence': sentences, 'time': times})

# tail(10) includes the final row (the previous df[-10:-1] slice silently dropped it).
print(df.tail(10))
print("\n")
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
df.info()

In [None]:
df['tokens'] = df['sentence'].apply(clean_tokenize)
token_counts = df['tokens'].apply(len)
# Tempo = number of tokens per unit of the 'time' column.
df['tempo'] = token_counts / df['time']
df['length'] = token_counts
# Literal match: avoids the invalid '\?' escape sequence of the regex form.
df['question'] = df['sentence'].str.contains('?', regex=False)
df['embedding'] = df['sentence'].apply(get_embeddings)

df.info()
df.head()

In [None]:
# Ad-hoc look at a single raw sentence (row label 49).
print(raw_df.loc[49, 'sentence'])

In [None]:
# Add start and end timestamps for every sentence.
# A sentence ends at the running total of 'time' and starts where the previous
# one ended (0 for the first). Computing this with cumsum/shift also works for
# an empty frame and for any index, unlike the positional loop it replaces.
end_times = df['time'].cumsum()
start_times = end_times.shift(1, fill_value=0)

df['start_time'] = start_times
df['end_time'] = end_times

print(df.head())

In [None]:
# Ad-hoc inspection of the row labelled 74 in the merged frame.
print(df.loc[74])

# Sentiment analysis

In [None]:
class SentimentAnalyzer:
    """Scores text with the ``cardiffnlp/twitter-roberta-base-sentiment`` checkpoint."""

    def __init__(self):
        """Load the tokenizer and the sequence-classification model for ``model_name``."""
        self.model_name = "cardiffnlp/twitter-roberta-base-sentiment"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)

    def predict_sentiment(self, text):
        """Return softmax probabilities for ``text`` keyed 'negative', 'neutral', 'positive'.

        The label order is hard-coded here; it presumably mirrors the output
        order of the checkpoint (verify against the model card).
        """
        encoded = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            output = self.model(**encoded)
        probabilities = softmax(output.logits, dim=1)[0]
        label_order = ['negative', 'neutral', 'positive']
        return dict(zip(label_order, (probability.item() for probability in probabilities)))

    def apply_to_dataframe(self, df, text_column):
        """Score every value of ``df[text_column]`` and add three columns to ``df`` in place.

        Adds 'emotion_score' (1 - neutral probability), 'positive_score' and
        'negative_score'. Returns None.
        """
        per_text_scores = [self.predict_sentiment(text) for text in df[text_column]]

        df['emotion_score'] = [1 - scores['neutral'] for scores in per_text_scores]
        df['positive_score'] = [scores['positive'] for scores in per_text_scores]
        df['negative_score'] = [scores['negative'] for scores in per_text_scores]


# Instantiating the analyzer loads the tokenizer and model in __init__.
analyzer = SentimentAnalyzer()
# Adds emotion_score / positive_score / negative_score columns to df in place.
analyzer.apply_to_dataframe(df, 'sentence')
print(df.head())

In [None]:
# emotion_score = 1 - neutral probability, as written by SentimentAnalyzer.apply_to_dataframe.
df['emotion_score']

In [None]:
# Mean tempo (tokens per unit of 'time') over all rows of df.
mean_tempo = df['tempo'].mean()

In [None]:
def plot_sentiment_and_tempo(df):
    """Plot emotion score and tempo per sentence and mark the question sentences.

    Args:
        df: frame with 'emotion_score', 'tempo' and boolean 'question' columns.

    The mean-tempo line is computed from ``df`` itself, so the function does
    not depend on a module-level variable. Question rows are drawn as dashed
    vertical lines and share a single legend entry.
    """
    fig, ax = plt.subplots(figsize=(14, 8))

    ax.plot(df.index, df['emotion_score'], color='red', label='Roberta')
    ax.plot(df.index, df['tempo'], color='green', label='Tempo')
    ax.axhline(y=df['tempo'].mean(), color='green', linestyle='-', label='Mean tempo')

    question_positions = df.index[df['question'].astype(bool)]
    for position, x in enumerate(question_positions):
        # Label only the first line so the legend shows one 'Question Mark' entry.
        ax.axvline(x=x, color='blue', linestyle='--',
                   label='Question Mark' if position == 0 else None)

    ax.set_xlabel('Sentence Number')
    ax.set_ylabel('Values')
    ax.set_title('Emotion Score and Tempo across Sentences')
    # Built after all artists exist, so the question lines are included.
    ax.legend()

    ax.grid(axis='x', linestyle='--')
    ax.set_xticks(df.index[::2])
    fig.tight_layout()

    plt.show()


plot_sentiment_and_tempo(df)

In [None]:
# The eight rows with the largest emotion score, listed in transcript order.
strongest = df['emotion_score'].abs().nlargest(8)
top_non_neutral_indices = sorted(strongest.index.tolist())
print("Top 8 RoBERTa Non-Neutral Sentences:", top_non_neutral_indices)

print("\n")
for index, sentence in df.loc[top_non_neutral_indices, 'sentence'].items():
    print(f"{index}: {sentence}")

In [None]:
# Median emotion score is the cut-off. TODO: reconsider this number.
emotional_threshold = df['emotion_score'].quantile(0.5)

# todo quartile: decide whether this metric or another one should be used.
is_emotional = df['emotion_score'] > emotional_threshold
questions = df[df['question'] & is_emotional]

for index, sentence in questions['sentence'].items():
    print(f"{index}: {sentence}")

In [None]:
# Sentences that are both above the emotional threshold and in the fastest quarter by tempo.
fastest = df['tempo'].quantile(0.75)

fastest_emotional = df[(df['emotion_score'] > emotional_threshold) & (df['tempo'] > fastest)]

# Print the selection computed above (this loop used to iterate over `questions` by mistake).
for index, row in fastest_emotional.iterrows():
    print(f"{index}: {row['sentence']}")

In [None]:
top_5_slowest = df['tempo'].nsmallest(5).index.tolist()

# todo: a pause emphasises emotion quite well, something could be picked from here
# Print each slow sentence with up to two preceding sentences of context.
# The start is clamped at 0 so the first rows no longer raise KeyError
# (assumes the default 0..n-1 integer index).
for i in top_5_slowest:
    for j in range(max(i - 2, 0), i + 1):
        print(f"{j}: {df.loc[j, 'sentence']}")
    print()

In [None]:
# Preview of the first rows of the feature frame.
df.head()

In [None]:
def find_closest_statements(df, top_n=3):
    """Map every question sentence to its most similar non-question sentences.

    Args:
        df: frame with 'question' (bool), 'sentence' and 'embedding' columns.
        top_n: number of statements to return per question (default 3, the
            value that used to be hard-coded).

    Returns:
        dict mapping the question text to an array of up to ``top_n``
        statement sentences, most similar first. Empty when ``df`` has no
        questions or no statements. Questions with identical text share one
        key, so the last occurrence wins.

    Raises:
        ValueError: if ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    questions_df = df[df['question'] == True]
    statements_df = df[df['question'] == False]

    closest_statements = {}
    # cosine_similarity cannot be called with an empty side.
    if questions_df.empty or statements_df.empty:
        return closest_statements

    question_embeddings = list(questions_df['embedding'])
    statement_embeddings = list(statements_df['embedding'])

    # One call yields a (n_questions, n_statements) similarity matrix.
    similarity_matrix = cosine_similarity(question_embeddings, statement_embeddings)

    for question_text, similarities in zip(questions_df['sentence'], similarity_matrix):
        # argsort is ascending: take the last top_n positions and reverse them.
        top_positions = similarities.argsort()[-top_n:][::-1]
        closest_statements[question_text] = statements_df.iloc[top_positions]['sentence'].values

    return closest_statements


closest_statements = find_closest_statements(df)
for question, statements in closest_statements.items():
    text = ' '.join(statements)
    print(f"Question: {question}\n{text}\n")

In [None]:
# Absolute cut-off on emotion_score (a different variable from the quantile-based emotional_threshold).
emotion_threshold=0.5
emotional_indexes = df[df['emotion_score'] > emotion_threshold].index
emotional_indexes.tolist()

In [None]:
def find_intros(df, similarity_threshold=0.765, request=None):
    """Find sentences that look like a self-introduction.

    Every sentence embedding is compared with the embedding of a reference
    text made of example introductions; sentences scoring above
    ``similarity_threshold`` are returned.

    Args:
        df: frame with 'sentence' and 'embedding' columns.
        similarity_threshold: minimum cosine similarity to keep a sentence.
            todo ad-hoc threshold, very sorry
        request: reference text to compare against; defaults to the built-in
            set of example introductions.

    Returns:
        List of single-entry dicts ``{row_label: sentence}``, most similar
        first. The scored ``(label, sentence, similarity)`` tuples are also
        printed.
    """
    if request is None:
        request = "My name is Ankit Singla and I'm a full-time blogger. I blog about blogging. I'm Karen, an entrepreneur and VC consultant. Paul Erdős was a Hungarian mathematician. He was one of the most prolific mathematicians and producers of mathematical conjectures of the 20th century. This is Maria and she is a Data Engineer at Rask"
    request_embedding = np.array(get_embeddings(request)).reshape(1, -1)

    if df.empty:
        sentence_similarities = []
    else:
        # One call compares every sentence with the request at once.
        sentence_matrix = np.array(df['embedding'].tolist())
        similarities = cosine_similarity(sentence_matrix, request_embedding)[:, 0]
        sentence_similarities = [
            (index, sentence, similarity)
            for index, sentence, similarity in zip(df.index, df['sentence'], similarities)
            if similarity > similarity_threshold
        ]

    sorted_sentences = sorted(sentence_similarities, key=lambda x: x[2], reverse=True)
    print(sorted_sentences)
    return [{i: sentence} for i, sentence, _ in sorted_sentences]


intros = find_intros(df)
print(intros)

# Кластеризация

todo: Кластеризация не дает результатов 

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

embeddings_array = np.array(list(df['embedding']))

# silhouette_score needs between 2 and n_samples - 1 clusters, so cap the search
# for short transcripts instead of crashing inside the loop.
max_clusters = min(10, len(embeddings_array) - 1)
if max_clusters < 2:
    raise ValueError("Need at least 3 sentences to choose a number of clusters by silhouette score.")
candidate_counts = range(2, max_clusters + 1)

# Getting the optimal number of clusters using silhouette score
silhouette_scores = []
for n_clusters in candidate_counts:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings_array)
    silhouette_scores.append(silhouette_score(embeddings_array, labels))

# argmax returns the first best score, i.e. the smallest cluster count on ties.
optimal_clusters = candidate_counts[int(np.argmax(silhouette_scores))]

kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(embeddings_array)

# Print the sentences of each cluster in transcript order, clusters separated by blank lines.
for cluster in sorted(df['cluster'].unique()):
    cluster_sentences = df[df['cluster'] == cluster].sort_index()['sentence']
    for sentence in cluster_sentences:
        print(f"{sentence}")
    print("\n")

# Разбиение на абзацы

In [None]:
# cosine_sim_matrix holds the cosine similarity between every pair of sentence embeddings.
embeddings_matrix = np.array(df['embedding'].tolist())
cosine_sim_matrix = cosine_similarity(embeddings_matrix)

heatmap_ax = sns.heatmap(cosine_sim_matrix)
heatmap_ax.set_title('Cosine similarities matrix');

In [None]:
# Поиск точек разбиения 

def rev_sigmoid(x: float) -> float:
    """Reversed sigmoid ``1 / (1 + e^(x / 2))``: close to 1 for very negative ``x``, close to 0 for large ``x``."""
    return 1 / (1 + math.exp(0.5 * x))


def activate_similarities(similarities: np.ndarray, p_size=10) -> np.ndarray:
    """Weight each sentence's similarity to the sentences that follow it.

    For row ``j`` the result is ``sum_k w[k - 1] * similarities[j, j + k]``
    over the offsets ``k >= 1`` that stay inside the matrix. ``w`` is
    ``rev_sigmoid`` sampled at ``p_size`` evenly spaced points of [-10, 10],
    so near neighbours weigh the most and offsets beyond ``p_size`` weigh 0.

    Args:
        similarities: square similarity matrix.
        p_size: number of following sentences that receive a non-zero weight.

    Returns:
        1-D array with one value per row of ``similarities``. Matrices smaller
        than ``p_size`` are supported (the weights are truncated), and a 0x0
        or 1x1 matrix yields zeros because there are no following sentences.
    """
    n = similarities.shape[0]
    if n < 2:
        return np.zeros(n)

    x = np.linspace(-10, 10, p_size)
    activation_weights = np.vectorize(rev_sigmoid, otypes=[float])(x)
    if p_size < n:
        # Offsets beyond p_size contribute nothing.
        activation_weights = np.pad(activation_weights, (0, n - p_size), 'constant')

    # Upper diagonals, each zero-padded on the right to length n so they can be stacked.
    diagonals = np.stack([
        np.pad(diagonal, (0, n - len(diagonal)), 'constant')
        for diagonal in (similarities.diagonal(offset) for offset in range(1, n))
    ])
    # Row k - 1 holds offset k; scale it by its weight and sum over the offsets.
    weighted = diagonals * activation_weights[:n - 1].reshape(-1, 1)
    return np.sum(weighted, axis=0)

# Weighted similarity of every sentence to the sentences that follow it.
activated_similarities = activate_similarities(cosine_sim_matrix, p_size=10)

fig, ax = plt.subplots()
# Relative minima of the curve, compared against 2 neighbours on each side (order=2);
# they are used as paragraph split points. The result is a tuple of index arrays
# (the next cell reads minimas[0]).
minimas = argrelextrema(activated_similarities, np.less, order=2)
sns.lineplot(y=activated_similarities, x=range(len(activated_similarities)), ax=ax).set_title('Relative minima')
# Dashed vertical lines at the split points, spanning the full value range of the curve.
plt.vlines(x=minimas, ymin=min(activated_similarities), ymax=max(activated_similarities), colors='purple', ls='--', lw=2, label='Split Points')
plt.legend()
plt.show()

In [None]:
# Rebuild the transcript as text, starting a new paragraph (blank line) at every split point.
split_points = list(minimas[0])
pieces = []
for num, each in enumerate(df['sentence']):
    paragraph_break = '\n\n' if num in split_points else ''
    pieces.append(f'{paragraph_break}{each} ')
text = ''.join(pieces)

print(text)

# Summarization 

In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.tokenizers import Tokenizer

def summarize_with_textrank(text, sentences_count=10):
    """Summarise ``text`` with sumy's TextRank summarizer.

    The text is parsed with the "english" tokenizer; the selected sentences
    are returned as one string, one sentence per line.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    summarizer = TextRankSummarizer()
    selected_sentences = summarizer(document, sentences_count=sentences_count)
    return "\n".join(map(str, selected_sentences))

In [None]:
# Three-sentence TextRank summary of the paragraph-split transcript built above.
summary = summarize_with_textrank(text, 3)  # todo: also tried LSA
print(summary)

In [None]:
from sumy.utils import get_stop_words
from collections import Counter


stop_words = set(get_stop_words('ENGLISH'))  # todo remove


# todo refactor
def get_text_theme_keywords(sentences, embeddings, num_clusters=1):
    """Return up to three most frequent non-stop-words for each KMeans cluster.

    ``embeddings`` are clustered into ``num_clusters`` groups; ``sentences[i]``
    is assigned to the cluster of ``embeddings[i]``. For every cluster the
    member sentences are joined, tokenised with ``clean_tokenize`` and filtered
    against the module-level ``stop_words`` before counting.

    Returns a list with one keyword list per cluster.
    """
    cluster_labels = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings).labels_

    # Collect the sentences that belong to each cluster.
    grouped_sentences = [[] for _ in range(num_clusters)]
    for position, sentence in enumerate(sentences):
        grouped_sentences[cluster_labels[position]].append(sentence)

    # For each cluster keep the most common words, stop words excluded.
    keywords_per_cluster = []
    for members in grouped_sentences:
        words = [word for word in clean_tokenize(' '.join(members)) if word not in stop_words]
        keywords_per_cluster.append([word for word, _ in Counter(words).most_common(3)])

    return keywords_per_cluster


text_theme_keywords = get_text_theme_keywords(df['sentence'].tolist(), df['embedding'].tolist())
print("Ключевые слова темы текста:", text_theme_keywords)

# Обогащение с YouTube 

In [None]:
from googleapiclient.discovery import build

# todo: reconfig

# Credentials must not live in the notebook: read the key from the environment.
# The key that used to be hard-coded on this line must be treated as leaked and revoked.
YOUTUBE_API_KEY = os.environ.get('YOUTUBE_API_KEY')
if not YOUTUBE_API_KEY:
    raise RuntimeError("Set the YOUTUBE_API_KEY environment variable before running this cell.")
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
video_id = 'IMfBS4mBfBQ'


def get_comments(video_id):
    """Fetch top-level comments of a video, most liked first.

    Only one request is made (``maxResults=100``), so at most the first page
    of comment threads is returned; further pages are not followed.

    Returns:
        List of dicts with the keys "id", "text" and "likes", sorted by
        "likes" in descending order. Empty when the response has no 'items'.
    """
    response = youtube.commentThreads().list(
        part='snippet',
        videoId=video_id,
        textFormat='plainText',
        maxResults=100,
    ).execute()

    comments = []
    # .get keeps this consistent with the other helpers, which tolerate a missing 'items' key.
    for item in response.get('items', []):
        top_level = item['snippet']['topLevelComment']
        comments.append({
            "id": top_level['id'],
            "text": top_level['snippet']['textDisplay'],
            "likes": top_level['snippet']['likeCount'],
        })

    return sorted(comments, key=lambda item: item["likes"], reverse=True)


def get_channel_id(video_id):
    """Return the channel id of the video's snippet, or None when the lookup returns no items."""
    response = youtube.videos().list(part='snippet', id=video_id).execute()

    items = response.get('items')
    if not items:
        return None
    return items[0]['snippet']['channelId']


def get_channel_description(channel_id):
    """Return the description from the channel's snippet, or None when the lookup returns no items."""
    response = youtube.channels().list(part='snippet', id=channel_id).execute()

    items = response.get('items')
    if not items:
        return None
    return items[0]['snippet']['description']
    
    
def get_channel_videos_descriptions(channel_id):
    """Return title and description of videos from one search request for a channel.

    A single search request is made (``maxResults=50``, ordered by date);
    results whose id kind is not "youtube#video" are skipped.

    Returns:
        List of dicts with the keys "title" and "description". Empty when the
        response has no 'items'.
    """
    response = youtube.search().list(
        part="snippet",
        channelId=channel_id,
        maxResults=50,
        order="date"
    ).execute()

    # .get keeps this consistent with the other helpers, which tolerate a missing 'items' key.
    return [
        {
            "title": item['snippet']['title'],
            "description": item['snippet']['description'],
        }
        for item in response.get('items', [])
        if item['id']['kind'] == "youtube#video"
    ]

# Collect the metadata used for enrichment. get_channel_id returns None when the
# video lookup has no items; the channel calls below do not check for that.
channel_id = get_channel_id(video_id)
comments = get_comments(video_id)
description = get_channel_description(channel_id)
video_descriptions = get_channel_videos_descriptions(channel_id)

In [None]:
# Naming the columns in the constructor keeps this working when `comments` is empty
# (assigning .columns to a column-less frame raises a length-mismatch error).
comments_df = (
    pd.DataFrame(comments, columns=['id', 'text', 'likes'])
    .rename(columns={'text': 'comment'})
)
comments_df.head()

In [None]:
# Naming the columns in the constructor keeps this working when `video_descriptions` is empty.
videos_df = pd.DataFrame(video_descriptions, columns=['title', 'description'])
videos_df.head()

In [None]:
from pytube import YouTube

path = "./"  # todo


def download_video(video_id, save_path=path):
    """Download the highest-resolution progressive mp4 stream of a YouTube video.

    The file is written to ``save_path`` as ``<video_id>.mp4``. A message is
    printed either way; nothing is returned.
    """
    video_url = f'https://www.youtube.com/watch?v={video_id}'
    progressive_mp4 = YouTube(video_url).streams.filter(progressive=True, file_extension='mp4')
    stream = progressive_mp4.order_by('resolution').desc().first()

    if not stream:
        print('No suitable stream found for downloading.')
        return

    stream.download(output_path=save_path, filename=video_id + '.mp4')
    print(f'Video {video_id} has been downloaded successfully.')


download_video(video_id=video_id)

In [None]:
def find_channel_intro(channel_id):
    """Return the URL of the first channel video whose title or description hints at an intro.

    One search request is made for the channel's videos (up to 50, ordered by
    date). A video matches when any keyword occurs as a substring of its
    lower-cased title or description. Prints the outcome and returns the URL,
    or None when nothing matches.
    """
    # Keywords looked for in titles and descriptions.
    keywords = ['intro', 'introduction', 'about', 'welcome', 'начало', 'приветствие']

    # Fetch the list of the channel's videos.
    response = youtube.search().list(
        part="snippet",
        channelId=channel_id,
        maxResults=50,
        order="date",
        type="video"
    ).execute()

    # Look for a video that may contain an intro.
    for item in response.get('items', []):
        snippet = item['snippet']
        searched_texts = (snippet['title'].lower(), snippet['description'].lower())

        if any(keyword in searched for searched in searched_texts for keyword in keywords):
            video_id = item['id']['videoId']
            video_url = f'https://www.youtube.com/watch?v={video_id}'
            print(f'Found potential intro video: {video_url}')
            return video_url

    print('Intro video not found.')
    return None


find_channel_intro(channel_id=channel_id)

# Forming sequence 

Структура
- Заголовок
- Хук 
- Интро
- Кода 
- Вывод

Что это значит 
- Заголовок: все лучшие книги по бизнесу на самом деле про одно и то же 
- Хук: да, сейчас ты услышишь одну ключевую мысль, о которой говорится во всей топовой деловой литературе -- строчка, задача которой -- удержать внимание 
- Интро: Я Артем, здесь про деньги в инстаграм, подписывайся 
- Кода: итак, мысль. Бизнес -- это не логотип компании, офис или визитные карточки. И это даже не команда сотрудников, сильный продукт или подписчики в соцсетях. Бизнес -- это когда тебе платят. ВСЁ
- Вывод: если есть входящий поток денег -- у тебя бизнес, если нет, то пока еще нет

Подходы 
1. Хайлайтс-бейсд 
2. Заголовок-бейсд
3. Вопрос-бейсд 
4. Интро-бейсд 

In [None]:
def prompt_gpt(model="text-davinci-003", temperature=0.7, max_tokens=150, prompt=""):
    """Send ``prompt`` to the OpenAI completions endpoint and return the generated text.

    Uses the notebook's module-level ``client`` (an ``OpenAI`` instance). The
    previous body referenced an un-imported ``openai`` module and an undefined
    ``prompt_text`` variable, so it could not run.

    Args:
        model: completions model name.
            NOTE(review): "text-davinci-003" appears to have been retired by
            OpenAI; pass a currently available completions model.
        temperature: sampling temperature.
        max_tokens: maximum number of tokens to generate.
        prompt: text to complete (defaults to an empty string, as before).

    Returns:
        The text of the first choice, stripped of surrounding whitespace.
    """
    response = client.completions.create(
        model=model,
        prompt=prompt,
        temperature=temperature,
        max_tokens=max_tokens
    )

    generated_text = response.choices[0].text.strip()
    return generated_text

# Валидация 

todo: на порождающей модели

# Нарезка и склейка 

In [None]:
from moviepy.editor import VideoFileClip, concatenate_videoclips


def cut_sentences_from_video(path, df, sentence_numbers, output_path='output_video.mp4'):
    """Cut the given sentences out of a video and write them as one concatenated clip.

    Args:
        path: path of the source video file.
        df: frame with 'start_time' and 'end_time' columns, indexed by sentence number.
        sentence_numbers: index labels of the sentences to keep, in output order.
        output_path: where the result is written (defaults to the previously
            hard-coded 'output_video.mp4').

    Raises:
        ValueError: if ``sentence_numbers`` is empty.
        KeyError: if a sentence number is not in ``df``'s index.
    """
    if len(sentence_numbers) == 0:
        raise ValueError("sentence_numbers must contain at least one sentence index")

    video = VideoFileClip(path)
    try:
        clips = [
            video.subclip(df.loc[number, 'start_time'], df.loc[number, 'end_time'])
            for number in sentence_numbers
        ]
        final_clip = concatenate_videoclips(clips)
        final_clip.write_videofile(output_path, codec="libx264", fps=24)
    finally:
        # Release the reader held by the source clip even if cutting or encoding fails.
        video.close()


# Use the file written by download_video() (<path>/<video_id>.mp4) instead of a
# machine-specific absolute path.
cut_sentences_from_video(os.path.join(path, f'{video_id}.mp4'), df, [49, 50, 51, 54, 55])

In [6]:
# Hand-picked row labels of raw_df (the original, un-merged sentences).
selected = [6, 11, 36, 37, 53, 63, 64, 80]
# Join the chosen sentences into one text, in the listed order.
generated_text = ' '.join(raw_df.loc[selected, 'sentence'])
print(generated_text)

So why the hell is it so weird? At what age should your child start dropshipping? So why in the hell would anyone want to go viral in this sea of cringe? You know, it's so important today for C-level executives to have a presence on LinkedIn, right? And what kind of quality content does a LinkedIn non-influencer put out? So if this is the state of LinkedIn now, where is it headed? I mean, I think it's going to continue to grow in importance as, you know, we become a global economy, right? From this conversation, do you think I have what it takes to be a thought leader?
