# Experiment report 
This is a playground where I experimented and tested ideas.

## Loading and preprocessing

In [None]:
# Choose file for the playground
# `file` is joined with the transcripts folder when the CSV is loaded in the next cell;
# change the index below to analyse a different transcript.

transcript_files = [
    "2024 Rolls-Royce Spectre Review.csv",
    "Apple Vision Pro Impressions.csv",
    "George Hotz.csv",
    "The END of Sam Bankman Fried.csv",
    "Why is LinkedIn so weird.csv"
]

file = transcript_files[4]

In [None]:
import os
import pandas as pd

# Folder that holds the transcript CSV files
folder_path = "data/transcripts"

file_path = os.path.join(folder_path, file)
# The CSV's 'length' column is exposed as 'time'; a token-count 'length' column is added later.
raw_df = pd.read_csv(file_path).rename(columns={'length': 'time'})

print(raw_df['sentence'])

In [None]:
import string

def clean_tokenize(text):
    """Lowercase ``text``, drop ASCII punctuation and split it on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of word strings (empty when nothing but punctuation/whitespace remains).
    """
    # Translation table that deletes every character in string.punctuation
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_remover).split()

# Tokenize every sentence, then store the token count as 'length'
raw_df['tokens'] = raw_df['sentence'].map(clean_tokenize)
raw_df['length'] = raw_df['tokens'].map(len)
print(raw_df.head(10))

In [None]:
from openai import OpenAI


import os

# Prefer a key from the OPENAI_API_KEY environment variable so a real key never has to be
# written into the notebook; the original placeholder remains the fallback value.
API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_KEY")
# Embedding model used by get_embeddings()
MODEL = "text-embedding-ada-002"

client = OpenAI(
  api_key=API_KEY,
)

def get_embeddings(text):
    """Request an embedding for ``text`` from the module-level ``client`` using ``MODEL``.

    Args:
        text: Input passed unchanged as the ``input`` of the embeddings request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    result = client.embeddings.create(model=MODEL, input=text)
    first_item = result.data[0]
    return first_item.embedding

# get_embeddings is applied sentence by sentence, so one embeddings request is made per row.
raw_df['embedding'] = raw_df['sentence'].apply(get_embeddings)
print(raw_df['embedding'])

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

def cosine_distance(embeddings):
    """Compare each embedding with the one directly before it.

    Despite the name, the values come straight from ``cosine_similarity``, so higher
    means the two consecutive embeddings are more alike.

    Args:
        embeddings: Sequence of embedding vectors in reading order.

    Returns:
        A list as long as ``embeddings`` (``[None]`` for empty input): ``None`` first,
        because the first element has no predecessor, then one value per consecutive pair.
    """
    pairwise = (
        cosine_similarity([previous], [current])[0][0]
        for previous, current in zip(embeddings, embeddings[1:])
    )
    return [None, *pairwise]

raw_df['cos_dist'] = cosine_distance(raw_df['embedding'].tolist())

# cosine_distance() returns cosine_similarity values, so the axis label and title say
# "similarity". The column name stays 'cos_dist' because later cells read it.
plt.figure(figsize=(10, 6))
plt.plot(raw_df['cos_dist'], marker='o', linestyle='-')
plt.xlabel('Sentence Index')
plt.ylabel('Cosine Similarity')
plt.title('Cosine Similarity Between Consecutive Sentences')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()

Here, I want to look at the data. We read texts sequentially, and I was curious to inspect the similarities of embeddings to understand if there's a way to further clean the data. Or maybe discover some insights from it.

From here we see that identical sentences (like "Cringe. Cringe. Cringe.") will complicate processing — I'll try to merge them, but this will obfuscate finding timecodes in data, so I'll have to abandon the idea. The following section, where I restructure the dataset to eliminate similarities, did not make it into the application code.

## Reforming the data

I thought that sentences of 35+ tokens are rather long and could be split, while sentences of 5 or fewer tokens could be merged. Here I proceed without calculating any statistics on the text — I am just playing around with the data.

In [None]:
# Token-count thresholds for the restructuring experiment.
# `short` is compared with the per-sentence token count in the merging cell below;
# `long` is declared for splitting but no later cell in this notebook reads it.
long = 35
short = 5

In [None]:
# Merging sentences if they are close semantically and short

# Rows whose similarity to the previous sentence is above the 80th percentile count as "close".
quantile = raw_df['cos_dist'].quantile(0.8)
close_indices = raw_df.index[raw_df['cos_dist'] > quantile].tolist()
# Set copy for constant-time membership tests in the loop; the list is kept for inspection.
close_index_set = set(close_indices)
# print(close_indices)

# The first sentence always opens the merged list; later rows are appended or folded into
# the last entry. Row access by label assumes the default 0..n-1 index from read_csv.
sentences = [raw_df.loc[0, 'sentence']]
times = [raw_df.loc[0, 'time']]

for i in range(1, len(raw_df)):
    current = raw_df.loc[i, 'sentence']
    current_t = raw_df.loc[i, 'time']
    length = raw_df.loc[i, 'length']

    previous = sentences[-1]
    previous_t = times[-1]

    # if it's short and similar, concatenate it
    if i in close_index_set and length <= short:
        sentences[-1] = previous + " " + current
        times[-1] = previous_t + current_t

    # if the previous sentence trails off with '...' and this one resumes with '...',
    # join them and drop both ellipses
    elif previous.endswith('...') and current.startswith('...'):
        sentences[-1] = previous[:-3] + " " + current[3:]
        times[-1] = previous_t + current_t

    # leave it as is if it's fine
    else:
        sentences.append(current)
        times.append(current_t)

df = pd.DataFrame({'sentence': sentences, 'time': times})

# Note: this slice shows the nine rows before the last one (the final row is excluded).
print(df[-10:-1])
print("\n")
# info() prints its report itself and returns None, so it is not wrapped in print().
df.info()

In [None]:
# Recalculating embeddings and all the columns for the new dataframe

df['tokens'] = df['sentence'].apply(clean_tokenize)
df['length'] = df['tokens'].apply(len)
# Tokens per unit of 'time' (the duration column loaded from the CSV)
df['tempo'] = df['length'] / df['time']
# Literal match: '\?' in a normal string is an invalid escape sequence, and no regex is needed.
df['question'] = df['sentence'].str.contains('?', regex=False)
df['embedding'] = df['sentence'].apply(get_embeddings)

df.info()
# head must be called; the bare attribute only shows the bound method.
df.head()

In [None]:
# Spot-check: print the merged sentence stored under index label 16.
print(df['sentence'][16])

In [None]:
# Spot-check: print the merged sentence stored under index label 77.
print(df['sentence'][77])

In [None]:
# Figuring out start and finish time for every sentence in a new df

# A sentence ends at the running total of durations up to and including it,
# and starts where the previous one ended (0 for the first sentence).
end_times = df['time'].cumsum()
start_times = end_times.shift(1, fill_value=0)

df['start_time'] = start_times
df['end_time'] = end_times

print(df.head())

In [None]:
# Spot-check: print every column of the row labelled 74.
print(df.loc[74])

# Sentiment analysis
Figuring out if sentiment analysis can reveal some knowledge about the text. 

Before choosing Hugging Face, I tried TextBlob and VADER as local, lightweight options, but was not satisfied with the results. I didn't calculate any metrics because I knew I had limited time, so I simply disagreed with these models' outputs. In a production setting, I would try to get metrics on some labeled dataset similar to my data.

For now I settled with Roberta.

In [None]:
# from textblob import TextBlob
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax


class SentimentAnalyzer:
    """Scores text with a Hugging Face sequence-classification sentiment model.

    The three softmax outputs are mapped, in order, to the names in ``LABELS``.
    NOTE(review): this assumes the checkpoint emits logits in negative/neutral/positive
    order — confirm when passing a different ``model_name``.
    """

    # Names assigned positionally to the model's three output scores
    LABELS = ('negative', 'neutral', 'positive')

    def __init__(self, model_name="cardiffnlp/twitter-roberta-base-sentiment"):
        """Load the tokenizer and model for ``model_name``.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)

    def predict_sentiment(self, text):
        """Return ``{'negative': p, 'neutral': p, 'positive': p}`` for one text.

        Args:
            text: The string to classify; inputs are truncated to 512 tokens.

        Returns:
            A dict of Python floats obtained by softmax over the model logits.
        """
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: no gradients are needed
        with torch.no_grad():
            logits = self.model(**inputs).logits
        scores = softmax(logits, dim=1)[0]
        return {label: score.item() for label, score in zip(self.LABELS, scores)}

    def apply_to_dataframe(self, df, text_column):
        """Add sentiment columns to ``df`` in place (returns ``None``).

        Args:
            df: DataFrame to annotate.
            text_column: Name of the column holding the texts.

        Columns written: ``emotion_score`` (1 - neutral probability),
        ``positive_score`` and ``negative_score``.
        """
        all_scores = [self.predict_sentiment(text) for text in df[text_column]]

        df['emotion_score'] = [1 - scores['neutral'] for scores in all_scores]
        df['positive_score'] = [scores['positive'] for scores in all_scores]
        df['negative_score'] = [scores['negative'] for scores in all_scores]


# Constructing the analyzer loads the tokenizer and model via from_pretrained;
# apply_to_dataframe then adds the score columns to df in place.
analyzer = SentimentAnalyzer()
analyzer.apply_to_dataframe(df, 'sentence')
print(df.head())

In [None]:
# Display the non-neutral score (1 - neutral probability) of every sentence.
df['emotion_score']

## Exploring the data

In [None]:
# Average tempo over all merged sentences; drawn as the horizontal reference line in the plot below.
mean_tempo = df['tempo'].mean()

In [None]:
def plot_sentiment_and_tempo(df):
    """Plot emotion score and tempo per sentence and mark the questions.

    Args:
        df: DataFrame with 'emotion_score', 'tempo' and boolean 'question' columns.

    Draws the two series, a horizontal line at the mean tempo of ``df`` and a dashed
    vertical line at every row whose 'question' value is truthy, then shows the figure.
    """
    plt.figure(figsize=(14, 8))

    plt.plot(df.index, df['emotion_score'], color='red', label='Roberta')

    plt.plot(df.index, df['tempo'], color='green', label='Tempo')
    # Mean taken from the frame being plotted, so the function does not depend on a notebook global
    plt.axhline(y=df['tempo'].mean(), color='green', linestyle='-', label='Mean tempo')

    # Question markers are drawn before the legend is built so they appear in it; only the
    # first line carries the label, the rest use '_nolegend_' to avoid duplicate entries.
    question_label = 'Question Mark'
    for index, is_question in df['question'].items():
        if is_question:
            plt.axvline(x=index, color='blue', label=question_label, linestyle='--')
            question_label = '_nolegend_'

    plt.xlabel('Sentence Number')
    plt.ylabel('Values')
    # The title now names the series actually plotted
    plt.title('Emotion Score and Tempo across Sentences')
    plt.legend()

    plt.grid(axis='x', linestyle='--')
    plt.xticks(df.index[::2])
    plt.tight_layout()

    plt.show()
    
# Draw the overview figure for the merged-sentence dataframe.
plot_sentiment_and_tempo(df)

Blue vertical lines mark sentences that contain a question mark. I wanted to see whether the parameters correspond to each other — not rigorously, just visually. NB: 'emotion_score' is the probability that a sentence is non-neutral (1 - neutral_score). This way I don't care about the actual tone of the statement; I only see that it stands out.

### Non-neutral statements 

In [None]:
# Eight highest emotion scores, reported in reading order
strongest_scores = df['emotion_score'].abs().nlargest(8)
top_non_neutral_indices = sorted(strongest_scores.index.tolist())
print("Top 8 RoBERTa Non-Neutral Sentences:", top_non_neutral_indices)

print("\n")
for index, sentence in df.loc[top_non_neutral_indices, 'sentence'].items():
    print(f"{index}: {sentence}")

### Non-neutral questions

In [None]:
# Median emotion score: rows strictly above it form the upper half of the distribution
emotional_threshold = df['emotion_score'].quantile(0.5)

is_emotional = df['emotion_score'] > emotional_threshold
questions = df[df['question'] & is_emotional]

for index, sentence in questions['sentence'].items():
    print(f"{index}: {sentence}")

### Non-neutral tone & fast 

In [None]:
# Tempo above the 75th percentile counts as fast
fastest = df['tempo'].quantile(0.75)

fastest_emotional = df[(df['emotion_score'] > emotional_threshold) & (df['tempo'] > fastest)]

# Print the fast, emotional sentences selected in this cell
# (not the `questions` frame built in the previous cell).
for index, row in fastest_emotional.iterrows():
    print(f"{index}: {row['sentence']}")

Interestingly, all questions

In [None]:
top_5_slowest = df.sort_values(by='tempo', ascending=True).head(5).index.tolist()

# Thinking that pauses might emphasize previous statements: show each slow sentence
# together with up to two sentences before it. The window start is clamped at 0 so the
# first rows do not ask .loc for negative labels, which are not in the index.
for i in top_5_slowest:
    for j in range(max(i - 2, 0), i + 1):
        print(f"{j}: {df.loc[j, 'sentence']}")
    print()

In [None]:
# Show the first rows with all columns added so far.
df.head()

## Semantic connections

In [None]:
# Do questions have answers?
import numpy as np

def find_closest_statements(df, top_n=3):
    """For every question, find the statements whose embeddings are most similar to it.

    Args:
        df: DataFrame with 'sentence', 'embedding' and boolean 'question' columns.
        top_n: How many statements to keep per question (expected to be >= 1).

    Returns:
        A dict mapping each question's text to an array of statement texts, most similar
        first. Questions with identical text share one key, so the last one wins.
    """
    questions_df = df[df['question'] == True]
    statements_df = df[df['question'] == False]

    closest_statements = {}

    # Without any statements there is nothing to rank; cosine_similarity would fail on an
    # empty list, so every question is mapped to an empty array instead.
    if statements_df.empty:
        for question_text in questions_df['sentence']:
            closest_statements[question_text] = statements_df['sentence'].values
        return closest_statements

    statement_embeddings = list(statements_df['embedding'])

    for _, question_row in questions_df.iterrows():
        similarities = cosine_similarity([question_row['embedding']], statement_embeddings)[0]
        # argsort is ascending: take the last top_n positions and reverse them
        top_indices = similarities.argsort()[-top_n:][::-1]

        closest_sentences = statements_df.iloc[top_indices]['sentence'].values
        closest_statements[question_row['sentence']] = closest_sentences

    return closest_statements

# Print each question followed by its closest statements joined into one line.
closest_statements = find_closest_statements(df)
for question, statements in closest_statements.items():
    text = ' '.join(statements)
    print(f"Question: {question}\n{text}\n")

In [None]:
# Fixed cut-off (distinct from the quantile-based `emotional_threshold` used earlier)
emotion_threshold = 0.5
is_above_threshold = df['emotion_score'] > emotion_threshold
emotional_indexes = df.index[is_above_threshold]
print(emotional_indexes.tolist())

In [None]:
# Looking for intros in the text, as they might form a nice opening of the video

def find_intros(df):
    """Find sentences whose embedding resembles a set of sample self-introductions.

    Args:
        df: DataFrame with 'sentence' and 'embedding' columns.

    Returns:
        A list of single-entry dicts ``{index: sentence}``, ordered from most to least
        similar, for rows whose similarity exceeds the ad-hoc threshold. The matching
        ``(index, sentence, similarity)`` tuples are also printed.
    """
    similarity_threshold = 0.765  # added ad-hoc threshold, very sorry

    request = "My name is Ankit Singla and I'm a full-time blogger. I blog about blogging. I'm Karen, an entrepreneur and VC consultant. Paul Erdős was a Hungarian mathematician. He was one of the most prolific mathematicians and producers of mathematical conjectures of the 20th century. This is Maria and she is an ML Engineer at Rask"
    # Embed the reference text once and shape it as a single-row matrix
    request_embedding = np.array(get_embeddings(request)).reshape(1, -1)

    matches = []
    for index, row in df.iterrows():
        candidate = np.array(row['embedding']).reshape(1, -1)
        similarity = cosine_similarity(candidate, request_embedding)[0][0]
        if similarity > similarity_threshold:
            matches.append((index, row['sentence'], similarity))

    # Most similar first
    matches.sort(key=lambda match: match[2], reverse=True)
    print(matches)
    return [{index: sentence} for index, sentence, _ in matches]

# find_intros already prints the scored matches; this prints the returned {index: sentence} list.
intros = find_intros(df)
print(intros)

# Clustering

See if clustering works for this task. Looks inconclusive 

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

embeddings_array = np.array(list(df['embedding']))

# Score every candidate cluster count with the silhouette coefficient
candidate_counts = range(2, 11)
silhouette_scores = []
for n_clusters in candidate_counts:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings_array)
    silhouette_scores.append(silhouette_score(embeddings_array, labels))

# The first best score wins on ties
best_position = max(range(len(silhouette_scores)), key=silhouette_scores.__getitem__)
optimal_clusters = candidate_counts[best_position]

# Refit with the chosen count and store the label of every sentence
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(embeddings_array)

# Print the sentences of each cluster in reading order, clusters separated by blank lines
for cluster in sorted(df['cluster'].unique()):
    sentences = df.loc[df['cluster'] == cluster, 'sentence'].sort_index()
    print(*sentences, sep="\n")
    print("\n")

# Breaking into paragraphs 
Instead of thinking about breaking the text into unrelated and out-of-context thematic clusters, it's better to assume that in practice texts already come with a structure. Therefore, it should be divided into paragraphs, even if the division turns out to be imprecise.

The matrix looks different on the raw data; remember that these are not the original sentences.

In [None]:
import seaborn as sns

# Pairwise cosine similarity between all merged sentences
embeddings_matrix = np.array(df['embedding'].tolist())
cosine_sim_matrix = cosine_similarity(embeddings_matrix)

heatmap_ax = sns.heatmap(cosine_sim_matrix)
heatmap_ax.set_title('Cosine similarities matrix');

In [None]:
# Looking for split points

import math
from scipy.signal import argrelextrema

def rev_sigmoid(x: float) -> float:
    """Return the reversed logistic value ``1 / (1 + exp(0.5 * x))``.

    The result decreases as ``x`` grows and equals 0.5 at ``x == 0``.
    """
    return 1 / (1 + math.exp(0.5 * x))


def activate_similarities(similarities: np.ndarray, p_size=10) -> np.ndarray:
    """Weight each sentence's similarity to the sentences that follow it.

    For row ``j`` the result is the sum over offsets ``k >= 1`` of
    ``weight[k - 1] * similarities[j, j + k]``, where the weights are ``rev_sigmoid``
    sampled at ``p_size`` evenly spaced points in [-10, 10] and are zero beyond that.

    Args:
        similarities: Square pairwise similarity matrix.
        p_size: Number of following sentences that receive a non-zero weight.

    Returns:
        A 1-D array with one activated value per row of ``similarities``. Matrices with
        fewer than two rows yield zeros, since there is no following sentence to weigh.
    """
    n = similarities.shape[0]
    if n < 2:
        return np.zeros(n)

    weights = np.vectorize(rev_sigmoid)(np.linspace(-10, 10, p_size))
    if n >= p_size:
        activation_weights = np.pad(weights, (0, n - p_size), 'constant')
    else:
        # Fewer rows than weights: a negative pad width would raise, so truncate instead
        activation_weights = weights[:n]

    # Row k-1 holds the k-th upper diagonal, zero-padded on the right to length n
    diagonals = [similarities.diagonal(offset) for offset in range(1, n)]
    diagonals = np.stack([np.pad(each, (0, n - len(each)), 'constant') for each in diagonals])
    diagonals = diagonals * activation_weights[:diagonals.shape[0]].reshape(-1, 1)
    return np.sum(diagonals, axis=0)

# Weighted similarity of each sentence to the ones that follow it
activated_similarities = activate_similarities(cosine_sim_matrix, p_size=10)

fig, ax = plt.subplots()
# Local minima of the activated curve (np.less comparator, order=2);
# the result is indexed with [0] in the next cell to get the positions.
minimas = argrelextrema(activated_similarities, np.less, order=2)
sns.lineplot(y=activated_similarities, x=range(len(activated_similarities)), ax=ax).set_title('Relative minima')
# Dashed vertical lines spanning the curve's value range mark the candidate split points
plt.vlines(x=minimas, ymin=min(activated_similarities), ymax=max(activated_similarities), colors='purple', ls='--', lw=2, label='Split Points')
plt.legend()
plt.show()

In [None]:
# Positions of the local minima found above
split_points = list(minimas[0])
# Every sentence is followed by a space; a sentence at a split point starts a new paragraph
text = ''.join(
    f'\n\n{sentence} ' if position in split_points else f'{sentence} '
    for position, sentence in enumerate(df['sentence'])
)

print(text)

# Summarization
Trying to summarize the text to see if something could be built upon the summarization. I tried several approaches (like LSA), but settled on TextRank. Also, while searching, found a funny and compact library called sumy.

### Extractive summarization

In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.tokenizers import Tokenizer

def summarize_with_textrank(text, sentences_count=10):
    """Build an extractive summary of ``text`` with sumy's TextRank summarizer.

    Args:
        text: Plain text to summarize (parsed with the "english" tokenizer).
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined with newlines.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    ranked_sentences = TextRankSummarizer()(document, sentences_count=sentences_count)
    return "\n".join(map(str, ranked_sentences))

In [None]:
# `text` is the paragraph-split transcript assembled in the split-points section above.
summary = summarize_with_textrank(text, 3)  # top 3
print(summary)

### Clustering and extracting sub-themes 

In [None]:
from sumy.utils import get_stop_words
from collections import Counter


# Stored as a set; get_text_theme_keywords tests every token against it.
stop_words = set(get_stop_words('ENGLISH'))  # very nice stop words collection


def get_text_theme_keywords(sentences, embeddings, num_clusters=3):
    """Cluster sentences by embedding and return the most frequent words per cluster.

    Args:
        sentences: Sentence strings, aligned position by position with ``embeddings``.
        embeddings: Embedding vectors fitted with KMeans.
        num_clusters: Number of KMeans clusters.

    Returns:
        One list per cluster holding up to three of its most common tokens, counted
        after ``clean_tokenize`` and with ``stop_words`` removed.
    """
    cluster_labels = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings).labels_

    # Group the sentences by their assigned cluster label
    grouped_sentences = [[] for _ in range(num_clusters)]
    for position, sentence in enumerate(sentences):
        grouped_sentences[cluster_labels[position]].append(sentence)

    def top_words(group):
        # Count the non-stop-word tokens of the whole cluster text
        tokens = (token for token in clean_tokenize(' '.join(group)) if token not in stop_words)
        return [word for word, _ in Counter(tokens).most_common(3)]

    return [top_words(group) for group in grouped_sentences]

# Called with the default num_clusters=3, so one keyword list per cluster is printed.
text_theme_keywords = get_text_theme_keywords(df['sentence'].tolist(), df['embedding'].tolist())
print("Text keywords:", text_theme_keywords)

Couldn't derive subthemes, but it worked **amazingly** for the whole text (num_clusters=1).

## Enrichment from YouTube 
Let's see if I can extract something meaningful from the web

In [None]:
from googleapiclient.discovery import build

import os

# Prefer a key from the YOUTUBE_API_KEY environment variable so a real key never has to be
# written into the notebook; the original placeholder remains the fallback value.
YOUTUBE_API_KEY = os.environ.get('YOUTUBE_API_KEY', 'YOUR_KEY')
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
# Video whose comments, channel and file are used in the cells below
video_id = 'IMfBS4mBfBQ'


def get_comments(video_id):
    """Fetch one page (up to 100) of top-level comments for a video.

    Args:
        video_id: YouTube video id.

    Returns:
        A list of ``{"id", "text", "likes"}`` dicts sorted by likes, highest first.
    """
    response = youtube.commentThreads().list(
        part='snippet',
        videoId=video_id,
        textFormat='plainText',
        maxResults=100,
    ).execute()

    comments = []
    for item in response['items']:
        top_level = item['snippet']['topLevelComment']
        comments.append({
            "id": top_level['id'],
            "text": top_level['snippet']['textDisplay'],
            "likes": top_level['snippet']['likeCount']
        })

    comments.sort(key=lambda comment: comment["likes"], reverse=True)
    return comments


def get_channel_id(video_id):
    """Look up the channel that published a video.

    Args:
        video_id: YouTube video id.

    Returns:
        The ``channelId`` of the first returned item, or ``None`` when the response
        has no items.
    """
    response = youtube.videos().list(part='snippet', id=video_id).execute()

    items = response.get('items')
    if not items:
        return None
    return items[0]['snippet']['channelId']


def get_channel_description(channel_id):
    """Fetch the description text of a channel.

    Args:
        channel_id: YouTube channel id.

    Returns:
        The ``description`` of the first returned item, or ``None`` when the response
        has no items.
    """
    response = youtube.channels().list(part='snippet', id=channel_id).execute()

    items = response.get('items')
    if not items:
        return None
    return items[0]['snippet']['description']
    
    
def get_channel_videos_descriptions(channel_id):
    """Collect titles and descriptions from one page of a channel's search results.

    Args:
        channel_id: YouTube channel id.

    Returns:
        A list of ``{"title", "description"}`` dicts for the results whose id kind is
        ``youtube#video``, in the order returned (the request asks for date order).
    """
    response = youtube.search().list(
        part="snippet",
        channelId=channel_id,
        maxResults=50,
        order="date"
    ).execute()

    return [
        {
            "title": item['snippet']['title'],
            "description": item['snippet']['description']
        }
        for item in response['items']
        if item['id']['kind'] == "youtube#video"
    ]

# get_channel_id returns None when the video lookup has no items;
# channel_id is passed on to the channel queries without a check.
channel_id = get_channel_id(video_id)
comments = get_comments(video_id)
description = get_channel_description(channel_id)
video_descriptions = get_channel_videos_descriptions(channel_id)

In [None]:
# Columns are named at construction so the cell also works when no comments were fetched:
# assigning three names to the column-less frame built from an empty list raises an error.
comments_df = pd.DataFrame(comments, columns=['id', 'text', 'likes']).rename(columns={'text': 'comment'})
comments_df.head()

In [None]:
# Columns are named at construction so the cell also works when the search returned no videos:
# assigning two names to the column-less frame built from an empty list raises an error.
videos_df = pd.DataFrame(video_descriptions, columns=['title', 'description'])
videos_df.head()

In [None]:
from pytube import YouTube

# Default download folder (the current working directory)
path = "./"

def download_video(video_id, save_path=path):
    """Download the highest-resolution progressive MP4 stream of a video.

    Args:
        video_id: YouTube video id; the file is saved as ``<video_id>.mp4``.
        save_path: Folder passed to the stream's ``download`` call.

    Prints a success message, or a notice when no matching stream exists.
    """
    video_url = f'https://www.youtube.com/watch?v={video_id}'
    candidates = YouTube(video_url).streams.filter(progressive=True, file_extension='mp4')
    stream = candidates.order_by('resolution').desc().first()

    if not stream:
        print('No suitable stream found for downloading.')
        return

    stream.download(output_path=save_path, filename=video_id + '.mp4')
    print(f'Video {video_id} has been downloaded successfully.')


# Saves <video_id>.mp4 into the default save_path ("./").
download_video(video_id=video_id)

Here, I was already thinking about the structure of the video I aim to create. I searched the channel to see if there are any videos containing the _author's_ intro. It would make sense to include them in the final result. I found such videos, but they were intros of other people, not the blogger himself, and I couldn't automatically distinguish them, so I had to give up the idea.

In [None]:
def find_channel_intro(channel_id):
    """Return the URL of the first recent channel video that looks like an intro.

    Args:
        channel_id: YouTube channel id.

    Returns:
        The watch URL of the first result (date order, one page of up to 50 videos)
        whose title or description contains an intro keyword as a whole word, or
        ``None`` when no result matches. The outcome is also printed.
    """
    import re  # local import: only this function needs it

    # Keywords for search, matched as whole words. Plain substring tests made short
    # keywords such as 'hi' fire inside unrelated words ('this', 'which').
    # 'intro\w*' still covers 'intro', 'intros' and 'introduction'.
    keyword_pattern = re.compile(r'\b(?:intro\w*|about|welcome|hello|hi)\b')

    # Videos from the channel
    request = youtube.search().list(
        part="snippet",
        channelId=channel_id,
        maxResults=50,
        order="date",
        type="video"
    )
    response = request.execute()

    # Looking for videos with intros
    for item in response.get('items', []):
        title = item['snippet']['title'].lower()
        description = item['snippet']['description'].lower()

        if keyword_pattern.search(title) or keyword_pattern.search(description):
            video_id = item['id']['videoId']
            video_url = f'https://www.youtube.com/watch?v={video_id}'
            print(f'Found potential intro video: {video_url}')
            return video_url

    print('Intro video not found.')
    return None


# Search the channel fetched above for a likely intro video.
find_channel_intro(channel_id=channel_id)

I once took a course on how to make reels on Instagram. The video structure in that course was actually based on text (I never thought that experience would come in handy). The suggested structure was as follows:

- Title
- Hook
- Intro
- Core
- Conclusion

An example of such a video (you've definitely come across them online):
- Title: All the best business books are actually about the same thing
- Hook: Yes, you're about to hear one key idea that is mentioned in all the top business literature
- Intro: I'm Alex, this is a blog about money on Instagram, subscribe!
- Core: So, the idea. Business is not the company's logo, office, or business cards. And it's not even a team of employees, a strong product, or followers on social media. Business is when you get paid. THAT'S IT.
- Conclusion: If there's an incoming flow of money, you have a business. If not, then not yet.

I decided to try to assemble something similar.

Approaches:
1. Highlights-based: I can take some emotional moment from the video and build my extract around it.
2. Title-based: I can look for what the video is about and rely on a sentence with the video's theme.
3. Question-based: I can look for question-answer pairs.
4. Intro-based: I can look for intros on the channel. As I found out, these can be not only the author's intros, but why not.
5. I can get the most commented parts of the video (possible, but not implemented)

Let's try!

## Experimenting with GPT 

In [None]:
def prompt_gpt(model="text-davinci-003", temperature=0.7, max_tokens=150, prompt=""):
    """Send a completion request through the notebook's ``client`` and return the text.

    Args:
        model: Completion model name.
            NOTE(review): confirm the default model is still served by the API.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens to generate.
        prompt: Prompt text; defaults to an empty string.

    Returns:
        The text of the first choice with surrounding whitespace stripped.
    """
    # Uses the `client` created earlier in this notebook. The previous body referenced
    # an undefined `prompt_text` and an `openai` module name the notebook never imports.
    response = client.completions.create(
        model=model,
        prompt=prompt,
        temperature=temperature,
        max_tokens=max_tokens
    )

    generated_text = response.choices[0].text.strip()
    return generated_text

## Validation

Decided to validate the output on the same model

## Cutting and editing final video

In [None]:
from moviepy.editor import VideoFileClip, concatenate_videoclips


def cut_sentences_from_video(path, df, sentence_numbers):
    """Cut the given sentences out of a video and write them as one clip.

    Args:
        path: Path of the source video file.
        df: DataFrame with 'start_time' and 'end_time' columns, looked up by index label.
        sentence_numbers: Index labels of the sentences to keep, in output order.

    Writes 'output_video.mp4' (libx264, 24 fps) and returns ``None``.
    """
    video = VideoFileClip(path)
    try:
        clips = []
        for number in sentence_numbers:
            row = df.loc[df.index == number]
            start_time = row['start_time'].values[0]
            end_time = row['end_time'].values[0]
            clips.append(video.subclip(start_time, end_time))

        final_clip = concatenate_videoclips(clips)
        final_clip_path = 'output_video.mp4'
        final_clip.write_videofile(final_clip_path, codec="libx264", fps=24)
    finally:
        # Release the source file's reader even when cutting or encoding fails
        video.close()


# Use the file written by download_video (save folder `path`, name '<video_id>.mp4')
# instead of a machine-specific absolute path.
cut_sentences_from_video(os.path.join(path, video_id + '.mp4'), df, [49, 50, 51, 54, 55])

### Seeing final text 

In [None]:
# Index labels of the raw (unmerged) sentences chosen for the final text
selected = [6, 11, 36, 37, 53, 63, 64, 80]
selected_sentences = raw_df.loc[selected, 'sentence']
generated_text = ' '.join(selected_sentences)
print(generated_text)