In [1]:
# Install the required libraries
%pip install youtube-transcript-api
%pip install openai==0.28

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.2-py3-none-any.whl (24 kB)
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.2
Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0


In [6]:
import openai
from getpass import getpass
from youtube_transcript_api import YouTubeTranscriptApi


api_key = getpass("API: ")
openai.api_key = api_key

def get_video_id(url):
    # Extracts video ID from YouTube URL.
    from urllib.parse import urlparse, parse_qs
    query = urlparse(url).query
    video_id = parse_qs(query).get('v')
    return video_id[0] if video_id else None

def fetch_transcript(video_id):
    # Fetches the YouTube video transcript.
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = ' '.join([item['text'] for item in transcript_list])
        return transcript_text if transcript_text else None
    except Exception as e:
        raise Exception(f"Failed to fetch transcript: {e}")

def summarize_text(text):
    # Uses OpenAI's GPT model to summarize the text.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "system", "content": "Provide a comprehensive summary of this transcript."},
                  {"role": "user", "content": text}],
        max_tokens=600
    )
    return response['choices'][0]['message']['content']

if __name__ == "__main__":
    video_url = input("Youtube: ")
    video_id = get_video_id(video_url)
    if video_id:
        transcript = fetch_transcript(video_id)
        if transcript:
            print("Able to fetch.")
            summary = summarize_text(transcript)
            print("Summary:", summary)
        else:
            print("Failed to fetch.")
    else:
        print("Invalid YouTube URL")

API: ··········
Youtube: https://www.youtube.com/watch?v=jKyGrkEpMlM&t=3s
Able to fetch.
Summary: The speaker discusses the importance of Gestalt theory in art and design, emphasizing that understanding just Gestalt Theory allows one to suggest ideas without drawing them, which is key to achieving artistic mastery. They explain the elements of art and principles of design (such as line, space, color, shape, balance, contrast, movement, etc.) to provide foundational knowledge. Gestalt theory is presented as the missing context that complements the elements and principles of art, shaping how we perceive and make art. The speaker highlights key Gestalt principles like proximity, similarity, symmetry, continuation, closure, common fate, past experience, common region, and element connectedness, illustrating how these concepts impact the grouping and perception of visual elements. Examples and explanations are provided to help viewers understand how these Gestalt principles influence compos

In [7]:
!pip install google



In [11]:
import nltk
import requests
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup
from googlesearch import search
import numpy as np

# Download NLTK resources
nltk.download('punkt')  # Punkt Tokenizer Model for English.
nltk.download('stopwords')  # Common words that generally have little lexical content.

def get_search_results(word):
    # Performs a Google search for the given word and retrieves the top 25 results,
    # returning them as a list. Handles exceptions if the search does not yield results.
    try:
        search_results = search(word, num=25, stop=25, pause=2.0)
        return list(search_results)
    except StopIteration:
        return []

def extract_keywords(text):
    # Tokenizes the text into words and filters out common stopwords and non-alphanumeric characters,
    # then returns the five most common words.
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]
    fdist = FreqDist(filtered_words)
    return [word for word, _ in fdist.most_common(5)]

def classify_relevance(search_results, keywords):
    # Evaluates the relevance of search results based on the occurrence of keywords in the title.
    # Returns a list of binary values indicating relevance.
    relevance = []
    for result in search_results:
        title = result.split(' - ')[0].lower()
        is_relevant = any(keyword in title for keyword in keywords)
        relevance.append(1 if is_relevant else 0)
    return relevance

def train_classifier(X, y):
    # Trains a logistic regression classifier using the given data.
    # Returns both the trained classifier and the vectorizer used for feature extraction.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X)
    clf = LogisticRegression()
    clf.fit(X_train, y)
    return clf, vectorizer

def find_resources(text):
    # Extracts keywords from the text, searches for related web pages, and evaluates their relevance
    # based on the presence of keywords in the titles. Returns the most relevant results.
    keywords = extract_keywords(text)
    search_results = get_search_results(' '.join(keywords))
    relevance_labels = classify_relevance(search_results, keywords)
    X_train, _, y_train, _ = train_test_split(search_results, relevance_labels, test_size=0.2, random_state=42)
    clf, vectorizer = train_classifier(X_train, y_train)
    X_test = vectorizer.transform(search_results)
    predicted_labels = clf.predict(X_test)
    relevant_results = [result for result, label in zip(search_results, predicted_labels) if label == 1]
    return relevant_results

# Find relevant resources
relevant_resources = find_resources(summary)
print("Relevant resources:")
for resource in relevant_resources:
    print(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Relevant resources:
https://www.toptal.com/designers/ui/gestalt-principles-of-design
https://www.painting-course.com/the-painting-course-1/lesson-20-gestalt-principles-of-art-and-design
https://www.interaction-design.org/literature/topics/gestalt-principles
https://www.interaction-design.org/literature/topics/gestalt-principles#what_are_the_gestalt_principles?-0
https://www.interaction-design.org/literature/topics/gestalt-principles#gestalt_principles_%E2%80%93_a_background-1
https://www.interaction-design.org/literature/topics/gestalt-principles#gestalt_principles-2
https://webflow.com/blog/gestalt-principles-of-design
https://graybox.co/knowledge/blog/gestalt-principles-applied-to-design
https://picsart.com/blog/post/gestalt-principles-for-design
https://www.shutterstock.com/blog/gestalt-theory-in-design
https://in.indeed.com/career-advice/career-development/gestalt-principles
https://www.usertesting.com/blog/gestalt-principles
https://www.superside.com/blog/gestalt-principles-of-des

In [12]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation

tokens = word_tokenize(transcript)
common_words = set(stopwords.words('english'))
filtered = [word for word in tokens if word.lower() not in common_words]
filtered = [word for word in filtered if word not in punctuation]
filtered_text = ' '.join(filtered)
print(filtered_text)

Gestalt theory important ask well know Gestalt Theory 'll able suggest ideas without drawing yes friends secret sauce artistic mastery nine first though let 's talk elements principles want go case elements art seven line space color shape one form texture value seven elements art principles design 's eight technically there's ten combined four balance contrast movement rhythm repetition emphasis unity variety unity grouped harmony proportions n't remember elements art principles design entire playlist explain detail try funny 'll link description watch next principles design principles design stuff make artwork feel better think elements like nouns 're nouns artwork right 's objects things think principles adjectives adjectives verbs right 's things make nouns move makes work makes interesting know elements art know principles design know 's third one Gestalt theory weird one elements art nouns principles design adjectives verbs Gestalt theory context sentence say sentence whenever he

In [16]:
from collections import Counter

# Each word gets counted for frequency
word_frequency = Counter(filtered)

# Get the 25 most common words
mcw = word_frequency.most_common(25)

for t in mcw:
    word, frequency = t
    # Contractions that show up in word count
    if word in ["’", "'s", '``', "n't", "''", "'re"]:
        mcw.remove(t)

print("5 Most Common Words:")
for word, frequency in mcw:
    print(f"{word}: {frequency}")

#Words are the manually picked out based on if they are still too general or not; top 5 are chosen

5 Most Common Words:
one: 34
like: 26
right: 23
elements: 17
got: 15
grouped: 14
kind: 14
Gestalt: 13
know: 12
group: 12
dots: 12
bunch: 11
together: 11
theory: 10
principles: 10
little: 10
another: 10
art: 9
think: 9
feels: 9
going: 9
let: 8
feel: 8


In [None]:
import numpy as np

# Actual most common keywords for each video
actual_keywords = [
    {'lincoln', 'south', 'mcclellen', 'north', 'slavery'},
    {'true', 'statement', 'statements', 'mathematical', 'gödel'},
    {'electricity', 'power', 'current', 'electric', 'electrons'},
    {'nietzsche', 'life', 'suffering', 'philosophy', 'power'},
    {'elements', 'grouped', 'gestalt', 'theory', 'principles'}
]

# Sets of keywords generated from summaries for each trial (3x each for each 5 videos)
generated_keywords = [
    [
        {'war', 'video', 'slavery', 'lincoln', 'abraham'},
        {'video', 'lincoln', 'slavery', 'war', 'battle'},
        {'war', 'lincoln', 'video', 'american', 'slavery'}
    ],
    [
        {'mathematical', 'gödel', 'statements', 'statement', 'unprovable'},
        {'gödel', 'mathematical', 'theorem', 'statements', 'statement'},
        {'statements', 'mathematical', 'gödel', 'unprovable', 'statement'}
    ],
    [
        {'electricity', 'power', 'electric', 'plants', 'like'},
        {'electricity', 'power', 'plants', 'current', 'video'},
        {'electricity', 'power', 'plants', 'electric', 'electrons'}
    ],
    [
        {'nietzsche', 'personal', 'philosophy', 'work', 'modern'},
        {'nietzsche', 'philosophy', 'personal', 'life', 'suffering'},
        {'nietzsche', 'personal', 'modern', 'traditional', 'faith'}
    ],
    [
        {'elements', 'gestalt', 'theory', 'create', 'principles'},
        {'elements', 'principles', 'sense', 'visual', 'perceived'},
        {'elements', 'gestalt', 'dots', 'theory', 'principles'}

    ]
]

def euclidean_distance(set1, set2, all_keywords):
    # Sets to vectors
    v1 = np.array([1 if keyword in set1 else 0 for keyword in all_keywords])
    v2 = np.array([1 if keyword in set2 else 0 for keyword in all_keywords])

    # Calculate Euclidean between actual and generated keywords
    distance = np.linalg.norm(v1 - v2)
    return distance

# Calculate the maximum possible Euclidean distance (when all keywords are different)
max_distance = len(set.union(*actual_keywords)) ** 0.5

# All unique keywords across all trials
all_keywords = set.union(*actual_keywords)

# Calculate the Euclidean distance between generated keywords and the actual keywords for each trial
total = 0
for i, j in enumerate(generated_keywords):
    distances = [euclidean_distance(set(keywords), actual_keywords[i], all_keywords) for keywords in j]

    # Normalizing by converting Euclidean distances to percentages from max_distance
    percentages = [(1 - d / max_distance) * 100 for d in distances]

    print("Trial", i+1, "Percentages:", percentages)
    print("Average Percentage Similarity for Trial", i+1, ":", sum(percentages) / len(percentages))
    total += sum(percentages) / len(percentages)

print("Average Percentage Similarity across all trials:", total/5)


Trial 1 Percentages: [64.64466094067262, 64.64466094067262, 64.64466094067262]
Average Percentage Similarity for Trial 1 : 64.64466094067262
Trial 2 Percentages: [79.58758547680685, 79.58758547680685, 79.58758547680685]
Average Percentage Similarity for Trial 2 : 79.58758547680685
Trial 3 Percentages: [71.13248654051871, 71.13248654051871, 79.58758547680685]
Average Percentage Similarity for Trial 3 : 73.95085285261476
Trial 4 Percentages: [64.64466094067262, 79.58758547680685, 59.17517095361369]
Average Percentage Similarity for Trial 4 : 67.80247245703106
Trial 5 Percentages: [79.58758547680685, 64.64466094067262, 71.13248654051871]
Average Percentage Similarity for Trial 5 : 71.78824431933272
Average Percentage Similarity across all trials: 71.5547632092916


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Calculate Cosine Similarity between actual and generated keywords
# Did this 3x per each 5 videos
def calculate_similarity(transcript, summary):
    v = CountVectorizer().fit([transcript, summary])
    vectors = v.transform([transcript, summary]).toarray()
    similarity = cosine_similarity(vectors)
    return similarity[0, 1]  # Similarity between transcript and summary


similarity = calculate_similarity(transcript, summary)
print("Cosine Similarity:", similarity)

# Results from each video
#The paradox at the heart of mathematics: Gödel's Incompleteness Theorem - Marcus du Sautoy: 0.7180296285070443, 0.6728328781718516, 0.7338418450881288
#The American Civil War - OverSimplified (Part 1): 0.807358649078939, 0.8015141569751579, 0.8139546614076966
#How It's Made: Chocolate: 0.8498555587739476, 0.8386061761006591, 0.8477837984991412
#Becoming Who You Really Are - The Philosophy of Friedrich Nietzsche: 0.8849369398566, 0.8439721404374021, 0.8187018680276592
#GESTALT Theory in Art: The Power of Suggestion!: 0.5801535627075831, 0.5553691344949546, 0.6019762112438968

Cosine Similarity: 0.6019762112438968


In [None]:
# Results from above
cos_sim = [
    [0.7180296285070443, 0.6728328781718516, 0.7338418450881288],
    [0.807358649078939, 0.8015141569751579, 0.8139546614076966],
    [0.8498555587739476, 0.8386061761006591, 0.8477837984991412],
    [0.8849369398566, 0.8439721404374021, 0.8187018680276592],
    [0.5801535627075831, 0.5553691344949546, 0.6019762112438968]
]

# Average the Cosine Similarities
total = 0
for i in cos_sim:
  vid_sum = 0
  for j in i:
    vid_sum += j
  total += (vid_sum/3)
  print(vid_sum/3)
print("total accuracy: ", (total/5))


0.7082347839223416
0.8076091558205979
0.8454151777912493
0.8492036494405538
0.5791663028154782
total accuracy:  0.7579258139580443
