Setting Up the API KEY

In [1]:
API_KEY = "AIzaSyCNywAuEMOiGUkRNQa8u5UyFKPq1As8sBQ"

Installing Dependencies

In [None]:
%pip install youtube_channel_transcript_api
%pip install --upgrade google-api-python-client
%pip install --upgrade google-auth-oauthlib google-auth-httplib2

In [2]:
from youtube_channel_transcript_api import *
import os
from pprint import pprint

Loading Captions

In [3]:
PLAYLIST_ID = "PLTjRvDozrdlxj5wgH4qkvwSOdHLOCx10f" 
channel_getter = YoutubePlaylistTranscripts("Some Gibberish Name",PLAYLIST_ID, API_KEY) #channel getter is a YoutubePlaylistTranscripts Object
# channel_getter is an object of 'YoutubePlaylistTranscripts' Type

# for index, item in enumerate(channel_getter.video):
#     print(f"{index+1}. Video: {item[0]}, ID: {item[1]}")


    

Fetching videos data


In [4]:
videos_data, videos_errored = channel_getter.get_transcripts(languages=['en'])

print(f'Number of videos loaded: {len(channel_getter.video)}')
print(f'Number of videos data fetched: {len(videos_data)}')
print(f'Number of videos data errored: {len(videos_errored)}')


Number of videos loaded: 21
Number of videos data fetched: 21
Number of videos data errored: 0


Creating a backup on storage

In [5]:
import pickle
with open(f'./content/{PLAYLIST_ID}_vids_data_processed.pkl', 'wb') as f:
    pickle.dump(videos_data, f)
with open(f'./content/{PLAYLIST_ID}_vids_data_errored.pkl', 'wb') as f:
    pickle.dump(videos_errored, f)

Run this cell to get a list of videos loaded from the playlist


In [7]:
print(f'Number of videos loaded from playlist: {len(videos_data)}')
print('List of loaded videos:')

# for index, item in enumerate(videos_data):
#     print(f'{index+1}. Video ID: {item}        Title:', videos_data[item]['title'])


Number of videos loaded from playlist: 21
List of loaded videos:


In [8]:
print(f'Number of videos loaded from playlist: {len(videos_data)}')
print('List of non-loaded videos:')
print(videos_errored)

Number of videos loaded from playlist: 21
List of non-loaded videos:
[]


Creating caption dataset on storage

In [9]:
ROOT_FOLDER = "./content/playlists"
CHANNEL_DIRECTOR_NAME = PLAYLIST_ID

SAVE_FOLDER = os.path.join(ROOT_FOLDER, CHANNEL_DIRECTOR_NAME)


In [10]:
# videos_data.values()

Storing video captions

In [11]:
for vid_obj in videos_data.values():
  TITLE = vid_obj['title']
  #windows doesn't allow all the special characters to be there in the folder name
  # Let's remove the special characters from the title

  TITLE = TITLE.replace("?",'')   #windows doesn't support '?'
  TITLE = TITLE.replace("|",'')   #windows doesn't support '|'

  VID_FOLDER = os.path.join(SAVE_FOLDER, TITLE)
  # print(f'VID_FOLDER: {VID_FOLDER}')
  vid_exists = os.path.exists(VID_FOLDER)   # checking whether the video directory exists
  # print(f'vid_exists: {vid_exists}')
  os.makedirs(VID_FOLDER) if not vid_exists else None   # if the directory doesn't exist, create one

  vid_captions = vid_obj['captions'] 

  full_vid_captions = [f'Title: {TITLE}']  #This list will have all the captions in the video without the time stamps
  #The below code can be modified to include time
  for caption in vid_captions:
    full_vid_captions.append(caption['text'])   #full video captions is the list of caption strings

  full_vid_captions = " ".join(full_vid_captions)   # this returns a single string of complete video caption

  with open(os.path.join(VID_FOLDER, f'{TITLE}_captions.txt'), 'w') as f:
    f.write(full_vid_captions)

Utilities

In [61]:
import re

def clean_document(text:str) -> str:
    # this function tries to clean the text by removing multiple new lines, adding paragraph breaks, and removing empty paragraphs

    # getting rid of all new lines
    while '\n' in text:
        text = text.replace('\n', '')

    # will add some features here in future
    return text

Defining the Document Class

In [62]:
# adding imports
import hashlib
import mmh3
from typing import List
import logging

In [63]:
class Document:
    def __init__(self, meta, hash_id, title:str, content:str, language:str = 'English', score:float = None, hash_id_keys:List[str] = None):
        self.title = title
        self.content = content
        self.language = language
        self.hash_id_keys = hash_id_keys
        self.meta = meta

        if hash_id is None: 
            self.hash_id = self.generate_hash(hash_id_keys)
        else:
            self.hash_id = hash_id

    def generate_hash(self, hash_id_keys):
        return "{:02x}".format(mmh3.hash128(str(self.content), signed=False))

    def __str__(self):
        return (f"Title: {self.title}\nContent: {self.content}\nLanguage: {self.language}\nHash ID: {self.hash_id}")


Splitting Utility

In [76]:
def split_documents(document:Document, split_length:int = 100):
    text = document.content

    line = ''
    text_chunks = []

    words = text.split(' ')[:-1]

    # print(words)

    for word in words:
        if len(line) >= split_length:
            text_chunks.append(line)
            line = ''

        else:
            line += ' ' + word
            
    # for sentence in (s.strip() + '.' for s in text.split('.')[:-1]):   
    #     if len(line.split()) + len(sentence.split()) + 1 >= split_length:   # can't fit on that line => start a new one
    #         text_chunks.append(line)
    #         line = sentence
            
    #     else:       # can fit it => add a space and then the sentence
    #         line += '' + sentence

    # print(f'text chnks are: {text_chunks}')

    documents = []
    for i, txt in enumerate(text_chunks):
        doc = Document(title = document.title, content = txt, hash_id = None, hash_id_keys=None, meta = {'filename': document.meta.copy()} or {})
        # I need to implement meta data here
        doc.meta["_split_id"] = i
        doc.meta["_parent_hash"] = document.hash_id
        documents.append(doc)
        
    
    return documents

Testing Code

In [None]:
f = open('./content/playlists\PLTjRvDozrdlxj5wgH4qkvwSOdHLOCx10f\Control Flow in Python - If Elif Else Statements\Control Flow in Python - If Elif Else Statements_captions.txt', 'r')
content = f.read()
obj = Document(title = 'hi', content = content, meta = {'file_name': f'Control Flow in Python - If Elif Else Statements_captions.txt'} , hash_id = None, hash_id_keys = None)
obj.content = clean_document(obj.content)
docs = split_documents(obj,  split_length = 1000)

for doc in docs:
    print(doc.content)




Preprocessor

In [83]:
next_folder = os.path.join(SAVE_FOLDER, os.listdir(SAVE_FOLDER)[0])
file_path = os.path.join(next_folder, f'{os.listdir(SAVE_FOLDER)[0]}_captions.txt')

parent_document = {}    # storing document objects with the hashid:object 
document_list = []      # this list stores all the document objects
split = True

# crawler
for folder in os.listdir(SAVE_FOLDER):
    # opening the files
    next_folder = os.path.join(SAVE_FOLDER, folder)
    file_path = os.path.join(next_folder, f'{folder}_captions.txt')
    

    try:
        f = open(file_path, 'r')
    except:
        logging.error(f"The file {file_path} cannot be opened.")
    
    # creating document object 
    content = f.read()
    obj = Document(title = folder, content = content, meta = {'file_name': f'{folder}_captions.txt'} , hash_id = None, hash_id_keys = None)

    # cleaning the object content
    obj.content = clean_document(obj.content)

    # storing the content in the dictionary
    parent_document[obj.hash_id] = obj


    # if split is needed, we split else we directly append to the list
    if split:
        # split_document returns a list of document objects
        documents = split_documents(obj, split_length = 1000)


        # appending the list of document objects to our main list
        for d in documents:
            document_list.append(d)
        
    else:
        document_list.append(obj)

    

ERROR:root:The file ./content/playlists\PLTjRvDozrdlxj5wgH4qkvwSOdHLOCx10f\The 3 MOST IMPORTANT JAZZ SCALES and how similar they are\The 3 MOST IMPORTANT JAZZ SCALES and how similar they are_captions.txt cannot be opened.


In [81]:
for doc in document_list:
    print(doc.title)


Control Flow in Python - If Elif Else Statements
Control Flow in Python - If Elif Else Statements
Control Flow in Python - If Elif Else Statements
Control Flow in Python - If Elif Else Statements
Control Flow in Python - If Elif Else Statements
Control Flow in Python - If Elif Else Statements
Control Flow in Python - If Elif Else Statements
Control Flow in Python - If Elif Else Statements
Control Flow in Python - If Elif Else Statements
Control Flow in Python - If Elif Else Statements
Control Flow in Python - If Elif Else Statements
How to Loop over Lists in Python
How to Loop over Lists in Python
How to Sort Lists in Python - Python Tutorial for Absolute Beginners  Mosh
How to Sort Lists in Python - Python Tutorial for Absolute Beginners  Mosh
How to Sort Lists in Python - Python Tutorial for Absolute Beginners  Mosh
How to Sort Lists in Python - Python Tutorial for Absolute Beginners  Mosh
How to Sort Lists in Python - Python Tutorial for Absolute Beginners  Mosh
How to Use Lists in 