In [None]:
## Dependencies
"""
pip install youtube-transcript-api
pip install youtube-transcript-api --upgrade
pip install --upgrade youtube-transcript-api
pip install nltk
pip install pandas
"""

In [None]:
# Importing libraries 

import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import tiktoken
from langchain_text_splitters import RecursiveCharacterTextSplitter
from youtube_transcript_api import YouTubeTranscriptApi

# Calling API the youtube video transcripts

In [3]:
api = YouTubeTranscriptApi()

def get_english_transcript(video_id: str) -> str | None:
    try:
        data = api.fetch(video_id, ["en"])
        return " ".join(s.text for s in data)
    except NoTranscriptFound:
        print(f"Skipping {video_id} (no English transcript)")
        return None

In [4]:
video_ids = ["YBF9c2mCGME", "G0NCHag1rKc", "epgQ-sAr0l8", "3EgYr7jR4NI", "_AadMC3mzSk"]

records = []
for vid in video_ids:
    text = get_english_transcript(vid)
    if text is None:
        continue
    records.append({"video_id": vid, "transcript": text})

df = pd.DataFrame(records)

In [5]:

print(df.head())

      video_id                                         transcript
0  YBF9c2mCGME  Are you struggling to pass the CompTIA Securit...
1  G0NCHag1rKc  [Music] welcome to the full Security Plus cour...
2  epgQ-sAr0l8  this is how I would study for the Security Plu...
3  3EgYr7jR4NI  DION: Hello, and\nwelcome to the course. I am ...
4  _AadMC3mzSk  in this video we will break down every single ...


In [None]:
## Preprocesing, chunking and creation of the dataframe for the youtube video transcripts

In [None]:
def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)
    text = text.replace('\ufeff', '')
    text = re.sub(r"\s+", " ", text)
    text = text.replace("[Music]", "").replace("[music]", "").strip()
    # keep your current tokenization / cleaning here if you still want it
    return text

# 1) tiktoken length function (for token-based chunk size)
tokenizer = tiktoken.get_encoding("cl100k_base")

def tiktoken_len(text: str) -> int:
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)

# 2) create the splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,          # about 400 tokens
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

# 3) split each transcript into chunks
df["chunks"] = df["transcript"].apply(lambda text: text_splitter.split_text(text))

# 4) preprocess EACH chunk and keep them as a list
def preprocess_chunk_list(chunk_list):
    return [preprocess_text(chunk) for chunk in chunk_list]

df["Processed_chunks"] = df["chunks"].apply(preprocess_chunk_list)

# 5) explode into multiple rows (one per processed chunk)
df_exploded = df.explode("Processed_chunks")

# 6) add chunk number per video_id
df_exploded["chunk_number"] = df_exploded.groupby("video_id").cumcount() + 1

# 7) rename the exploded column for clarity
df_exploded = df_exploded.rename(columns={"Processed_chunks": "Processed_Text_chunk"})

df_exploded.head()

Unnamed: 0,video_id,transcript,chunks,Processed_Text_chunk,chunk_number
0,YBF9c2mCGME,Are you struggling to pass the CompTIA Securit...,[Are you struggling to pass the CompTIA Securi...,Are you struggling to pass the CompTIA Securit...,1
0,YBF9c2mCGME,Are you struggling to pass the CompTIA Securit...,[Are you struggling to pass the CompTIA Securi...,for lateral movement or data. So let me furthe...,2
0,YBF9c2mCGME,Are you struggling to pass the CompTIA Securit...,[Are you struggling to pass the CompTIA Securi...,server and if due to any reason one of the ser...,3
0,YBF9c2mCGME,Are you struggling to pass the CompTIA Securit...,[Are you struggling to pass the CompTIA Securi...,available for implementing security controls. ...,4
0,YBF9c2mCGME,Are you struggling to pass the CompTIA Securit...,[Are you struggling to pass the CompTIA Securi...,access and manage the organization resources. ...,5


In [None]:
# 8) Defining final dataframe
clean_text = df_exploded[["video_id","chunk_number", "Processed_Text_chunk"]]
print(clean_text.head())

      video_id  chunk_number  \
0  YBF9c2mCGME             1   
0  YBF9c2mCGME             2   
0  YBF9c2mCGME             3   
0  YBF9c2mCGME             4   
0  YBF9c2mCGME             5   

                                Processed_Text_chunk  
0  Are you struggling to pass the CompTIA Securit...  
0  for lateral movement or data. So let me furthe...  
0  server and if due to any reason one of the ser...  
0  available for implementing security controls. ...  
0  access and manage the organization resources. ...  


In [None]:
# 9) Obtaining CSV file
clean_text.to_csv("clean_text.csv", index=False)