<a href="https://colab.research.google.com/github/getcher123/YouTube-Subtitle-Extractor/blob/main/youtube_subtitle_extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# YouTube Subtitle Extractor

This script extracts the English and Russian subtitles of YouTube videos, cleans the text, and saves the result to Excel files that can be used to build a parallel corpus.

## Prerequisites

The following packages are required to run the script:
- pandas
- re
- os
- itertools
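- youtube_transcript_api, which provides `YouTubeTranscriptApi` (install using `!pip install youtube_transcript_api`)
- google-api-python-client (install using `!pip install google-api-python-client`)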

## Usage

1. Set the `video_ids` and `channelId` variables to the list of video IDs and YouTube channel ID for which you want to extract subtitles.
2. Run the script in your preferred Python environment.

The script will extract subtitles for the specified video IDs, clean the text, compare English and Russian subtitles to remove any discrepancies, and save the final result in an Excel file in the specified directory.

The saved Excel file will have two columns:
- `en` - English subtitle
- `ru` - Russian subtitle

## Notes

- If a video doesn't have English or Russian subtitles, it will be skipped.
- The script splits subtitles at sentence boundaries and cleans the text by removing or replacing unnecessary characters such as `...`, `“`, `’`, etc.
- To remove discrepancies between English and Russian subtitles, the script compares the timestamps in the subtitles and deletes the sentence that doesn't have a matching timestamp in the other language. If there are multiple discrepancies, it may leave some of them unpaired.

In [None]:
# Imports used by this cell
import os
from google.colab import drive

# Attach Google Drive so that files written below persist after the session ends
drive.mount('/content/gdrive')

# Folder on the mounted Drive where the Excel files are stored
data_dir = '/content/gdrive/MyDrive/subtitles'  # Your data directory in Colab
# Show what is already stored there (also fails early if the folder is missing)
os.listdir(data_dir)

In [None]:
# install libraries
!pip install google-api-python-client
!pip install youtube_transcript_api

# importing libraries 
import pandas
import json
from googleapiclient.discovery import build # Google API request
from youtube_transcript_api import YouTubeTranscriptApi

# Enter your YouTube Data API key here; it is passed as `developerKey` when the API client is built
api_key = '' 
# A video ID is the last part of the video URL: for https://youtu.be/zOjov-2OZ0E the video ID is "zOjov-2OZ0E".

# Get subs from one video ID

In [None]:
# Fetch both subtitle tracks of one video and save them side by side.
video_id = 'E21kilDE8jY'

# Each transcript is a list of entries exposing at least 'text' and 'start'.
subtitles_en = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])  # English subtitles
subtitles_ru = YouTubeTranscriptApi.get_transcript(video_id, languages=['ru'])  # Russian subtitles

# Pair the entries positionally so that the 'en' and 'ru' columns really hold
# the two languages.  zip() stops at the shorter track, so surplus trailing
# entries of the longer track are not written.
prompts = [
    [sub_en['text'], sub_ru['text']]
    for sub_en, sub_ru in zip(subtitles_en, subtitles_ru)
]
# English lines only, kept for quick inspection in the notebook.
words = [text_en for text_en, _ in prompts]

df = pandas.DataFrame(prompts, columns=['en', 'ru'])
df.to_excel(os.path.join(data_dir, f'{video_id}.xlsx'), index=None)

# Get all videos ID from channel

In [None]:
# Channel whose uploads are going to be processed
channelId = "UCBobmJyzsJ6Ll7UbfhI4iwQ"
youtube = build('youtube', 'v3', developerKey=api_key)

# Resolve the channel's "uploads" playlist from its content details
contentdata = youtube.channels().list(id=channelId, part='contentDetails').execute()
playlist_id = contentdata['items'][0]['contentDetails']['relatedPlaylists']['uploads']

# Walk the playlist page by page (50 items per request) until no further
# page token is returned
videos = []
next_page_token = None
while True:
    res = youtube.playlistItems().list(
        playlistId=playlist_id,
        part='snippet',
        maxResults=50,
        pageToken=next_page_token,
    ).execute()
    videos.extend(res['items'])
    next_page_token = res.get('nextPageToken')
    if next_page_token is None:
        break

# Pull the video ID out of every playlist item
video_ids = [item['snippet']['resourceId']['videoId'] for item in videos]
# video_ids = video_ids[:4]
video_ids

# Get subtitles by lines

In [None]:
import pandas as pd
import re
import os
from itertools import zip_longest

# list to hold all prompts
prompts = []

# For every video: fetch both subtitle tracks and write them line by line
# into one Excel file with the columns 'en' and 'ru'.
for video_id in video_ids:

    try:
        subtitles_en = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        subtitles_ru = YouTubeTranscriptApi.get_transcript(video_id, languages=['ru'])
    except Exception as e:
        # Report the failure and move on to the next video.
        print(f"Exception occurred: {e}")
        continue

    # Collect each track on its own.  Filling both lists from a single zip()
    # would silently truncate the longer track, so the length check below
    # could never detect a mismatch.
    list_sub_en = [sub['text'] for sub in subtitles_en]
    list_sub_ru = [sub['text'] for sub in subtitles_ru]

    # Pair English with Russian; the shorter track is padded with None.
    data = list(zip_longest(list_sub_en, list_sub_ru, fillvalue=None))
    df = pd.DataFrame(data, columns=['en', 'ru'])

    # Mark files whose tracks have a different number of lines so they are
    # easy to spot in the output folder.
    sufix = ""
    if len(list_sub_en) != len(list_sub_ru):
        sufix = f"!error_{len(list_sub_en)}-{len(list_sub_ru)}_"
    with pd.ExcelWriter(os.path.join(data_dir, f'{sufix}lines_{channelId}_{video_id}.xlsx')) as writer:
        df.to_excel(writer, index=False)
    
#sentences = combined_sub.split('\n')
#combined_subtitles.extend(sentences)


# create dataframe from prompts list
#df = pd.DataFrame(prompts, columns=['en', 'ru'])

# save dataframe to Excel file
#with pd.ExcelWriter(os.path.join(data_dir, f'{channelId}.xlsx')) as writer:
#    df.to_excel(writer, index=False)

# Get subtitles by sentences

In [None]:
import pandas as pd
import re
import os
from itertools import zip_longest




def extract_numbers(s):
    """Separate bracketed decimal markers such as ``[83.41]`` from *s*.

    Returns a tuple ``(numbers, cleaned_s)``: ``numbers`` is the list of the
    decimal strings found inside square brackets, in order of appearance,
    and ``cleaned_s`` is *s* with every such marker removed.  Brackets that
    do not contain a ``digits.digits`` number are left untouched.
    """
    marker = re.compile(r'\[(\d+\.\d+)\]')
    found = marker.findall(s)
    stripped = marker.sub('', s)
    return found, stripped

def compareLists(list1, list2):
    """Align two sentence lists by their embedded ``[seconds]`` markers.

    Both lists are walked in parallel.  Two entries at the same index are
    considered aligned when ``extract_numbers`` returns equal marker lists
    for them (this includes the case where neither carries a marker); the
    markers are then stripped from both entries and the walk advances.

    On a mismatch both entries are deleted.  A one-step look-ahead then tries
    to re-synchronise: if the entry now at the current index of one list has
    the same markers as the *following* entry of the other list, the other
    list's current entry is deleted as well.

    The lists are modified in place and the same list objects are returned.
    Entries beyond the length of the shorter list are never visited, so they
    keep their markers.
    """
    i = 0
    while i < min(len(list1), len(list2)):
        stamps1, text1 = extract_numbers(list1[i])
        stamps2, text2 = extract_numbers(list2[i])
        if stamps1 == stamps2:
            list1[i] = text1
            list2[i] = text2
            i += 1
            continue

        # Markers differ: drop the pair, then look one entry ahead.
        del list1[i]
        del list2[i]
        if i < min(len(list1), len(list2)):
            stamps1 = extract_numbers(list1[i])[0]
            stamps2 = extract_numbers(list2[i])[0]
            # "" never equals a marker list, so a missing following entry
            # simply disables the corresponding look-ahead branch.
            next_stamps1 = extract_numbers(list1[i + 1])[0] if i + 1 < len(list1) else ""
            next_stamps2 = extract_numbers(list2[i + 1])[0] if i + 1 < len(list2) else ""
            if stamps1 == next_stamps2:
                del list2[i]
            elif stamps2 == next_stamps1:
                del list1[i]
    return list1, list2

# list to hold all prompts
prompts = []

# For every video: fetch both subtitle tracks, glue each track into one string
# with a "[start]" marker after every subtitle line, split that string into
# sentences, align the sentences of both languages by their markers and write
# the result to an Excel file with the columns 'en' and 'ru'.
for video_id in video_ids:
    print(video_id)

    try:
        subtitles_en = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        subtitles_ru = YouTubeTranscriptApi.get_transcript(video_id, languages=['ru'])
    except Exception as e:
        # skip video if it doesn't have English or Russian subtitles
        print(f"Exception occurred: {e}")
        continue

    # Cleaned subtitle lines, each followed by its "[start]" marker
    parts_en = []
    parts_ru = []
    for sub_en, sub_ru in zip(subtitles_en, subtitles_ru):
        text_en = sub_en['text']
        time_en = sub_en['start']

        text_ru = sub_ru['text']
        time_ru = sub_ru['start']

        # Put a space between a sentence end and a capital letter that
        # follows it directly, so the sentence split below can see it.
        text_en = " ".join(re.split(r'(?<=[!?.])+(?=[A-ZА-Я])', text_en))
        text_ru = " ".join(re.split(r'(?<=[!?.])+(?=[A-ZА-Я])', text_ru))

        # Remove "{\anN}" tags.  The backslash is doubled on purpose: a single
        # "\a" inside a regular expression stands for the BEL control
        # character, so the pattern would never match a literal "\an".
        text_en = re.sub(r'\{\\an\d+\}\s*', '', text_en)
        text_ru = re.sub(r'\{\\an\d+\}\s*', '', text_ru)

        # Normalise whitespace, turn ellipses into commas and drop quotes
        # (straight quotes and apostrophes for English, straight quotes and
        # guillemets for Russian).
        text_en = (
            text_en.replace('\xa0', ' ').replace('\n', ' ')
            .replace('...', ',').replace('…', ',')
            .replace('"', '').replace('\'', '')
        )
        text_ru = (
            text_ru.replace('\xa0', ' ').replace('\n', ' ')
            .replace('...', ',').replace('…', ',')
            .replace('«', '').replace('»', '').replace('"', '')
        )

        parts_en.append((text_en + ' ').replace('  ', ' ').replace('  ', ' ') + f"[{time_en}]")
        parts_ru.append((text_ru + ' ').replace('  ', ' ').replace('  ', ' ') + f"[{time_ru}]")

    combined_sub_en = "".join(parts_en)
    combined_sub_ru = "".join(parts_ru)

    # A sentence ends at whitespace that follows '!', '?' or '.'
    sentences_sub_en = re.split(r'(?<=[!?.])\s', combined_sub_en)
    sentences_sub_ru = re.split(r'(?<=[!?.])\s', combined_sub_ru)

    # Drop sentences whose markers do not line up and strip the markers
    sentences_sub_en, sentences_sub_ru = compareLists(sentences_sub_en, sentences_sub_ru)

    # Discard empty strings
    sentences_sub_en = [elem for elem in sentences_sub_en if elem]
    sentences_sub_ru = [elem for elem in sentences_sub_ru if elem]

    print(video_id)
    print(len(sentences_sub_en))
    print(len(sentences_sub_ru))
    print("------------------")

    # Pair the sentences; the shorter list is padded with None
    data = list(zip_longest(sentences_sub_en, sentences_sub_ru, fillvalue=None))
    df = pd.DataFrame(data, columns=['en', 'ru'])

    # Mark files whose sentence counts differ so they are easy to spot
    sufix = ""
    if len(sentences_sub_en) != len(sentences_sub_ru):
        sufix = f"!error_{len(sentences_sub_en)}-{len(sentences_sub_ru)}_"
    with pd.ExcelWriter(os.path.join(data_dir, f'{sufix}snt_{channelId}_{video_id}.xlsx')) as writer:
        df.to_excel(writer, index=False)
    
#sentences = combined_sub.split('\n')
#combined_subtitles.extend(sentences)


# create dataframe from prompts list
#df = pd.DataFrame(prompts, columns=['en', 'ru'])

# save dataframe to Excel file
#with pd.ExcelWriter(os.path.join(data_dir, f'{channelId}.xlsx')) as writer:
#    df.to_excel(writer, index=False)

NC8wLmW-owY
wanToB8YF_c
tLhcS1pAROg
dwcaELjcwdA
LfAV-OI4ygo
3PBnqC7TxvM
NMalVTkoy2c
BrRA5qdh3RQ
C98Dyx8h-H0
kT4iWCxu5hA
eI1blbMi_KM
bLQM6VigTZg
4XSrzzQe9gM
QFJnKSfd1eM



# Further test cells, they are not needed

In [None]:
# Sample input for compareLists: entries may end in a "[number]" marker.
# The lists differ in length and disagree in places (e.g. 'cherry[2.1]' vs
# 'cherry[2.2]'), which exercises the deletion and look-ahead branches.
list1 = ['apple', 'banana [1.1]', 'cherry[2.1]',  'banana1', 'banana2',       'apple1[1.23]',  'banana1[12.2]', 'banana5[1.22]']
list2 = ['apple', 'banana [1.1]', 'cherry[2.2]',             'banana3',       'apple2[1.23]',  'banana2[12.3]', 'banana2', 'banana6[1.22]']

def compareLists(list1, list2):
    """Debug variant: align two lists by their ``[number]`` markers, printing progress.

    Both lists are walked in parallel and the current index is printed on
    every iteration.  Two entries at the same index are considered aligned
    when ``extract_numbers`` returns equal marker lists for them (including
    the case where neither carries a marker); the pair is printed, the
    markers are stripped from both entries and the walk advances.

    On a mismatch both entries are deleted.  A one-step look-ahead then tries
    to re-synchronise: if the entry now at the current index of one list has
    the same markers as the *following* entry of the other list, the other
    list's current entry is deleted as well.

    The lists are modified in place and the same list objects are returned.
    Entries beyond the length of the shorter list are never visited.
    """
    i = 0
    while i < min(len(list1), len(list2)):
        stamps1, text1 = extract_numbers(list1[i])
        stamps2, text2 = extract_numbers(list2[i])
        print(i)
        if stamps1 == stamps2:
            print(f"{list1[i] = } {list2[i] = }")
            list1[i] = text1
            list2[i] = text2
            i += 1
            continue

        # Markers differ: drop the pair, then look one entry ahead.
        del list1[i]
        del list2[i]
        if i < min(len(list1), len(list2)):
            stamps1 = extract_numbers(list1[i])[0]
            stamps2 = extract_numbers(list2[i])[0]
            # "" never equals a marker list, so a missing following entry
            # simply disables the corresponding look-ahead branch.
            next_stamps1 = extract_numbers(list1[i + 1])[0] if i + 1 < len(list1) else ""
            next_stamps2 = extract_numbers(list2[i + 1])[0] if i + 1 < len(list2) else ""
            if stamps1 == next_stamps2:
                del list2[i]
            elif stamps2 == next_stamps1:
                del list1[i]
    return list1, list2
        
# Align the two sample lists (compareLists edits them in place and returns them)
list1, list2 = compareLists(list1, list2)

# Show which entries survived in each list
print(list1)
print(list2)

0
list1[i] = 'apple' list2[i] = 'apple'
1
list1[i] = 'banana [1.1]' list2[i] = 'banana [1.1]'
2
2
list1[i] = 'banana2' list2[i] = 'banana3'
3
list1[i] = 'apple1[1.23]' list2[i] = 'apple2[1.23]'
4
4
list1[i] = 'banana5[1.22]' list2[i] = 'banana6[1.22]'
['apple', 'banana ', 'banana2', 'apple1', 'banana5']
['apple', 'banana ', 'banana3', 'apple2', 'banana6']


In [None]:
import re

def extract_numbers(s):
    """Separate bracketed decimal markers such as ``[83.41]`` from *s*.

    Returns a tuple ``(numbers, cleaned_s)``: ``numbers`` is the list of the
    decimal strings found inside square brackets, in order of appearance,
    and ``cleaned_s`` is *s* with every such marker removed.  Brackets that
    do not contain a ``digits.digits`` number are left untouched.
    """
    marker = re.compile(r'\[(\d+\.\d+)\]')
    found = marker.findall(s)
    stripped = marker.sub('', s)
    return found, stripped

# Sample sentence containing two bracketed markers
s = 'Альфа — это изображения в оттенках серого с информацией о глубине, [83.41]которые мы можем присвоить кистям для скульптинга, чтобы быстро вылепить детали и ускорить весь [87.34]процесс скульптинга.'
numbers, cleaned_s = extract_numbers(s)
print(numbers)  # output: ['83.41', '87.34']
print(cleaned_s)  # output: the sentence above with the "[83.41]" and "[87.34]" markers removed

['83.41', '87.34']
Альфа — это изображения в оттенках серого с информацией о глубине, которые мы можем присвоить кистям для скульптинга, чтобы быстро вылепить детали и ускорить весь процесс скульптинга.
