In [197]:
## Libraries
from bs4 import BeautifulSoup
from pytube import Playlist
from pytube import YouTube
import pandas as pd
import datefinder
import urllib
import math
import time
import os
import re

In [220]:
## Download videos
## Useful documentation: https://pytube.io/en/latest/api.html#youtube-object
def download_360p_mp4_videos(url: str, path: str):
    yt = YouTube(url)
    yt.streams.filter(file_extension="mp4").get_by_resolution("360p").download(filename=path)

def download_videos(playlist_url, output_dir):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    playlist = Playlist(playlist_url)
    print('Number of videos in playlist: %s' % len(playlist.video_urls))
    df = pd.DataFrame()
    count = 1
    for url in playlist.video_urls:
        filename = "video_" + str(count) + ".mp4"
        if not os.path.exists("./" + output_dir + "/" + filename):
            try:
                download_360p_mp4_videos(url, "./" + output_dir + "/video_" + str(count) + ".mp4")
            except urllib.error.HTTPError:
                print("ERROR CAUGHT")
                time.sleep(20)
                download_360p_mp4_videos(url, "./" + output_dir + "/video_" + str(count) + ".mp4")
        count += 1
        if count % 10 == 0:
            print("DONE", count)
            # time.sleep(4)

## Download metadata
def get_playlist_metadata(playlist_url, csv_filename, output_dir):
    transcriptions_output_dir = output_dir + "/Transcriptions"
    if not os.path.exists(transcriptions_output_dir):
        os.mkdir(transcriptions_output_dir)
    playlist = Playlist(playlist_url)

    print('Number of videos in playlist: %s' % len(playlist.video_urls))
    df = pd.DataFrame()

    if os.path.exists(csv_filename):
        df = pd.read_csv(csv_filename)
    else:
        df = pd.DataFrame(columns=["video_filename",
                                    "url",
                                    "title",
                                    "length_seconds",
                                    "publish_date",
                                    "description",
                                    "keywords",
                                    "date",
                                    "date_src",
                                    "transcription_filename"])

    count = 1
    for url in playlist.video_urls:
        filename = "video_" + str(count) + ".mp4"

        if filename not in df["video_filename"].tolist():
            video = YouTube(url)
            filename = "video_" + str(count) + ".mp4"

            date = ""
            date_src = ""
            title = video.title
            description = video.description
            if "Streamed live on " in video.watch_html:
                date_tmp = video.watch_html.split("Streamed live on ")[1].split('"')[0]
                matches = datefinder.find_dates(date_tmp)
                for match in matches:
                    date = match.strftime("%d.%m.%Y")
                    date_src = "live_stream"
            elif (match := re.search(r'\d\d\.\d\d\.\d\d\d\d', title)) is not None:
                date = match.group()
                date_src = "title"
            elif (match := re.search(r'\d\d\.\d\d\.\d\d\d\d', description)) is not None:
                date = match.group()
                date_src = "description"

            transcription_filename = ""
            if 'a.tr' in video.captions.keys():
                soup = BeautifulSoup(video.captions['a.tr'].xml_captions, 'xml')
                transcriptions = ""
                for s in soup.find_all('s'):
                    transcriptions += s.text
                transcription_filename = "transcription_" + str(count) + ".txt"
                text_file = open(transcriptions_output_dir + '/' + transcription_filename, "w", encoding="utf-8")
                n = text_file.write(transcriptions)
                text_file.close()
            
            new_row = { 'video_filename': filename,
                        'url': url,
                        'title': title,
                        'length_seconds': video.length,
                        'publish_date': video.publish_date,
                        'description': description,
                        'keywords': video.keywords,
                        'date': date,
                        'date_src': date_src,
                        'transcription_filename': transcription_filename}

            df = pd.concat([df, pd.DataFrame.from_records([new_row])])

        count += 1
        if count % 10 == 0:
            print("DONE", count)
            # time.sleep(4)
            df.to_csv(csv_filename, index=False)
    df.to_csv(csv_filename, index=False)

In [64]:
## GCP pricing calculator: https://cloud.google.com/products/calculator
def estimate_playlist_transcription_cost(playlist_url):
    playlist = Playlist(playlist_url)
    print("Playlist url:", playlist_url)
    sum_fifteen_sec_intervals = 0
    count = 0
    sum_seconds_total = 0
    for video in playlist.videos:
        if video.length/60 < 5:
            continue
        count += 1
        sum_seconds_total += video.length
        sum_fifteen_sec_intervals += math.ceil(video.length/15)
    print('Total hours:', sum_seconds_total/3600)
    print('Number of videos in playlist:', count)

    print("Estimated cost:", sum_fifteen_sec_intervals * 0.006)
# estimate_playlist_transcription_cost("https://www.youtube.com/playlist?list=PLIoZEdpULeHk1BDS_U6XF-6yPgj3lNndB")
estimate_playlist_transcription_cost("https://www.youtube.com/playlist?list=PLl6P5vAyfWLx0unQuNRmywX7y_5L80eTQ")
estimate_playlist_transcription_cost("https://www.youtube.com/playlist?list=PLl6P5vAyfWLweGmJHzAXsyS7xhKR2RYgt")

Playlist url: https://www.youtube.com/playlist?list=PLl6P5vAyfWLx0unQuNRmywX7y_5L80eTQ
Total hours: 39.1225
Number of videos in playlist: 70
Estimated cost: 56.544000000000004
Playlist url: https://www.youtube.com/playlist?list=PLl6P5vAyfWLweGmJHzAXsyS7xhKR2RYgt
Total hours: 323.8594444444444
Number of videos in playlist: 619
Estimated cost: 468.11400000000003


In [104]:
playlist_url = 'https://www.youtube.com/playlist?list=PLl6P5vAyfWLweGmJHzAXsyS7xhKR2RYgt'
csv_filename = "akp_playlist_videos2.csv"
output_dir = "Videos2"
download_videos(playlist_url, output_dir)
get_playlist_metadata(playlist_url, csv_filename, output_dir)

Number of videos in playlist: 765
DONE 50
DONE 100
DONE 150
DONE 200
DONE 250
DONE 300
DONE 350
DONE 400
DONE 450
DONE 500
DONE 550
DONE 600
DONE 650
DONE 700
DONE 750
Number of videos in playlist: 765
DONE 10
DONE 20
DONE 30
DONE 40
DONE 50
DONE 60
DONE 70
DONE 80
DONE 90
DONE 100
DONE 110
DONE 120
DONE 130
DONE 140
DONE 150
DONE 160
DONE 170
DONE 180
DONE 190
DONE 200
DONE 210
DONE 220
DONE 230
DONE 240
DONE 250
DONE 260
DONE 270
DONE 280
DONE 290
DONE 300
DONE 310
DONE 320
DONE 330
DONE 340
DONE 350
DONE 360
DONE 370
DONE 380
DONE 390
DONE 400
DONE 410
DONE 420
DONE 430
DONE 440
DONE 450
DONE 460
DONE 470
DONE 480
DONE 490
DONE 500
DONE 510
DONE 520
DONE 530
DONE 540
DONE 550
DONE 560
DONE 570
DONE 580
DONE 590
DONE 600
DONE 610
DONE 620
DONE 630
DONE 640
DONE 650
DONE 660
DONE 670
DONE 680
DONE 690
DONE 700
DONE 710
DONE 720
DONE 730
DONE 740
DONE 750
DONE 760


In [109]:
playlist_url = 'https://www.youtube.com/playlist?list=PLl6P5vAyfWLx0unQuNRmywX7y_5L80eTQ'
csv_filename = "akp_playlist_videos3.csv"
output_dir = "Videos3"
download_videos(playlist_url, output_dir)
get_playlist_metadata(playlist_url, csv_filename, output_dir)

Number of videos in playlist: 85
DONE 10
DONE 20
DONE 30
DONE 40
DONE 50
ERROR CAUGHT
ERROR CAUGHT
DONE 60
ERROR CAUGHT
DONE 70
ERROR CAUGHT
DONE 80
ERROR CAUGHT
Number of videos in playlist: 85
DONE 50


In [221]:
playlist_url = "https://www.youtube.com/playlist?list=PLIoZEdpULeHkUAbidMGMJmquL5lQ3GyAJ"
csv_filename = "akp_playlist_videos4.csv"
output_dir = "Videos4"
download_videos(playlist_url, output_dir)
get_playlist_metadata(playlist_url, csv_filename, output_dir)

Number of videos in playlist: 682
DONE 10
DONE 20
DONE 30


In [218]:
playlist_url = "https://www.youtube.com/playlist?list=PLIoZEdpULeHk1BDS_U6XF-6yPgj3lNndB"
csv_filename = "akp_playlist_videos1.csv"
output_dir = "Videos1"
download_videos(playlist_url, output_dir)
get_playlist_metadata(playlist_url, csv_filename, output_dir)

Number of videos in playlist: 213
DONE 10
DONE 20
DONE 30
DONE 40
DONE 50
DONE 60
DONE 70
DONE 80
DONE 90
DONE 100
DONE 110
DONE 120
DONE 130
DONE 140
DONE 150
DONE 160
DONE 170
DONE 180
ERROR CAUGHT
ERROR CAUGHT
DONE 190
DONE 200
DONE 210
Number of videos in playlist: 213
DONE 10
DONE 20
DONE 30
DONE 40
DONE 50
DONE 60
DONE 70
DONE 80
DONE 90
DONE 100
DONE 110
DONE 120
DONE 130
DONE 140
DONE 150
DONE 160
DONE 170
DONE 180
DONE 190
DONE 200
DONE 210


In [212]:
df1 = pd.read_csv("akp_playlist_videos1.csv")
df2 = pd.read_csv("akp_playlist_videos2.csv")
df3 = pd.read_csv("akp_playlist_videos3.csv")
df4 = pd.read_csv("akp_playlist_videos4.csv")

df1 = df1[df1.title.str.contains("Erdoğan")]

df1["folder"] = "Videos1"
df2["folder"] = "Videos2"
df3["folder"] = "Videos3"
df4["folder"] = "Videos4"

new_df = pd.concat([df1, df2, df3, df4])

df_with_dates = pd.read_csv("../PresSpeeches/PresVideoLinks.csv")
list_of_videos = df_with_dates["title"].tolist()

for index, row in new_df.iterrows():
    if not isinstance(row["date"], str):
        if len(df_with_dates.loc[df_with_dates['title'] == row["title"]]) > 0:
            date_list = row["publish_date"][:10].split("-")
            year, month, day = date_list[0], date_list[1], date_list[2]
            published_date = day + "." + month + "." + year

            rows = df_with_dates.loc[df_with_dates['title'] == row["title"]]
            for i, r in rows.iterrows():
                # if r["date"] == published_date:
                #     df.at[index,'date']= r ["date"]
                #     df.at[index,'date_src'] = "tccb_videos"
                if r["date"][-7:] == published_date[-7:]:
                    new_df.at[index,'date'] = r["date"]
                    new_df.at[index,'date_src'] = "tccb_videos"

print(new_df["date"].isnull().sum())
print(len(new_df["date"]) - new_df["date"].isnull().sum())

new_df.to_csv("aggregated_videos.csv", index = False)
new_df["date_src"].value_counts(dropna=False)

705
956


NaN            705
tccb_videos    461
live_stream    420
title           47
description     28
Name: date_src, dtype: int64