# Get YouTube Recommendation Videos from a Seed Video

## Setup Code

In [None]:
import re
import time
from datetime import datetime
from random import choices, sample, seed, randrange

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [None]:
# Empty results table. Each row pairs a seed video with one recommended
# video, and the same five fields are recorded for both sides.
df = pd.DataFrame(
    columns=[
        f'{side}_{field}'
        for side in ('seed', 'rec')
        for field in ('title', 'url', 'creator', 'viewcount', 'upload_date')
    ]
)
df

In [None]:
def get_title(span):
    """Return the value of the ``title`` attribute of ``span``."""
    title = span['title']
    return title

def get_url(span):
    """Return the absolute YouTube URL of the link that encloses ``span``.

    The nearest ancestor ``<a>`` with class ``yt-simple-endpoint`` is looked
    up and its ``href`` is appended to the YouTube origin.
    """
    link = span.find_parent('a', "yt-simple-endpoint")
    href = link['href']
    return 'https://www.youtube.com' + href

def get_creator(span):
    """Return the text of the first ``yt-formatted-string`` inside the link
    that encloses ``span`` (used as the recommended video's creator)."""
    link = span.find_parent('a', "yt-simple-endpoint")
    channel = link.find('yt-formatted-string')
    return channel.get_text()

def get_viewcount(span):
    """Return the view count shown for a recommended video.

    Reads the first ``<span>`` of the ``metadata-line`` div inside the link
    that encloses ``span`` and drops the last six characters of its text
    (presumably the " views" suffix -- confirm against the live markup).

    Returns ``np.nan`` when the enclosing link, the metadata div, or the
    span is missing. Only those lookup failures are handled; anything else
    (including ``KeyboardInterrupt``) propagates to the caller.
    """
    try:
        link = span.find_parent('a', "yt-simple-endpoint")
        metadata = link.find('div', id='metadata-line')
        view_text = metadata.find_all('span')[0].get_text()
    except (AttributeError, IndexError):
        # AttributeError: a lookup returned None; IndexError: no spans found.
        return np.nan
    return view_text[:-6]

def get_uploaddate(span):
    """Return the upload-date text shown for a recommended video.

    Reads the second ``<span>`` of the ``metadata-line`` div inside the link
    that encloses ``span``.

    Returns ``np.nan`` when the enclosing link, the metadata div, or the
    second span is missing. Only those lookup failures are handled; anything
    else (including ``KeyboardInterrupt``) propagates to the caller.
    """
    try:
        link = span.find_parent('a', "yt-simple-endpoint")
        metadata = link.find('div', id='metadata-line')
        return metadata.find_all('span')[1].get_text()
    except (AttributeError, IndexError):
        # AttributeError: a lookup returned None; IndexError: fewer than two spans.
        return np.nan

def get_seed_title(soup):
    """Return the text of the first ``<h1>`` with class ``title`` in ``soup``."""
    heading = soup.find('h1', 'title')
    return heading.get_text()

def get_seed_creator(soup):
    """Return the text of the ``yt-formatted-string`` nested in the first
    ``ytd-channel-name`` element of ``soup``."""
    channel_block = soup.find('ytd-channel-name')
    name_node = channel_block.find('yt-formatted-string')
    return name_node.get_text()

def get_seed_viewcount(soup):
    """Return the first run of digits and commas in the seed page's view count.

    The text comes from the first ``<span>`` with class ``view-count``.
    Raises ``AttributeError`` if that span is absent and ``TypeError`` if
    its text contains no digits or commas.
    """
    count_text = soup.find('span', 'view-count').get_text()
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    return re.search(r'[\d,]+', count_text)[0]
    
def get_seed_uploaddate(soup):
    """Return the text of the ``yt-formatted-string`` inside the first
    ``<div id="date">`` of ``soup``."""
    date_block = soup.find('div', id='date')
    date_node = date_block.find('yt-formatted-string')
    return date_node.get_text()

In [None]:
# Next free row label in ``df``; module-level so it persists between calls.
counter = 0
# No explicit seed value, so the sampling differs from run to run.
seed()


def get_vid_recs(seed_url, *, max_rows=100, pool_size=20, sample_size=5):
    """Crawl YouTube recommendations starting from ``seed_url``.

    For each visited video the page is loaded in the module-level ``driver``
    and parsed. Up to ``sample_size`` recommendations are drawn at random
    from the first ``pool_size`` ``video-title`` spans, and one row per drawn
    recommendation is written to the module-level ``df`` at label
    ``counter``. One of the drawn recommendations then becomes the next
    seed. The crawl stops once ``counter`` reaches ``max_rows`` or a page
    yields no recommendations.

    Args:
        seed_url: URL of the first video to visit.
        max_rows: Stop after the global row counter reaches this value.
            The first page is always processed, even if the counter is
            already at or above the limit.
        pool_size: Number of leading recommendations eligible for sampling.
        sample_size: Number of recommendations recorded per visited video;
            capped at the number actually found on the page.

    Returns:
        None. Results are accumulated in ``df`` and ``counter``.
    """
    global counter
    current_url = seed_url
    while True:
        driver.get(current_url)
        # Randomised pause after navigation (presumably to let the page
        # render and to avoid a fixed request rhythm -- confirm intent).
        time.sleep(randrange(20, 30))
        soup = BeautifulSoup(driver.page_source)
        seed_title = get_seed_title(soup)
        seed_creator = get_seed_creator(soup)
        seed_viewcount = get_seed_viewcount(soup)
        seed_upload_date = get_seed_uploaddate(soup)
        rec_spans = soup.find_all('span', id='video-title')[:pool_size]
        # sample() raises ValueError if asked for more items than exist, so
        # never request more than the page provided.
        chosen = sample(rec_spans, k=min(sample_size, len(rec_spans)))
        if not chosen:
            print("no recommendations found for", current_url)
            return
        for sp in chosen:
            info = {
                "seed_title": seed_title,
                "seed_url": current_url,
                "seed_creator": seed_creator,
                "seed_viewcount": seed_viewcount,
                "seed_upload_date": seed_upload_date,
                "rec_title": get_title(sp),
                "rec_url": get_url(sp),
                "rec_creator": get_creator(sp),
                'rec_viewcount': get_viewcount(sp),
                'rec_upload_date': get_uploaddate(sp)
            }
            print(info)
            df.loc[counter] = info
            counter += 1
        # Follow one of the recorded recommendations, chosen uniformly.
        rec_url = get_url(choices(chosen)[0])
        print("rec_url", rec_url)
        if counter >= max_rows:
            return
        current_url = rec_url

## Start Here

In [None]:
# Launch a Chrome session controlled by Selenium. ``get_vid_recs`` reads this
# module-level ``driver``.
driver = webdriver.Chrome()
# Start the crawl from the seed video; rows are written to ``df`` as it runs.
get_vid_recs('https://www.youtube.com/watch?v=ABTdTTnnEU8') # put seed video here

In [None]:
# Display the rows collected so far.
df

In [None]:
# Save the collected rows to a CSV named after the current local time.
# The strftime format uses single quotes because nesting double quotes inside
# a double-quoted f-string is a SyntaxError before Python 3.12.
df.to_csv(f"run{datetime.now().strftime('%m-%d-%Y_%H-%M-%S')}.csv")

In [None]:
# Manually override the row counter used by ``get_vid_recs``
# (presumably to resume after an interrupted crawl -- confirm intent).
counter = 55

In [None]:
# Show the current counter value.
counter

In [None]:
# Parse whatever page the browser is currently showing.
soup = BeautifulSoup(driver.page_source)

In [None]:
# All recommendation title spans on the page (no limit here, unlike the
# crawl, which keeps only the first 20).
rec_spans = soup.find_all('span', id='video-title')

In [None]:
# Pick one recommendation to inspect by hand.
rec = rec_spans[5]

In [None]:
rec

In [None]:
# Same expression as the body of ``get_viewcount``, without the NaN fallback.
rec.find_parent('a', "yt-simple-endpoint").find('div', id='metadata-line').find_all('span')[0].get_text()[:-6]

In [None]:
# Same expression as the body of ``get_uploaddate``, without the NaN fallback.
rec.find_parent('a', "yt-simple-endpoint").find('div', id='metadata-line').find_all('span')[1].get_text()

## Clean up data (skip this section if you ran the code above; it backfills view counts and upload dates into an older run's CSV)

In [None]:
# Load an earlier run and drop its first column (presumably the index column
# that ``to_csv`` wrote -- verify against the file).
df = pd.read_csv('run2.csv').iloc[:, 1:]
df.head()

In [None]:
df.head(20)

In [None]:
# Fresh browser session for re-visiting the videos.
driver = webdriver.Chrome()

In [None]:
# Parse the page currently shown in the browser.
# NOTE(review): nothing has been navigated to since the driver was created
# above, so load a video page first or the two checks below will fail.
soup = BeautifulSoup(driver.page_source)

In [None]:
# Manual check of the seed-page extractors on the current page.
get_seed_uploaddate(soup)

In [None]:
get_seed_viewcount(soup)

In [None]:
# Add the missing columns as NaN placeholders. Positions 3, 4, 8 and 9 give
# the same column order as the results table built during a crawl.
# ``insert`` raises ValueError if the column already exists.
df.insert(3, 'seed_viewcount', np.nan)

In [None]:
df.insert(4, 'seed_upload_date', np.nan)

In [None]:
df.head()

In [None]:
df.insert(8, 'rec_viewcount', np.nan)

In [None]:
df.insert(9, 'rec_upload_date', np.nan)

In [None]:
df.head()

In [None]:
# Cache of scraped metadata: url -> {'viewcount': ..., 'upload_date': ...}.
info_dict = {}

In [None]:
# Visit every distinct video URL in the table and cache its view count and
# upload date. Seed URLs are included as well as recommended ones, because
# the seed columns are later filled from this cache and the first seed video
# never appears as a recommendation.
for url in pd.concat([df['seed_url'], df['rec_url']]).unique():
    print(url)
    # Skip URLs already cached so the cell can be re-run after an interruption.
    if url in info_dict:
        continue
    driver.get(url)
    # Randomised pause after navigation before reading the page source.
    time.sleep(randrange(5, 12))
    soup = BeautifulSoup(driver.page_source)
    info = {'viewcount': get_seed_viewcount(soup), 'upload_date': get_seed_uploaddate(soup)}
    info_dict[url] = info
    print(info)

In [None]:
df.head()

In [None]:
# Preview the lookup before assigning it. Raises KeyError for any URL that
# is not present in ``info_dict``.
df['seed_url'].apply(lambda u: info_dict[u]['viewcount'])

In [None]:
# Fill the placeholder columns from the cached per-URL metadata.
df['seed_viewcount'] = df['seed_url'].apply(lambda u: info_dict[u]['viewcount'])

In [None]:
df['seed_upload_date'] = df['seed_url'].apply(lambda u: info_dict[u]['upload_date'])

In [None]:
df['rec_viewcount'] = df['rec_url'].apply(lambda u: info_dict[u]['viewcount'])

In [None]:
df['rec_upload_date'] = df['rec_url'].apply(lambda u: info_dict[u]['upload_date'])

In [None]:
# Write the completed table; the index is written as the first column.
df.to_csv('run2_updated.csv')

In [None]:
# Confirm the final column order.
df.columns