In [1]:
import os
import sys

sys.path.insert(0, os.path.abspath('../../src/scraping'))
import youtube_requesting as ytr

In [2]:
import numpy as np
import pandas as pd
import json
import scipy.stats
import time

### Desired functions:
- Given a video_id, get the surrounding videos and z-score that video in comparison.
- Get the basic video statistics surrounding a video
    - Likes
    - Dislikes
    - Views
    - Subscriber count
    - Video length(?)
- Combination of all stats to form a "success score"

### Note on API costs
Calculate the quota cost of each request here:
https://developers.google.com/youtube/v3/determine_quota_cost

In [3]:
# Parameters

# When calculating the z-score of a video, grab the +/- x videos surrounding the target video.
# NOTE(review): no other cell visible in this notebook reads this value — confirm it is still needed.
nearest_videos = 5

In [4]:
# Testing vars
# Video id used to exercise the request helpers in this cell.
test_video_id = 'I5dK-pBipdU'

# NOTE(review): `scopes` is assigned but not read by any cell visible here — confirm before removing.
scopes = ["https://www.googleapis.com/auth/youtube.force-ssl"]
# The API key is read from a JSON file two directories up instead of being hardcoded in the notebook.
with open('../../api_key.json') as json_file:
    cred = json.load(json_file)
api_key = cred['api_key']
# Disable OAuthlib's HTTPS verification when running locally.
# *DO NOT* leave this option enabled in production.
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
api_service_name = "youtube"
api_version = "v3"

# Request the details of the test video, keep the first entry of the response's 'items',
# and reduce it to a stats dict with ytr.get_vid_stats.
cur_vid = ytr.request_video_details(test_video_id, api_key, api_service_name, api_version)['items'][0]
metadata = ytr.get_vid_stats(cur_vid)

In [5]:
metadata

{'channel_id': 'UCrkfdiZ4pF3f5waQaJtjXew',
 'channel_title': 'GamingWithKev',
 'thumbnail_link': 'https://i.ytimg.com/vi/I5dK-pBipdU/maxresdefault.jpg',
 'title': 'Playing as the KILLER in Dead By Roblox!',
 'date': datetime.datetime(2020, 4, 4, 0, 56, 2, tzinfo=tzutc()),
 'views': 328277,
 'likes': 10780,
 'dislikes': 263,
 'comments': 1039}

## Possible success metrics
Available measurements:
- date
- views (COUNT)
- likes (COUNT)
- dislikes (COUNT)
- comments (COUNT)

Metadata recommendations from here (page 6-7):
https://www.sciencedirect.com/science/article/pii/S187705091731757X/pdf?md5=5961456d3adc0f65e03cff17279317d2&pid=1-s2.0-S187705091731757X-main.pdf

For a selected video, take the videos from its parent channel that are closest to it in upload date, z-score each of the available measurements across that group, and then average the resulting z-scores.

Weight views higher?

Make another function that accepts a list of video ids as input.

## Statistics
#### (WIP)
Add in a measurement for video time length

Expected new input format:

```python
{'video_id': '4uf2x-O_pBw',
 'position': '0',
 'channel_videos': ['4uf2x-O_pBw', '8cu4qAoeXow', 'TZK-sk5Yvfw'],
 'channel_id': str}
```

In [7]:
def get_channel_vids_stats(videos):
    """Collect the stats of every video id in ``videos`` into one DataFrame.

    Each id is looked up with ``ytr.request_video_details`` (using the
    notebook-level ``api_key``, ``api_service_name`` and ``api_version``), the
    first entry of the response's ``'items'`` is reduced with
    ``ytr.get_vid_stats``, and the results become the rows of the returned
    frame, in the same order as ``videos``.
    """
    def _fetch_row(vid):
        response = ytr.request_video_details(vid,
                                             api_key,
                                             api_service_name,
                                             api_version)
        row = ytr.get_vid_stats(response['items'][0])
        # Pause one second after every request (presumably to throttle API usage).
        time.sleep(1)
        return row

    return pd.DataFrame([_fetch_row(vid) for vid in videos])

In [8]:
def video_success(in_dict, position_weight=1.5):
    """Score a video relative to the other videos of its channel.

    The score is a position bonus plus the sum of the video's z-scores for
    comments, dislikes, likes and views, where the z-scores are computed
    across all videos listed in ``in_dict["channel_videos"]``.

    Parameters
    ----------
    in_dict : dict
        Must provide ``"channel_videos"`` (list of video ids) and
        ``"position"`` (row of the target video in the stats table built from
        ``channel_videos``). ``"position"`` may be an int or a numeric string
        such as ``'0'``.
    position_weight : float, default 1.5
        Multiplier applied to the position bonus.

    Returns
    -------
    float
        ``position_score + sum of the four z-scores`` for the target video.

    Raises
    ------
    ValueError
        If ``in_dict["position"]`` cannot be converted to an int.
    ZeroDivisionError
        If ``in_dict["channel_videos"]`` is empty.
    """
    channel_videos = in_dict["channel_videos"]
    # The documented input format carries the position as a string ('0'), which
    # can neither be divided nor used as an integer row label; normalise it
    # up front, before any API requests are made.
    position = int(in_dict["position"])

    vid_stats = get_channel_vids_stats(channel_videos)

    # A lower number position means the video is more popular.
    position_score = (1 - (position / len(channel_videos))) * position_weight

    zscores = vid_stats[["comments", "dislikes", "likes", "views"]].apply(scipy.stats.zscore)

    return position_score + zscores.loc[position].sum()

In [12]:
def video_success(video_id, channel_id):
    """Placeholder for a success score computed from a video id and channel id.

    Not implemented yet: the body is empty and the function returns None.

    NOTE(review): this reuses the name of the ``video_success(in_dict)``
    function defined in an earlier cell, so running this cell replaces that
    implementation with this stub.
    """
    pass

## Metadata creation

In [6]:
# Path prefix for the files this notebook reads. It is assigned here because this
# cell is the first one (top to bottom) that reads ROOT_DIR; without it a fresh
# kernel run in order fails with NameError.
ROOT_DIR = "../../"

scopes = ["https://www.googleapis.com/auth/youtube.force-ssl"]
# The API key is read from a JSON file instead of being hardcoded in the notebook.
with open(ROOT_DIR + "api_key.json") as json_file:
    cred = json.load(json_file)
api_key = cred['api_key']
# Disable OAuthlib's HTTPS verification when running locally.
# *DO NOT* leave this option enabled in production.
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
api_service_name = "youtube"
api_version = "v3"

In [76]:
import json
ROOT_DIR = "../../"
# Load the scraping configuration (directory names and the selected game).
with open(ROOT_DIR + "config/config-scraping.json") as f:
    cfg = json.load(f)

# Map each scrape date ("date_scraped") to the records stored under "data"
# in the corresponding file of the selected game's videos directory.
data = {}
fp = ROOT_DIR + cfg["videos-dir"] + cfg["selected-game"] + '/'
# sorted() gives a deterministic load order, so which file wins when two files
# share a "date_scraped" value does not depend on the filesystem.
for fname in sorted(os.listdir(fp)):
    # This notebook also writes '<game>_metadata.csv' into this directory;
    # skip CSV files so a re-run does not pass them to json.load.
    if fname.endswith(".csv"):
        continue
    with open(fp + fname) as f:
        read_data = json.load(f)
    data[read_data["date_scraped"]] = read_data["data"]

In [122]:
def get_vid_metadata(video_id):
    """Fetch one video's details and flatten them into a single pandas Series.

    The Series contains the entries of the response item's ``"statistics"``,
    updated with its ``"snippet"`` entries (minus a few unwanted keys), plus a
    ``"video_id"`` entry holding the id that was requested.
    """
    item = ytr.request_video_details(video_id, api_key, api_service_name, api_version)['items'][0]
    record = item["statistics"]
    snippet = item["snippet"]

    # Remove the snippet fields that should not end up in the metadata;
    # keys that are absent are simply ignored.
    for dropped in ('liveBroadcastContent', 'localized', 'defaultAudioLanguage'):
        snippet.pop(dropped, None)

    record.update(snippet)
    record["video_id"] = video_id
    return pd.Series(record)

In [137]:
def make_mdata_df(data):
    """Build a metadata DataFrame, indexed by video id, for a list of scraped videos.

    Parameters
    ----------
    data : iterable of dict
        Scraped video records; each must have a ``"video_id"`` entry.

    Returns
    -------
    pandas.DataFrame
        One row per video, produced by ``get_vid_metadata``, with ``video_id``
        moved from a column to the index.
    """
    video_ids = [vid["video_id"] for vid in data]

    # get_vid_metadata is the per-video fetch helper defined in this notebook;
    # it returns a Series per id, which apply() expands into DataFrame columns.
    df = pd.Series(video_ids).apply(get_vid_metadata)
    return df.set_index("video_id", drop=True)

In [138]:
# Build the metadata table for the videos stored under the scrape-date key "04_16_20".
df = make_mdata_df(data["04_16_20"])

In [143]:
# Save the metadata table as '<game>_metadata.csv' inside the selected game's videos directory.
# NOTE(review): the data-loading cell reads files from this same directory, so it
# must not treat this CSV as scraped JSON.
game = cfg["selected-game"]
df.to_csv(ROOT_DIR + cfg["videos-dir"] + game + '/{}_metadata.csv'.format(game))

## Downloading thumbnails

In [127]:
import requests

In [128]:
def download_vid_thumb(video_id, df, save_dir, res="default", timeout=10):
    """Download one video's thumbnail and save it as ``<save_dir><video_id>.jpg``.

    Parameters
    ----------
    video_id : str
        Index label of the video in ``df``; also used as the file name.
    df : pandas.DataFrame
        Metadata table indexed by video id whose ``"thumbnails"`` cells map a
        resolution name to a dict containing a ``"url"`` entry.
    save_dir : str
        Directory prefix the file name is appended to (no separator is added).
    res : str, default "default"
        Key of the thumbnail resolution to download.
    timeout : float, default 10
        Seconds to wait for the server before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    url = df.loc[video_id, "thumbnails"][res]["url"]

    # Fetch and validate the response before opening the output file, so a failed
    # request neither leaves an empty file nor saves an error page as a .jpg.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    with open(save_dir + video_id + ".jpg", 'wb') as f:
        f.write(response.content)

In [129]:
def download_df_thumbs(df, save_dir, res="default"):
    """Download the thumbnail of every video in ``df`` into ``<save_dir><res>/``.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata table indexed by video id; passed through to
        ``download_vid_thumb`` for each index label.
    save_dir : str
        Directory prefix; the resolution name and a trailing ``/`` are appended
        to it to form the target directory.
    res : str, default "default"
        Thumbnail resolution key; also the name of the target subdirectory.
    """
    save_dir += res + '/'
    # makedirs also creates missing parent directories, and exist_ok=True avoids
    # the gap between checking for the directory and creating it.
    os.makedirs(save_dir, exist_ok=True)

    for v_id in df.index:
        download_vid_thumb(v_id, df, save_dir, res)

In [139]:
# Download the "medium" thumbnail of every video in df into the 'medium/' subfolder
# of the selected game's thumbnails directory.
download_df_thumbs(df, ROOT_DIR + cfg["thumbnails-dir"] + cfg["selected-game"] + '/', res="medium")