In [1]:
from warnings import simplefilter
simplefilter('ignore')
from extract_video_info import extract_by_id
from api_utils import linear_pop_metric
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from PIL import Image

tqdm.pandas()

In [4]:
# Resize each image to (256, 256), as this is the maximum size we will use.
# Converting now saves space; the original size is recorded as features.
def resize_image(image_path:str, new_size:tuple):
    """Resize one image on disk and store the result as a ``.jpg`` beside it.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    new_size : tuple
        ``(width, height)`` handed to ``Image.resize``.

    Returns
    -------
    tuple
        ``(file_name, (width, height))`` where ``file_name`` is the base name
        of the saved ``.jpg`` and the size is that of the image *before*
        resizing. ``(np.nan, (np.nan, np.nan))`` is returned when the file is
        missing or cannot be opened/resized.
    """
    try:
        # The context manager closes the source file even when resize() fails
        # part-way, so unreadable files do not leak open handles.
        with Image.open(image_path) as image:
            size = image.size
            resized_image = image.resize(new_size)
    except (FileNotFoundError, OSError):
        return np.nan, (np.nan, np.nan)

    image_name, ext = os.path.splitext(image_path)
    image_name = image_name + '.jpg'
    # JPEG cannot store an alpha channel or a palette; flatten those modes first.
    if resized_image.mode in ("RGBA", "LA", "P"):
        resized_image = resized_image.convert("RGB")
    resized_image.save(image_name)
    # Delete the source only when it is a different file from the one just
    # written. On case-insensitive filesystems 'x.JPG' and 'x.jpg' are the same
    # file, and removing the source would delete the freshly saved output.
    if ext != '.jpg' and not os.path.samefile(image_path, image_name):
        os.remove(image_path)
    return os.path.split(image_name)[1], size


def resize_images(image_name:str, image_folder:str, new_size:tuple=(256, 256)):
    """Resize one thumbnail and return its new name and original dimensions.

    Intended for use with ``Series.apply`` over a column of thumbnail names.

    Parameters
    ----------
    image_name : str
        File name of the thumbnail inside ``image_folder``. A null value
        (``None``/``NaN``) is accepted and yields an all-NaN result.
    image_folder : str
        Folder that contains the thumbnail.
    new_size : tuple
        Target ``(width, height)``; defaults to ``(256, 256)``.

    Returns
    -------
    pd.Series
        Entries ``thumb_name``, ``thumb_width`` and ``thumb_height``.
    """
    if pd.isna(image_name):
        # No thumbnail recorded for this row: os.path.join would raise a
        # TypeError on NaN/None and abort the whole apply, so report "missing".
        name, width, height = np.nan, np.nan, np.nan
    else:
        image_path = os.path.join(image_folder, image_name)
        name, (width, height) = resize_image(image_path, new_size)
    return pd.Series({'thumb_name': name, 'thumb_width': width, 'thumb_height': height})


# takes a dataframe like that extracted by api_utils.py and adds features from youtube-dl tool including thumbnail image
# will save over input file after each batch if save_after_each_batch=True, 
# otherwise it will not save the result and only return the result
def continue_extraction(df_path:str, thumb_folder:str, batch_size:int=1000, batches:int=5, save_after_each_batch:bool=True):
    """Resume feature extraction for videos that have not been processed yet.

    Reads the JSON dataframe at ``df_path``, then repeatedly picks up to
    ``batch_size`` rows whose ``subtitles`` and ``thumb_name`` are both null,
    fills the extracted columns via ``extract_by_id`` and resizes the
    downloaded thumbnails via ``resize_images``.

    Parameters
    ----------
    df_path : str
        Path of the JSON dataframe; it must contain a ``vid_id`` column. The
        file is overwritten after each batch when ``save_after_each_batch``
        is true.
    thumb_folder : str
        Folder passed to ``extract_by_id`` and used to locate thumbnails for
        resizing.
    batch_size : int
        Maximum number of rows processed per batch.
    batches : int
        Maximum number of batches to run.
    save_after_each_batch : bool
        When true, print progress and write the dataframe back to ``df_path``
        after every batch; otherwise the result is only returned.

    Returns
    -------
    pd.DataFrame
        The dataframe including the extracted columns.
    """
    df = pd.read_json(df_path)

    extract_list = ['description', 'duration', 'age_limit',
                    'categories', 'tags', 'is_live',
                    'width', 'height', 'fps', 'vcodec', 'vbr',
                    'acodec', 'abr', 'thumb_name', 'subtitles']

    # Create the output columns on the first run so the null checks below work.
    thumb_cols = ['thumb_width', 'thumb_height']
    if 'thumb_width' not in df.columns:
        df.loc[:, thumb_cols] = np.nan
    if 'subtitles' not in df.columns:
        df.loc[:, extract_list] = np.nan

    for batch in range(1, batches + 1):

        # A row is still pending while both extracted markers are null.
        # NOTE(review): rows for which extraction leaves both columns null are
        # selected again by the next batch - confirm extract_by_id always
        # writes a marker value on failure.
        pending = df['subtitles'].isnull() & df['thumb_name'].isnull()
        indexes = df[pending].index[:batch_size]
        if len(indexes) == 0:
            # Nothing left to do: skip the remaining batches instead of running
            # the extraction and rewriting the file on an empty selection.
            print('nothing left to extract')
            break

        print('getting data')
        df.loc[indexes, extract_list] = df.loc[indexes, 'vid_id'].progress_apply(lambda x: extract_by_id(x, thumb_folder))
        print('resizing images')
        df.loc[indexes, ['thumb_name', 'thumb_width', 'thumb_height']] = df.loc[indexes, 'thumb_name'].progress_apply(lambda x: resize_images(x, thumb_folder))

        if save_after_each_batch:
            print(f"After batch: {batch} of {batches} Shape of extracted: {df[~df['subtitles'].isnull()].shape} Shape of unextracted: {df[df['subtitles'].isnull()].shape}")
            df.to_json(df_path)

    return df

In [5]:
# Folder holding the downloaded thumbnails; passed to continue_extraction below.
# Left empty here - set it before running the notebook.
THUMB_FOLDER = ""

# Path of the JSON dataframe that continue_extraction reads (and overwrites).
df_path = ""


In [6]:
# Resume extraction: at most 10 batches of 5000 videos each, saving after every batch (the default).
df = continue_extraction(df_path=df_path, thumb_folder=THUMB_FOLDER, batch_size=5000, batches=10)

In [7]:
# Spot-check five random rows of the extracted dataframe.
df.sample(n=5)

In [8]:
# Merges two dataframes. Used when there is an updated api_utils.py dataframe or when the collection was running on multiple systems.
# When there is an updated api_utils.py dataframe, it should be passed as new_df;
# in the other situation the order does not matter.

def merge_dfs(orig_df, new_df, final_path):
    """Fill the gaps of one stored dataframe with values from another.

    Both inputs are paths to JSON dataframes with a ``vid_id`` column. Each is
    de-duplicated on ``vid_id`` and indexed by it; null cells of the ``new_df``
    frame are then filled from the matching row/column of the ``orig_df``
    frame. Only rows present in ``new_df`` appear in the result.

    Parameters
    ----------
    orig_df : str
        Path of the JSON dataframe that supplies the fill values.
    new_df : str
        Path of the JSON dataframe whose rows are kept and filled.
    final_path : str
        Path the merged dataframe is written to as JSON.

    Returns
    -------
    tuple
        ``(df1, df2)``: the ``orig_df`` frame indexed by ``vid_id`` and the
        merged frame with ``vid_id`` restored as a column.
    """
    def load_indexed(json_path):
        # One row per video, addressable by its id.
        frame = pd.read_json(json_path).drop_duplicates(subset='vid_id')
        return frame.set_index('vid_id', drop=True)

    def report_extracted(*frames):
        # Print the shape of the already-extracted part of each frame.
        for frame in frames:
            print(frame[frame['subtitles'].notnull()].shape)

    df1 = load_indexed(orig_df)
    df2 = load_indexed(new_df)

    extract_list = ['description', 'duration', 'age_limit',
                    'categories', 'tags', 'is_live',
                    'width', 'height', 'fps', 'vcodec', 'vbr',
                    'acodec', 'abr', 'thumb_name', 'subtitles',
                    'thumb_width', 'thumb_height']

    # A frame with no extraction done yet gets empty columns to be filled.
    if 'subtitles' not in df2.columns:
        df2.loc[:, extract_list] = np.nan

    report_extracted(df1, df2)

    # fillna aligns on the vid_id index and on column names.
    df2 = df2.fillna(df1).reset_index()

    report_extracted(df1, df2)
    df2.to_json(final_path)
    return df1, df2

In [7]:
# Merge the two stored dataframes; the paths are left blank here.
df1, df2 = merge_dfs(orig_df='', new_df='', final_path='')

(471329, 33)
(441329, 33)
(471329, 33)
(546829, 34)


In [2]:
# Load a stored dataframe into df2 for the cleaning step; the path is left blank here.
df2 = pd.read_json(path_or_buf='')

In [18]:
from tqdm import tqdm_notebook
from shutil import copyfile
import os
from pathlib import Path

# this is used to clean the dataframe, i.e, remove nulls and ---missing--- tag in the subtitles and thumb_name columns
# then applies the linear_pop_metric. it then shuffles the data and splits it. one set for both neural net models and 
# another for the ensemble models
def data_clean_and_split(df, dest_folder, nn_ens_split:float=.8, save_data:bool=True, random_state=None):
    """Clean the dataset, add the popularity metric and split it by model type.

    Steps: drop rows without like/view/comment counts, merge in ``pop_metric``
    from ``linear_pop_metric`` (which must return ``vid_id`` and
    ``pop_metric`` columns), drop rows with neither subtitles nor thumbnail,
    shuffle, then split into a neural-net part and an ensemble part.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is not modified.
    dest_folder : str
        Folder the three ``*.json.gz`` files are written to when
        ``save_data`` is true.
    nn_ens_split : float
        Fraction of the shuffled rows assigned to the neural-net models; the
        rest goes to the ensemble set.
    save_data : bool
        Whether to write the three resulting frames to ``dest_folder``.
    random_state : optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle (and therefore
        the split) can be reproduced. ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    tuple
        ``(df_cnn, df_rnn, df_ensemble)``: neural-net rows with a usable
        thumbnail, neural-net rows with usable subtitles, and ensemble rows
        with both.
    """
    print(df.shape)
    print('applying metric')
    # dropna already returns a new frame, so no defensive copy of df is needed.
    df_ = df.dropna(subset=['vid_likecount', 'vid_viewcount', 'vid_commentcount'])
    df_ = df_.merge(linear_pop_metric(df_)[['vid_id', 'pop_metric']], on='vid_id')
    df_ = df_.dropna(subset=['pop_metric'])
    df_ = df_.dropna(subset=['subtitles', 'thumb_name'], how='all')
    print('before split')
    print(df_.shape)
    # Shuffle all rows; pass random_state for a reproducible split.
    df_ = df_.sample(df_.shape[0], random_state=random_state)

    split_at = int(nn_ens_split * df_.shape[0])
    df_nn = df_.iloc[:split_at, :]
    df_ensemble = df_.iloc[split_at:, :]
    print('after split')
    print(f'NN models data.shape {df_nn.shape}, ensemble model data.shape {df_ensemble.shape}')

    # Rows carrying the '---missing---' marker are unusable for the model that
    # needs that feature.
    drop_index_sub = df_nn[df_nn['subtitles'] == '---missing---'].index
    drop_index_image = df_nn[df_nn['thumb_name'] == '---missing---'].index
    drop_index_ensemble = df_ensemble[((df_ensemble['thumb_name'] == '---missing---') | (df_ensemble['subtitles'] == '---missing---'))].index

    # The CNN set needs a thumbnail, the RNN set needs subtitles, the ensemble
    # set needs both.
    df_cnn = df_nn.drop(drop_index_image, axis=0).dropna(subset=['thumb_name'])
    df_rnn = df_nn.drop(drop_index_sub, axis=0).dropna(subset=['subtitles'])
    df_ensemble = df_ensemble.drop(drop_index_ensemble, axis=0).dropna(subset=['thumb_name', 'subtitles'])

    print('after drops')
    print(df_cnn.shape, df_rnn.shape, df_ensemble.shape)

    if save_data:
        base_dest_path = Path(dest_folder)
        df_cnn.to_json(str(base_dest_path / 'cnn_data.json.gz'))
        df_rnn.to_json(str(base_dest_path / 'rnn_data.json.gz'))
        df_ensemble.to_json(str(base_dest_path / 'ens_data.json.gz'))

    return df_cnn, df_rnn, df_ensemble


def split_images(nn_json_name:str, ens_json_name:str, source_folder:str, dest_folder:str):
    """Copy each dataset's thumbnails into its own image folder.

    For the neural-net dataset and the ensemble dataset (JSON files inside
    ``dest_folder``), every file named in ``thumb_name`` is copied from
    ``source_folder`` to ``<dest_folder>/nn_data/images`` or
    ``<dest_folder>/ens_train/images`` respectively. If any thumbnail is
    missing, a copy of the dataset without those rows is written to
    ``<dest_folder>/conflict/-<dataset file name>``.

    Parameters
    ----------
    nn_json_name : str
        File name of the neural-net dataset inside ``dest_folder``.
    ens_json_name : str
        File name of the ensemble dataset inside ``dest_folder``.
    source_folder : str
        Folder that currently holds all thumbnails.
    dest_folder : str
        Base folder for the datasets and the created image folders.
    """
    source_path = Path(source_folder)
    base_dest_path = Path(dest_folder)
    conflict = base_dest_path / 'conflict'

    nn_json_path = base_dest_path / nn_json_name
    ens_json_path = base_dest_path / ens_json_name

    nn_images_path = base_dest_path / 'nn_data' / 'images'
    ens_images_path = base_dest_path / 'ens_train' / 'images'

    paths = [nn_images_path, ens_images_path, conflict]
    for p in paths:
        p.mkdir(parents=True, exist_ok=True)

    for dest, dataset_path in zip(paths[:2], [nn_json_path, ens_json_path]):

        dataset = pd.read_json(dataset_path)
        counter = 0
        missing = set()
        # Iterate over a detached list and only record misses: editing the
        # frame inside the loop costs a full column scan per miss and can
        # change the very values being iterated.
        for file in tqdm_notebook(dataset['thumb_name'].tolist()):
            if not isinstance(file, str):
                # Row without a thumbnail name: nothing to copy. It is dropped
                # below together with the missing files.
                continue
            try:
                copyfile(source_path / file, dest / file)
            except FileNotFoundError:
                missing.add(file)
                counter += 1

        if missing:
            dataset.loc[dataset['thumb_name'].isin(missing), 'thumb_name'] = np.nan

        if dataset['thumb_name'].isnull().sum() > 0:
            print(f'discrepancy found {counter} missing files. saving new data')
            dataset = dataset.dropna(subset=['thumb_name'])
            dataset.to_json(conflict / ('-' + dataset_path.name))
                

In [13]:
# Clean df2 and split it into the CNN, RNN and ensemble datasets; the destination folder is left blank here.
df_cnn, df_rnn, df_ensemble = data_clean_and_split(df=df2, dest_folder='')

(1120959, 34)
applying metric
before split
(554177, 35)
after split
NN models data.shape (443341, 35), ensemble model data.shape (110836, 35)
(428023, 35) (293475, 35) (71233, 35)


In [14]:
# Null count per column of the ensemble dataset.
df_ensemble.isna().sum()

vid_id                  0
chan_query              0
chan_id                 0
chan_name               0
chan_viewcount          0
chan_subcount           0
chan_start_dt           0
chan_thumb              0
chan_vidcount           0
vid_name                0
vid_publish_dt          0
vid_thumb               0
vid_duration            0
vid_caption             0
vid_viewcount           0
vid_likecount           0
vid_commentcount        0
description          1349
duration                0
age_limit               0
categories              0
tags                    0
is_live             71233
width                   1
height                  1
fps                     1
vcodec                  0
vbr                 10636
acodec                  0
abr                    27
thumb_name              0
subtitles               0
thumb_width             0
thumb_height            0
pop_metric              0
dtype: int64

In [15]:
# Null count per column of the CNN dataset.
df_cnn.isna().sum()

vid_id                   0
chan_query               0
chan_id                  0
chan_name                0
chan_viewcount           0
chan_subcount            0
chan_start_dt            0
chan_thumb               0
chan_vidcount            0
vid_name                 0
vid_publish_dt           0
vid_thumb                0
vid_duration             0
vid_caption              0
vid_viewcount            0
vid_likecount            0
vid_commentcount         0
description           9715
duration                 0
age_limit                0
categories               0
tags                     0
is_live             428023
width                    0
height                   0
fps                      0
vcodec                   0
vbr                  62352
acodec                   0
abr                    163
thumb_name               0
subtitles             8612
thumb_width              0
thumb_height             0
pop_metric               0
dtype: int64

In [16]:
# Null count per column of the RNN dataset.
df_rnn.isna().sum()

vid_id                   0
chan_query               0
chan_id                  0
chan_name                0
chan_viewcount           0
chan_subcount            0
chan_start_dt            0
chan_thumb               0
chan_vidcount            0
vid_name                 0
vid_publish_dt           0
vid_thumb                0
vid_duration             0
vid_caption              0
vid_viewcount            0
vid_likecount            0
vid_commentcount         0
description           5433
duration                 0
age_limit                0
categories               0
tags                     0
is_live             293475
width                    0
height                   0
fps                      0
vcodec                   0
vbr                  43059
acodec                   0
abr                     84
thumb_name            8232
subtitles                0
thumb_width           8232
thumb_height          8232
pop_metric               0
dtype: int64

In [19]:
# Copy the thumbnails of both datasets into their image folders; all paths are left blank here.
split_images(nn_json_name='', ens_json_name='', source_folder='', dest_folder='')

  0%|          | 0/428023 [00:00<?, ?it/s]

  0%|          | 0/71233 [00:00<?, ?it/s]