# Simple example working with video data

In [1]:
import findspark
findspark.init()

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import udf, size, col
from pyspark.sql.types import (
    ArrayType,
    FloatType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)

spark = (
    SparkSession
    .builder
    .appName("youtube-video-to-frames")
    .config("spark.executor.memory", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.sql.execution.arrow.enabled", "true")
    .config("spark.jars", "../target/scala-2.12/rikai_2.12-0.0.1.jar")
    .master("local[*]")
    .getOrCreate()
)

## We'll work with a small set of music videos from The Hu

In [2]:
# Youtube video ids
the_hu = {
    'Sad But True': 'QpxA_ZxGX_M',
    'Sugaan Essena': 'YwutOqv4cGo',
    'The Great Chingghis Khaan': 'pD1gDSao1eA',
    'The Gereg': 'pu5jxc2pZtE',
    'Wolf Totem': 'jM8dCGIm6yc',
    'Yuve Yuve Yu': 'v4xZUr0BEfE'
}

In [3]:
from rikai.spark.types import YouTubeVideoType
from rikai.types import YouTubeVideo
import pandas as pd

pdf = (pd.DataFrame({'vid': the_hu})
      .reset_index()
      .rename(columns={'index':'name'}))

pdf['youtube_video'] = pdf.vid.apply(YouTubeVideo)

schema = StructType(
    [
        StructField("name", StringType(), False),
        StructField("vid", StringType(), False),        
        StructField("youtube_video", YouTubeVideoType(), False)
    ]
)

df = spark.createDataFrame(pdf, schema=schema)

  Unsupported type in conversion to Arrow: YouTubeVideoType
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


### We can read/write Rikai format with native video types

In [4]:
df.coalesce(1).write.mode('overwrite').format('rikai').save('/tmp/thehu')

In [5]:
df = spark.read.format("rikai").load("/tmp/thehu")
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- vid: string (nullable = true)
 |-- youtube_video: youtubevideo (nullable = true)

+--------------------+-----------+--------------------+
|                name|        vid|       youtube_video|
+--------------------+-----------+--------------------+
|        Sad But True|QpxA_ZxGX_M|YouTubeVideo(vid=...|
|       Sugaan Essena|YwutOqv4cGo|YouTubeVideo(vid=...|
|           The Gereg|pu5jxc2pZtE|YouTubeVideo(vid=...|
|The Great Chinggh...|pD1gDSao1eA|YouTubeVideo(vid=...|
|          Wolf Totem|jM8dCGIm6yc|YouTubeVideo(vid=...|
|        Yuve Yuve Yu|v4xZUr0BEfE|YouTubeVideo(vid=...|
+--------------------+-----------+--------------------+



In [6]:
yt = df.first()['youtube_video']
yt

In [7]:
v = yt.get_stream()
v

## We make it easy to sample the frames for deep learning

In [8]:
from rikai.torch import DataLoader, make_video_sampler
from rikai.types.video import SingleFrameGenerator

sampler = make_video_sampler(SingleFrameGenerator(max_samples=2))

data_loader = DataLoader(
    '/tmp/thehu',
    batch_size=1,
    shuffle=True,
    transform_fn=sampler
)
for example in data_loader:
    print(example)

2021-01-14 21:07:52,465 INFO Rikai (dataset.py:77): Loading parquet files: ['file:///tmp/thehu/part-00000-668358ff-4924-4ad7-b12f-8516d242cc24-c000.snappy.parquet']


[{'name': 'Sugaan Essena', 'vid': 'YwutOqv4cGo', 'youtube_video': YouTubeVideo(YwutOqv4cGo), 'fno': 0, 'frame': array([[[ 43,  26,  20],
        [ 45,  28,  22],
        [ 47,  30,  24],
        ...,
        [ 13,   3,   3],
        [ 13,   3,   3],
        [ 13,   3,   3]],

       [[ 46,  29,  23],
        [ 47,  30,  24],
        [ 47,  30,  24],
        ...,
        [ 13,   3,   3],
        [ 13,   3,   3],
        [ 13,   3,   3]],

       [[ 52,  35,  29],
        [ 52,  35,  29],
        [ 50,  33,  27],
        ...,
        [ 13,   3,   3],
        [ 13,   3,   3],
        [ 13,   3,   3]],

       ...,

       [[220, 212, 214],
        [220, 212, 214],
        [220, 212, 214],
        ...,
        [ 93,  75,  67],
        [ 96,  78,  70],
        [ 93,  75,  67]],

       [[220, 212, 214],
        [220, 212, 214],
        [220, 212, 214],
        ...,
        [102,  82,  72],
        [100,  80,  70],
        [100,  80,  70]],

       [[220, 212, 214],
        [220, 212, 214],
