# Simple example working with video data

In [1]:
# findspark.init() makes the local Spark installation's pyspark package
# importable, so it has to run before the pyspark imports below.
import findspark
findspark.init()

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import udf, size, col
from pyspark.sql.types import (
    ArrayType,
    FloatType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)

# Create (or reuse) a Spark session running locally on all available cores.
# The Rikai Spark package is pulled in through its Maven coordinates.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('rikai-quickstart')
    .config('spark.jars.packages', 'ai.eto:rikai_2.12:0.0.1-SNAPSHOT')
    .getOrCreate()
)

## We'll work with a small set of music videos from The Hu

In [2]:
# YouTube video ids, keyed by song title
the_hu = {
    "Sad But True": "QpxA_ZxGX_M",
    "Sugaan Essena": "YwutOqv4cGo",
    "The Great Chingghis Khaan": "pD1gDSao1eA",
    "The Gereg": "pu5jxc2pZtE",
    "Wolf Totem": "jM8dCGIm6yc",
    "Yuve Yuve Yu": "v4xZUr0BEfE",
}

In [3]:
from rikai.spark.types import YouTubeVideoType
from rikai.types import YouTubeVideo
import pandas as pd

# Build a pandas frame with one row per song: the dict keys (song titles)
# become the index, which reset_index() exposes as a column that is then
# renamed to "name". Each raw video id is also wrapped in a YouTubeVideo.
pdf = (
    pd.DataFrame({'vid': the_hu})
    .reset_index()
    .rename(columns={'index': 'name'})
    .assign(youtube_video=lambda frame: frame['vid'].apply(YouTubeVideo))
)

# Explicit Spark schema; the video column uses Rikai's YouTubeVideoType.
schema = StructType([
    StructField("name", StringType(), nullable=False),
    StructField("vid", StringType(), nullable=False),
    StructField("youtube_video", YouTubeVideoType(), nullable=False),
])

df = spark.createDataFrame(pdf, schema=schema)

### We can read/write Rikai format with native video types

In [4]:
# Write the DataFrame in Rikai format, replacing any previous output.
# coalesce(1) collapses the data into a single partition before writing.
(
    df.coalesce(1)
    .write
    .format('rikai')
    .mode('overwrite')
    .save('/tmp/thehu')
)

In [5]:
# Read the dataset back from disk; `df` now refers to the reloaded data.
df = spark.read.load("/tmp/thehu/", format="rikai")
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- vid: string (nullable = true)
 |-- youtube_video: youTubeVideo (nullable = true)

+--------------------+-----------+--------------------+
|                name|        vid|       youtube_video|
+--------------------+-----------+--------------------+
|        Sad But True|QpxA_ZxGX_M|YouTubeVideo(vid=...|
|       Sugaan Essena|YwutOqv4cGo|YouTubeVideo(vid=...|
|           The Gereg|pu5jxc2pZtE|YouTubeVideo(vid=...|
|The Great Chinggh...|pD1gDSao1eA|YouTubeVideo(vid=...|
|          Wolf Totem|jM8dCGIm6yc|YouTubeVideo(vid=...|
|        Yuve Yuve Yu|v4xZUr0BEfE|YouTubeVideo(vid=...|
+--------------------+-----------+--------------------+



In [6]:
# Take the video object from the first row of the reloaded DataFrame
yt = df.first().youtube_video
yt

In [7]:
# Ask the video object for its stream; the sampling loop further down
# iterates over such a stream to obtain individual frames.
v = yt.get_stream()
v

## We make it easy to sample the frames for deep learning

In [8]:
from rikai.torch import DataLoader

# Iterate over the Rikai dataset stored at /tmp/thehu, one record per batch,
# in shuffled order.
data_loader = DataLoader(
    '/tmp/thehu',
    batch_size=1,
    shuffle=True,
)

# Number of leading frames to print for each video. Kept small because every
# frame is printed as a full array.
max_frames = 1

for batch in data_loader:
    # batch_size=1, so the only record of the batch is at index 0
    yt = batch[0]['youtube_video']
    print(yt)
    # zip() consults range() first, so the stream is never asked for a frame
    # beyond max_frames (an enumerate/break loop would fetch one extra frame
    # before stopping).
    for fno, frame in zip(range(max_frames), yt.get_stream()):
        print('fno: ', fno)
        print('frame: ', frame)

2021-01-27 20:03:23,510 INFO Rikai (dataset.py:108): Loading parquet files: ['file:///tmp/thehu/part-00000-26b28f78-0cfb-43ae-be9d-0c97fe72878c-c000.snappy.parquet']


YouTubeVideo(v4xZUr0BEfE)
fno:  0
frame:  [[[1 1 1]
  [1 1 1]
  [1 1 1]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[1 1 1]
  [1 1 1]
  [1 1 1]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[1 1 1]
  [1 1 1]
  [1 1 1]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]]
YouTubeVideo(jM8dCGIm6yc)
fno:  0
frame:  [[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]]
YouTubeVideo(pD1gDSao1eA)
fno:  0
frame:  [[[1 1 1]
  [1 1 1]
  [1 1 1]
  ...
  [0 0 0]
