As a brief background:

I want to gather scores for a subset of TikTok data to compare the quality of:
- Our models
- Appen managed services
- Our moderator team

On Meta, this is an easy task, as there is infrastructure built into our resolvers to do this.

This does not exist for TikTok.

I created a protocol that could possibly facilitate the scoring of arbitrary TikTok content.

The problem?

*There are no nodes that do frame extraction/thumbnails for TikTok videos*
This is a notebook to bridge that gap.

Input: TikTok content with Metadata/videos downloaded
Output: TikTok content with frames extracted and "content_type" set to "VIDEO"


In [5]:
import itertools
from qa_frame.models.avro.schemas.video import Content, Video
from requests import RequestException

from base import BaseScript

In [6]:
from qa_frame.models.environment import EnvConfig

# Environment configuration for this notebook, obtained from EnvConfig.
CONFIG = EnvConfig.get()
# Inference compare run whose content is read from Snowflake further down.
RUN_ID = 1675808881

In [26]:
import snowflake.connector
from qa_frame.interfaces.db.snowflake import SnowflakeDriver


class InferenceCompareReader(SnowflakeDriver):
    """Snowflake reader for rows of ``STAGING.QA_FRAME_INFERENCE_COMPARE``."""

    def read_content_from_run_id(self, run_id: int | str) -> list[Content]:
        """
        Fetch the content recorded for a single inference compare run.

        Parameters
        ----------
        run_id : int | str
            Identifier of the inference compare run to look up

        Returns
        -------
        contents : list[Content]
            One ``Content`` per returned row, built from the row's
            ``OUTPUT_DATA`` column
        """
        # NOTE(review): the placeholder is wrapped in quotes ('%s'); whether that
        # is correct depends on how SnowflakeDriver._read binds its arguments --
        # confirm before passing string run IDs.
        rows = self._read(
            """
            SELECT OUTPUT_DATA
            FROM STAGING.QA_FRAME_INFERENCE_COMPARE
            WHERE RUN_ID = '%s'
            """, run_id
        )
        return [Content(**row["OUTPUT_DATA"]) for row in rows]

# Both the connection arguments and the environment come from the snowflake
# section of the environment config.
snowflake_settings = CONFIG.interface.db.snowflake
reader = InferenceCompareReader(
    snowflake.connector.connect(**snowflake_settings.__dict__),
    snowflake_settings.environment,
)

In [21]:
# Show the content IDs stored for RUN_ID.
run_contents = reader.read_content_from_run_id(RUN_ID)
print([content.content_id for content in run_contents])

[]


In [8]:
"""
Gathers TikTok video IDs that need to be scored
"""
from qa_frame.interfaces.external.s3 import S3Client
from qa_frame.consts.providers.s3 import SamplePostsCSV


# Client used to read the sample-posts CSV from S3.
s3 = S3Client(CONFIG.external.s3_client)

# Each row's content_id has its first and last character dropped before it is
# wrapped in a Video.
# NOTE(review): presumably those characters are surrounding quotes -- confirm
# against the CSV.
tiktok_video_ids = s3.read_csv(
    url=str(SamplePostsCSV.AMS_HPQ_TT_DATA.value),
    filter_=lambda csv_: (Video(content_id=row["content_id"][1:-1]) for row in csv_),
)  # Formatted as video.Video objects


In [17]:
"""
Gathers metadata for our video IDs

Must port-forward the TikTok metadata service
"""
from qa_frame.interfaces.api.clients.seldon import SeldonClient


class GatherTiktokMetadataForVideos(BaseScript):
    """Fetch metadata for the sampled TikTok videos and write it out as JSON.

    ``tiktok_video_ids`` is sent to the metadata service in fixed-size batches
    through a ``SeldonClient``; every returned video whose
    ``metadata_fetch_status`` is 200 is written to
    ``tiktok_videos_with_metadata``.
    """

    name = "gather_tiktok_metadata"

    def __init__(self):
        super().__init__()
        # The client targets localhost:8080, so the TikTok metadata service
        # must be port-forwarded before this script is run.
        self.client = SeldonClient[Video, Video](base_url="http://localhost:8080/api/v1.0")

    def run(self):
        """Request metadata batch by batch, retrying each batch until it succeeds.

        A batch that raises ``RequestException`` is retried indefinitely (as
        before), but each failure is now logged and followed by a short pause
        instead of spinning in a silent tight loop.
        """
        import time  # local import so this notebook cell stays self-contained

        batch_size = 50
        retry_delay_seconds = 1.0
        scored_content_count = 0

        # islice only advances through the data when given an iterator. Wrapping
        # once in iter() makes batching correct whether tiktok_video_ids is a
        # generator or a list (a list would otherwise yield its first batch
        # forever).
        videos = iter(tiktok_video_ids)

        while batch := list(itertools.islice(videos, batch_size)):
            while True:
                try:
                    # Filtering stays inside the try in case the client
                    # produces its results lazily.
                    predictions = [v for v in self.client.predictions(batch) if v.metadata_fetch_status == 200]
                except RequestException as exc:
                    self.logger.warning(
                        "Prediction request failed, retrying",
                        error=str(exc),
                        batch_size=len(batch),
                    )
                    time.sleep(retry_delay_seconds)
                else:
                    break

            scored_content_count += len(predictions)
            self._write_json_files("tiktok_videos_with_metadata", *predictions)
            self.logger.info("Predictions received", total_count=scored_content_count)

# Instantiate the script and run it as soon as this cell executes.
GatherTiktokMetadataForVideos().run()

{
  "asctime": "2023-02-07 16:15:05",
  "levelname": "INFO",
  "lineno": 54,
  "msg": "Initialized output directory",
  "output_path": "/Users/ryan.demarigny/PycharmProjects/qa-frame-notebooks/notebooks/output/gather_tiktok_metadata"
}

{[37m[39;49;00m
[37m  [39;49;00m[94m"asctime"[39;49;00m:[37m [39;49;00m[33m"2023-02-07 16:15:07"[39;49;00m,[37m[39;49;00m
[37m  [39;49;00m[94m"levelname"[39;49;00m:[37m [39;49;00m[33m"INFO"[39;49;00m,[37m[39;49;00m
[37m  [39;49;00m[94m"lineno"[39;49;00m:[37m [39;49;00m[34m114[39;49;00m,[37m[39;49;0