# Compile video results

For deep learning paper, March 28, 2023



In [2]:
# --- Imports (grouped: standard library, then third-party) ---
from pathlib import Path

import numpy as np
import pandas as pd
import pyprojroot

# --- Project and data locations ---
# Project root as resolved by pyprojroot.
dir_proj = pyprojroot.here()
print("Project directory:", dir_proj)

# Video directory (absolute path on the analysis machine).
dir_videos = Path("/vape/collection/appended_scrape_download/TikTok/Influencers/tot_coding")
print("Video directory:", dir_videos)

# --- Video list ---
# The CSV path is given relative to the video directory; the "../" prefix
# places it one level above dir_videos.
file_video_list = "../analytic_sample_17361_20192022.csv"
video_list_path = dir_videos / file_video_list
video_df = pd.read_csv(video_list_path)
video_df.info()

# --- Output location ---
# Detection results go in a folder named after the video list file
# (file name without its extension).
video_list_stem = Path(file_video_list).stem
dir_output = pyprojroot.here() / "data/detections/" / video_list_stem
print("Analysis output dir:", dir_output)

Project directory: /home/ck37/projects/ecig-vaping
Video directory: /vape/collection/appended_scrape_download/TikTok/Influencers/tot_coding
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         17361 non-null  int64 
 1   infl_username      17361 non-null  object
 2   number             17361 non-null  int64 
 3   video_path         17361 non-null  object
 4   video_shortcode    17361 non-null  object
 5   year               17361 non-null  int64 
 6   year_recent        17361 non-null  int64 
 7   date_tot           17361 non-null  object
 8   n_videos_per_infl  17361 non-null  int64 
dtypes: int64(5), object(4)
memory usage: 1.2+ MB
Analysis output dir: /home/ck37/projects/ecig-vaping/data/detections/analytic_sample_17361_20192022


In [3]:
# Check progress:
#   *.pkl     = scored video result (one pickle per video)
#   *.parquet = frames processed by this script
# Prints two counts: the number of .pkl files, then the number of .parquet
# files, in the analysis output directory.
!ls -l {dir_output}/*.pkl | wc -l; ls -l {dir_output}/**.parquet | wc -l

14091
14072


In [5]:
# Extract all videos that have been scored (one .pkl file per video).
# Sorted so that the processing order, and therefore the row order of the
# combined frame built later, is identical on every run; the order in which
# glob yields files depends on the filesystem.
scored_videos = sorted(dir_output.glob("*.pkl"))
print("Scored videos found:", len(scored_videos))

Scored videos found: 14072


In [92]:
import pickle

def analyze_video_pkl(video_pkl):
    """Summarize one scored video as a per-frame table of peak detection scores.

    Parameters
    ----------
    video_pkl : str or pathlib.Path
        Path to a pickle file with one element per frame. Each frame element
        is indexed by class position (same order as ``classes`` below) and
        holds the detections for that class; element 4 of a detection is read
        as its predicted probability.

    Returns
    -------
    pandas.DataFrame
        One row per frame (index 0..n_frames-1). One float column per class
        holding the highest probability among that class's detections on the
        frame (0.0 when there are none), followed by 'video' (the file name
        without extension) and 'frame' (the frame number).

    Raises
    ------
    IndexError
        If a frame holds fewer class entries than there are classes.
    """
    video_pkl = Path(video_pkl)

    # NOTE: unpickling can execute arbitrary code. Only load pickles written
    # by our own scoring pipeline, never files from an untrusted source.
    with open(video_pkl, 'rb') as pkl_file:
        video_preds = pickle.load(pkl_file)
    video_name = video_pkl.stem

    n_frames = len(video_preds)
    print("Frames:", n_frames)

    # Current class order (9). The position of a class in this tuple is also
    # its position inside each frame of the model result.
    classes = ('box', 'e-cigarette brand name', 'e-juice', 'e-juice flavor', 'mod', 'pod', 'smoke cloud', 'synthetic nicotine label', 'warning label nicotine')

    # Fill a plain array first and build the dataframe once at the end;
    # assigning cell-by-cell through .loc is far slower on long videos.
    max_probs = np.zeros((n_frames, len(classes)), dtype=float)

    # Loop over each frame and keep the highest predicted prob for each class.
    for frame_i, frame in enumerate(video_preds):
        for class_i in range(len(classes)):
            # Probability of every detection of this class on this frame.
            frame_probs = [pred_i[4] for pred_i in frame[class_i]]

            # No detections leaves the pre-filled 0.0 in place.
            if len(frame_probs) > 0:
                max_probs[frame_i, class_i] = np.max(frame_probs)

    video_df = pd.DataFrame(max_probs,
                            index=np.arange(n_frames),
                            columns=list(classes))
    video_df['video'] = video_name
    video_df['frame'] = np.arange(n_frames)
    return video_df

In [None]:
# Build (or reload from the per-video parquet cache) the per-frame table for
# every scored video.
# The per-video frame is deliberately NOT called `video_df`, so this loop does
# not overwrite the video list dataframe loaded at the top of the notebook.
video_dfs = []
for video_i, video_pkl in enumerate(scored_videos):
    print(f"{video_i}: {video_pkl.stem}")
    output_file = dir_output / (video_pkl.stem + ".parquet")
    print(output_file)
    if not output_file.exists():
        frame_df = analyze_video_pkl(video_pkl)
        # Write under a temporary name and rename afterwards, so an
        # interrupted run cannot leave a truncated .parquet that a later run
        # would mistake for a finished cache file.
        tmp_file = output_file.with_name(output_file.name + ".tmp")
        frame_df.to_parquet(tmp_file)
        tmp_file.replace(output_file)
    else:
        frame_df = pd.read_parquet(output_file)
    video_dfs.append(frame_df)

# Combine all predictions into a single dataframe.
# The index restarts at 0 for each video; use the 'video' and 'frame' columns
# to identify a row.
pred_df = pd.concat(video_dfs)

pred_df.to_parquet(dir_proj / "data/unlabeled-video-frames.parquet")

In [100]:
# Roughly 10.3 million rows, one per video frame; the 11 columns are the
# 9 object classes plus 'video' and 'frame'.
pred_df.shape

(10276485, 11)

In [None]:
# Shape of the combined per-frame predictions table: (rows, columns).
pred_df.shape