# OpenAI video testing

Conda env: gptwork

## Setup

In [2]:
from IPython.display import display, Image, Audio

import cv2  # We're using OpenCV to read video, to install !pip install opencv-python
import base64
import time
from openai import OpenAI
import os
import requests

In [3]:
client = OpenAI(api_key=os.environ.get("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",
                                       "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"))

## Functions

In [57]:
def load_video(video_path, verbose = True):
    video = cv2.VideoCapture(video_path)

    base64Frames = []
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
    
    video.release()
    if (verbose):
        print(len(base64Frames), "frames read.")

    return base64Frames

def query_video(frames, prompt,
                frame_sample = 100,
                max_frames = None,
                resize_pixels = 768,
                max_tokens = 200,
                model = "gpt-4o",
                temperature = 0,
                verbose = True):

    total_frames = len(frames)
    
    if max_frames is None:
        frame_slice = slice(0, total_frames, frame_sample)
    else:
        step_size = int(np.ceil(total_frames / max_frames))
        frame_slice = slice(0, total_frames, step_size)
    
    messages = [
        { # Prompt
            "role": "system",
            "content": prompt
        },
        { # Context in the form of image frames sampled from the video.
            "role": "user",
            "content": [
                *map(lambda x: {"image": x, "resize": resize_pixels}, frames[frame_slice]),
            ],
        },
    ]
    
    params = {
        "model": model,
        "temperature": temperature,
        "messages": messages,
        "max_tokens": max_tokens,
    }
    
    result = client.chat.completions.create(**params)
    if verbose:
        print(result.choices[0].message.content)
    return result

In [48]:
import numpy as np
total_frames = 60
max_frames = 4

frame_slice

slice(0, 60, 15)

## Prompt tests

In [51]:
# Load a video
frames = load_video("videos/vaping_modeling.mp4")

238 frames read.


In [63]:
# Run an Open API GPT query.
query_video(frames,
            ("These are frames from a video that I want to upload. Describe what is in this video. "+
            "Please also provide the numeric probability that there is vaping in each frame. Limit the description to three words per frame"),
           max_frames = 5);

Frame 1: Holding necklace - 0% probability of vaping
Frame 2: Exhaling smoke - 90% probability of vaping
Frame 3: Looking away - 0% probability of vaping
Frame 4: Walking forward - 0% probability of vaping
Frame 5: Walking forward - 0% probability of vaping


In [60]:
# Run an Open API GPT query.
query_video(frames,
            ("You are an expert in tobacco, e-cigarette, and vaping marketing. These are frames from a video for you to carefully evaluate. " +
             "Describe what is in this video using no more than 5 words per frame. "
             "In particular, note any products that are e-cigarettes, tobacco, mods, pods, e-juices, vaping, e-juice containers, or similar. " +
             "Also provide an accurate numeric probability in brackets that there is vaping in each frame. For example: Probability of vaping: [5%] "));

1. Holding a chain, outdoors. Probability of vaping: [5%]
2. Looking away, outdoors. Probability of vaping: [0%]
3. Walking, urban setting. Probability of vaping: [0%]


## Run on directory

In [61]:
from pathlib import Path
video_dir = Path("videos/GPT4_themes")
videos = list(video_dir.glob('**/*.mp4'))
print("Found", len(videos), "videos to analyze")

Found 102 videos to analyze


In [62]:
videos[:10]

[PosixPath('videos/GPT4_themes/fashion/fabio_fashion_3.mp4'),
 PosixPath('videos/GPT4_themes/fashion/chamillioneyes_fashion_1.mp4'),
 PosixPath('videos/GPT4_themes/fashion/fabio_fashion_2.mp4'),
 PosixPath('videos/GPT4_themes/fashion/fabio_fashion_18.mp4'),
 PosixPath('videos/GPT4_themes/fashion/fabio_fashion_19.mp4'),
 PosixPath('videos/GPT4_themes/fashion/fabio_fashion_14.mp4'),
 PosixPath('videos/GPT4_themes/fashion/fabio_fashion_4.mp4'),
 PosixPath('videos/GPT4_themes/fashion/fabio_fashion_17.mp4'),
 PosixPath('videos/GPT4_themes/fashion/fabio_fashion_8.mp4'),
 PosixPath('videos/GPT4_themes/fashion/fabio_fashion_9.mp4')]

In [24]:
prompt_noprobs = ("These are frames from a video that I want to upload. Describe what is in this video. "+
            "Also describe if there is nicotine vaping in this video. "+
            "Also describe if there is cannabis vaping in this video. "+
            "Also describe if there is fashion (only include modeling) in this video. "+
            "Also describe if there is entertainment (for example, dj, videogames)  in this video. "+
            "Also describe if there is active lifestyle (for example sport) in this video. "+
            "Limit the description to yes or no responses for each category. " + 
            "Also identify if the person in this video is between the age of 0-11 or 12-17 or 18-21"+ 
            "or 22-25 or 26-30 or 31-40 or 41-50 or 51-60 or older than 60 years old. " +
            "Format the results by surrounding each answer with dollar signs, like $Yes$ or $No$.")
prompt_noprobs

'These are frames from a video that I want to upload. Describe what is in this video. Also describe if there is nicotine vaping in this video. Also describe if there is cannabis vaping in this video. Also describe if there is fashion  in this video. Also describe if there is entertainment (for example, dj, videogames)  in this video. Also describe if there is technology in this video (do not include vaping). Also describe if there is active lifestyle (for example sport) in this video. Limit the description to yes or no responses for each category. Also identify if the person in this video is between the age of 0-11 or 12-17 or 18-21or 22-25 or 26-30 or 31-40 or 41-50 or 51-60 or older than 60 years old. Format the results by surrounding each answer with dollar signs, like $Yes$ or $No$.'

In [29]:
prompt_probs = ("These are frames from a video that I want to upload. Describe what is in this video. "+
            "Also provide the probability that there is nicotine vaping in this video. "+
            "Also provide the probability that there is cannabis vaping in this video. "+
            "Also provide the probability that there is fashion (only include modeling) in this video. "+
            "Also provide the probability that there is entertainment (for example, dj, videogames)  in this video. "+
            "Also provide the probability that there is active lifestyle (for example sport) in this video. "+
            "Limit the description to numeric probability responses for each category, like 100% or 10%. " + 
            "Also provide the probability that the person in this video is between the age of 0-11 or 12-17 or 18-21"+ 
            "or 22-25 or 26-30 or 31-40 or 41-50 or 51-60 or older than 60 years old. " +
            "Format the results by surrounding each answer with dollar signs, like $75%$ or $25%$.")
prompt_probs

'These are frames from a video that I want to upload. Describe what is in this video. Also provide the probability that there is nicotine vaping in this video. Also provide the probability that there is cannabis vaping in this video. Also provide the probability that there is fashion  in this video. Also provide the probability that there is entertainment (for example, dj, videogames)  in this video. Also provide the probability that there is technology in this video (do not include vaping). Also provide the probability that there is active lifestyle (for example sport) in this video. Limit the description to numeric probability responses for each category, like 100% or 10%. Also provide the probability that the person in this video is between the age of 0-11 or 12-17 or 18-21or 22-25 or 26-30 or 31-40 or 41-50 or 51-60 or older than 60 years old. Format the results by surrounding each answer with dollar signs, like $75%$ or $25%$.'

In [64]:
def analyze_videos(videos, prompt, max_frames = 4, verbose = False):
    results = {}
    for video in videos:
        video_name = video.stem
        if (verbose):
            print(f"\nAnalyzing {video_name}")
        # Import video
        frames = load_video(str(video))
        # Run query
        result = query_video(frames, prompt, verbose = verbose, max_frames = max_frames)
        # Save results
        results[video_name] = result
    return(results)

In [65]:
%%time

results_noprob = analyze_videos(videos, prompt_noprobs, verbose = True)


Analyzing fabio_fashion_3
406 frames read.
Nicotine vaping: $No$

Cannabis vaping: $No$

Fashion: $Yes$

Entertainment: $No$

Technology: $No$

Active lifestyle: $No$

Age: $26-30$

Analyzing chamillioneyes_fashion_1
386 frames read.
Nicotine vaping: $No$

Cannabis vaping: $No$

Fashion: $Yes$

Entertainment: $No$

Technology: $No$

Active lifestyle: $No$

Age of person: $22-25$

Analyzing fabio_fashion_2


KeyboardInterrupt: 

### Interpret results

In [247]:
# Create dataframe with each video and the label theme.
videos[:5]

[PosixPath('videos/GPT4_themes/fashion/fabio_fashion_3.mp4'),
 PosixPath('videos/GPT4_themes/fashion/chamillioneyes_fashion_1.mp4'),
 PosixPath('videos/GPT4_themes/fashion/fabio_fashion_2.mp4'),
 PosixPath('videos/GPT4_themes/fashion/fabio_fashion_18.mp4'),
 PosixPath('videos/GPT4_themes/fashion/fabio_fashion_19.mp4')]

In [73]:
import pandas as pd
df = pd.DataFrame(data = {'path': [str(video_i) for video_i in videos]})

In [76]:
df['theme'] = df.path.str.extract(r'(?<=GPT4_themes/)([^/]+)')

In [77]:
df

Unnamed: 0,path,theme
0,videos/GPT4_themes/fashion/fabio_fashion_3.mp4,fashion
1,videos/GPT4_themes/fashion/chamillioneyes_fash...,fashion
2,videos/GPT4_themes/fashion/fabio_fashion_2.mp4,fashion
3,videos/GPT4_themes/fashion/fabio_fashion_18.mp4,fashion
4,videos/GPT4_themes/fashion/fabio_fashion_19.mp4,fashion
...,...,...
97,videos/GPT4_themes/marijuana/calitrickzz_mj_14...,marijuana
98,videos/GPT4_themes/marijuana/calitrickzz_mj_19...,marijuana
99,videos/GPT4_themes/marijuana/calitrickzz_mj_4.mp4,marijuana
100,videos/GPT4_themes/marijuana/calitrickzz_mj_7.mp4,marijuana


In [78]:
df.theme.value_counts()

theme
ecigs         22
fashion       20
health        20
technology    20
marijuana     20
Name: count, dtype: int64

In [203]:
from itertools import chain

def extract_probs(lines):
    # This returns a two-dimensional list
    string_vals = [re.findall(r'\$(\d+)%\$', line) for line in lines]
    onedim_list = list(chain(*string_vals))
    list_ints = [int(i) for i in onedim_list]
    np_array = np.array(list_ints)
    probs = np_array / 100
    return(probs)

def find_prob_by_keyword(msg, keyword):
    prob = None
    target_lines = [line for line in msg.split('\n') if keyword in line]
    if len(target_lines) > 0:
        probs = extract_probs(target_lines)
        if len(probs) > 0:
            prob = np.amax(probs)
    return(prob)

def parse_response(msg):
    vaping_prob = find_prob_by_keyword(msg, 'Nicotine vaping')
    cannabis_prob = find_prob_by_keyword(msg, 'Cannabis vaping')
    fashion_prob = find_prob_by_keyword(msg, 'Fashion')
    entertainment_prob = find_prob_by_keyword(msg, 'Entertainment')
    lifestyle_prob = find_prob_by_keyword(msg, 'Active lifestyle')
    age_0_11 = find_prob_by_keyword(msg, '0-11')
    age_12_17 = find_prob_by_keyword(msg, '12-17')
    age_18_21 = find_prob_by_keyword(msg, '18-21')
    age_22_25 = find_prob_by_keyword(msg, '22-25')
    age_26_30 = find_prob_by_keyword(msg, '26-30')
    age_31_40 = find_prob_by_keyword(msg, '31-40')
    age_41_50 = find_prob_by_keyword(msg, '41-50')
    age_51_60 = find_prob_by_keyword(msg, '51-60')
    age_61_plus = find_prob_by_keyword(msg, '61+')
    
    compiled_probs = {
        'nicotine vaping': vaping_prob,
        'cannabis vaping': cannabis_prob,
        'fashion': fashion_prob,
        'entertainment': entertainment_prob,
        'lifestyle': lifestyle_prob,
        'age_0-11': age_0_11,
        'age_12-17': age_12_17,
        'age_18-21': age_18_21,
        'age_22-25': age_22_25,
        'age_26-30': age_26_30,
        'age_31-40': age_31_40,
        'age_41-50': age_41_50,
        'age_51-60': age_51_60,
        'age_61_plus': age_61_plus
    }

    df = pd.DataFrame(compiled_probs, index = [0])
    t_df = df.transpose()
    t_df.rename(columns = {0: 'prob'}, inplace = True)

    missing_probs = sum(t_df.isnull().sum(1))
    extracted_probs = t_df.shape[0] - missing_probs

    results = {
        'df': t_df,
        'extracted_probs': extracted_probs
    }
    return(results)

In [210]:
msg = results_probs['fabio_fashion_3'].choices[0].message.content
parse_response(msg)

{'df':                  prob
 nicotine vaping   0.0
 cannabis vaping   0.0
 fashion           1.0
 entertainment     0.1
 lifestyle         0.1
 age_0-11         None
 age_12-17        None
 age_18-21        None
 age_22-25         0.5
 age_26-30         0.5
 age_31-40        None
 age_41-50        None
 age_51-60        None
 age_61_plus      None,
 'extracted_probs': 7}

In [208]:
max_tries = 3
for video, data in results_probs.items():
    print("Analyzing", video, "\n")
    msg = data.choices[0].message.content
    print(msg)
    tries = 0
    while tries < max_tries:
        tries = tries + 1
        result = parse_response(msg)
        if result['extracted_probs'] > 5:
            break
        print("Retrying... only found", result['extracted_probs'], "probabilities.\n")
        
    print(result['df'], "\n")

Analyzing fabio_fashion_3 

Nicotine vaping: $0%$
Cannabis vaping: $0%$
Fashion: $100%$
Entertainment: $10%$
Technology: $0%$
Active lifestyle: $10%$

Age group 22-25: $50%$
Age group 26-30: $50%$
                 prob
nicotine vaping   0.0
cannabis vaping   0.0
fashion           1.0
entertainment     0.1
lifestyle         0.1
age_0-11         None
age_12-17        None
age_18-21        None
age_22-25         0.5
age_26-30         0.5
age_31-40        None
age_41-50        None
age_51-60        None
age_61_plus      None 

Analyzing chamillioneyes_fashion_1 

Nicotine vaping: $0%$
Cannabis vaping: $0%$
Fashion: $90%$
Entertainment: $20%$
Technology: $0%$
Active lifestyle: $0%$

Age 0-11: $0%$
Age 12-17: $0%$
Age 18-21: $20%$
Age 22-25: $50%$
Age 26-30: $30%$
Age 31-40: $0%$
Age 41-50: $0%$
Age 51-60: $0%$
Older than 60: $0%$
                 prob
nicotine vaping   0.0
cannabis vaping   0.0
fashion           0.9
entertainment     0.2
lifestyle         0.0
age_0-11          0.0
age_12-17

In [232]:
from openai import RateLimitError

def analyze_videos_probs(videos, prompt, max_frames = 4, min_tries = 1, max_tries = 3, min_extracted_probs = 5, verbose = False):
    results = {}

    stop_execution = False
    for video in videos:
        video_name = video.stem
        if (verbose):
            print(f"\nAnalyzing {video_name}")
        # Import video
        frames = load_video(str(video))

        tries = 0

        while tries < max_tries:
            tries = tries + 1

            parsed_result = None
            num_probs = 0
            
            # Run query
            try:
                query_result = query_video(frames, prompt, verbose = verbose, max_frames = max_frames)
                msg = query_result.choices[0].message.content
                parsed_result = parse_response(msg)
                if parsed_result is not None:
                    num_probs = parsed_result['extracted_probs']
            except RateLimitError as e:
                print("Ran into rate limit exception:")
                print(e)
                print("Stopping execution")
                stop_execution = True
                query_result = None
                msg = None
                break
            
            if num_probs >= min_extracted_probs:
                break
            
            print("Retrying... only found", num_probs, "probabilities.\n")

        if verbose and not stop_execution:
            print(parsed_result)
            
        # Save results
        results[video_name] = {
            'query': query_result,
            'msg': msg,
            'parsed': parsed_result,
            'tries': tries
        }

        if stop_execution:
            break
    return(results)        
    

In [233]:
%%time

results_probs = analyze_videos_probs(videos, prompt_probs, verbose = True)


Analyzing fabio_fashion_3
406 frames read.
Nicotine vaping: $0%$
Cannabis vaping: $0%$
Fashion: $100%$
Entertainment: $10%$
Technology: $0%$
Active lifestyle: $10%$

Age group 22-25: $50%$
Age group 26-30: $50%$
{'df':                  prob
nicotine vaping   0.0
cannabis vaping   0.0
fashion           1.0
entertainment     0.1
lifestyle         0.1
age_0-11         None
age_12-17        None
age_18-21        None
age_22-25         0.5
age_26-30         0.5
age_31-40        None
age_41-50        None
age_51-60        None
age_61_plus      None, 'extracted_probs': 7}

Analyzing chamillioneyes_fashion_1
386 frames read.
Nicotine vaping: $0%$
Cannabis vaping: $0%$
Fashion: $90%$
Entertainment: $20%$
Technology: $0%$
Active lifestyle: $0%$

Age 0-11: $0%$
Age 12-17: $0%$
Age 18-21: $20%$
Age 22-25: $50%$
Age 26-30: $30%$
Age 31-40: $0%$
Age 41-50: $0%$
Age 51-60: $0%$
Older than 60: $0%$
{'df':                  prob
nicotine vaping   0.0
cannabis vaping   0.0
fashion           0.9
entertai

In [234]:
len(results_probs)

102

## Convert extracted probabilities into a complete dataframe

In [236]:
results_probs.keys()

dict_keys(['fabio_fashion_3', 'chamillioneyes_fashion_1', 'fabio_fashion_2', 'fabio_fashion_18', 'fabio_fashion_19', 'fabio_fashion_14', 'fabio_fashion_4', 'fabio_fashion_17', 'fabio_fashion_8', 'fabio_fashion_9', 'fabio_fashion_12', 'fabio_fashion_1', 'fabio_fashion_16', 'fabio_fashion_11', 'fabio_fashion_7', 'fabio_fashion_15', 'fabio_fashion_6', 'fabio_fashion_13', 'fabio_fashion_10_andpod', 'fabio_fashion_5', 'fabio_health_3', 'drewdirps_health_2', 'fabio_health_2_and_pod', 'arabella_health_1', 'chamil_health_vape', 'chamillioneyes_health_1', 'edripps_health_vape', 'drewdirps_health_4', 'drewdirps_health_1', 'vapes_aby_health_1_andvape', 'vapes_aby_health_2_andvape', 'alex_health_vape', 'drew_health_vape', 'chamillioneyes_health_2', 'vapes_aby_health_vape', 'fabio_health_1', 'drewdirps_health_3', 'alex_health_2', 'arabella_health_2', 'alex_health_1', 'drewdirps_ecigs_3', 'chamillioneyes_ecgis_1', 'drewdirps_ecigs_4', 'edripss_ecgis_2', 'chamillioneyes_ecigs_2', 'calitrickzz_ecigs_3

In [237]:
results_probs['fabio_fashion_3']

{'query': ChatCompletion(id='chatcmpl-9OsuyLHbE3C13VNmoL2waQbIX701Y', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Nicotine vaping: $0%$\nCannabis vaping: $0%$\nFashion: $100%$\nEntertainment: $10%$\nTechnology: $0%$\nActive lifestyle: $10%$\n\nAge group 22-25: $50%$\nAge group 26-30: $50%$', role='assistant', function_call=None, tool_calls=None))], created=1715718004, model='gpt-4o-2024-05-13', object='chat.completion', system_fingerprint='fp_927397958d', usage=CompletionUsage(completion_tokens=63, prompt_tokens=3936, total_tokens=3999)),
 'msg': 'Nicotine vaping: $0%$\nCannabis vaping: $0%$\nFashion: $100%$\nEntertainment: $10%$\nTechnology: $0%$\nActive lifestyle: $10%$\n\nAge group 22-25: $50%$\nAge group 26-30: $50%$',
 'parsed': {'df':                  prob
  nicotine vaping   0.0
  cannabis vaping   0.0
  fashion           1.0
  entertainment     0.1
  lifestyle         0.1
  age_0-11         None
  age_12-17        None
  

In [259]:
video_name = 'fabio_fashion_3'
cur_result = results_probs[video_name]
cur_df = cur_result['parsed']['df']

In [264]:
extra_df = pd.DataFrame({'video': video_name,
                         'extracted_probs': cur_result['parsed']['extracted_probs']
                        }, index = [0])
extra_df

Unnamed: 0,video,extracted_probs
0,fabio_fashion_3,7


In [265]:
t_df = cur_df.T
t_df

Unnamed: 0,nicotine vaping,cannabis vaping,fashion,entertainment,lifestyle,age_0-11,age_12-17,age_18-21,age_22-25,age_26-30,age_31-40,age_41-50,age_51-60,age_61_plus
prob,0.0,0.0,1.0,0.1,0.1,,,,0.5,0.5,,,,


In [266]:
new_df = pd.concat([extra_df, cur_df.T.reset_index(drop = True)], axis = 1, join = 'outer')
new_df

Unnamed: 0,video,extracted_probs,nicotine vaping,cannabis vaping,fashion,entertainment,lifestyle,age_0-11,age_12-17,age_18-21,age_22-25,age_26-30,age_31-40,age_41-50,age_51-60,age_61_plus
0,fabio_fashion_3,7,0.0,0.0,1.0,0.1,0.1,,,,0.5,0.5,,,,


In [273]:
new_dfs = []
for video_name, result in results_probs.items():
    cur_df = result['parsed']['df']
    extra_df = pd.DataFrame({'video': video_name,
                         'extracted_probs': result['parsed']['extracted_probs']
                        }, index = [0])
    new_df = pd.concat([extra_df, cur_df.T.reset_index(drop = True)], axis = 1, join = 'outer')
    new_dfs.append(new_df)
len(new_dfs)

102

In [274]:
result_df = pd.concat(new_dfs, axis = 0)
result_df.shape

(102, 16)

In [275]:
result_df.head()

Unnamed: 0,video,extracted_probs,nicotine vaping,cannabis vaping,fashion,entertainment,lifestyle,age_0-11,age_12-17,age_18-21,age_22-25,age_26-30,age_31-40,age_41-50,age_51-60,age_61_plus
0,fabio_fashion_3,7,0.0,0.0,1.0,0.1,0.1,,,,0.5,0.5,,,,
0,chamillioneyes_fashion_1,13,0.0,0.0,0.9,0.2,0.0,0.0,0.0,0.2,0.5,0.3,0.0,0.0,0.0,
0,fabio_fashion_2,0,,,,,,,,,,,,,,
0,fabio_fashion_18,13,0.0,0.0,1.0,0.1,0.2,0.0,0.0,0.1,0.4,0.4,0.1,0.0,0.0,
0,fabio_fashion_19,0,,,,,,,,,,,,,,


In [320]:
# Non-missing percentages for each column
1 - result_df.isnull().mean()

video              1.000000
extracted_probs    1.000000
nicotine vaping    0.607843
cannabis vaping    0.607843
fashion            0.617647
entertainment      0.617647
lifestyle          0.607843
age_0-11           0.617647
age_12-17          0.686275
age_18-21          0.686275
age_22-25          0.843137
age_26-30          0.784314
age_31-40          0.676471
age_41-50          0.627451
age_51-60          0.598039
age_61_plus        0.000000
dtype: float64

### Merge in labels

In [277]:
df['video'] = df.path.str.extract(r'/([^/]+).mp4$')

In [278]:
df.head()

Unnamed: 0,path,theme,video
0,videos/GPT4_themes/fashion/fabio_fashion_3.mp4,fashion,fabio_fashion_3
1,videos/GPT4_themes/fashion/chamillioneyes_fash...,fashion,chamillioneyes_fashion_1
2,videos/GPT4_themes/fashion/fabio_fashion_2.mp4,fashion,fabio_fashion_2
3,videos/GPT4_themes/fashion/fabio_fashion_18.mp4,fashion,fabio_fashion_18
4,videos/GPT4_themes/fashion/fabio_fashion_19.mp4,fashion,fabio_fashion_19


In [318]:
df.theme.value_counts()

theme
ecigs         22
fashion       20
health        20
technology    20
marijuana     20
Name: count, dtype: int64

In [279]:
df2 = df.merge(result_df, on = 'video')
df2.head()

Unnamed: 0,path,theme,video,extracted_probs,nicotine vaping,cannabis vaping,fashion,entertainment,lifestyle,age_0-11,age_12-17,age_18-21,age_22-25,age_26-30,age_31-40,age_41-50,age_51-60,age_61_plus
0,videos/GPT4_themes/fashion/fabio_fashion_3.mp4,fashion,fabio_fashion_3,7,0.0,0.0,1.0,0.1,0.1,,,,0.5,0.5,,,,
1,videos/GPT4_themes/fashion/chamillioneyes_fash...,fashion,chamillioneyes_fashion_1,13,0.0,0.0,0.9,0.2,0.0,0.0,0.0,0.2,0.5,0.3,0.0,0.0,0.0,
2,videos/GPT4_themes/fashion/fabio_fashion_2.mp4,fashion,fabio_fashion_2,0,,,,,,,,,,,,,,
3,videos/GPT4_themes/fashion/fabio_fashion_18.mp4,fashion,fabio_fashion_18,13,0.0,0.0,1.0,0.1,0.2,0.0,0.0,0.1,0.4,0.4,0.1,0.0,0.0,
4,videos/GPT4_themes/fashion/fabio_fashion_19.mp4,fashion,fabio_fashion_19,0,,,,,,,,,,,,,,


## Evaluate accuracy and AUC

In [289]:
import sklearn, sklearn.metrics

In [295]:
true_val = (df2['theme'] == 'fashion').astype(int)
theme_col

'fashion'

In [300]:
predicted_val = df2[theme_col].astype(float)

In [305]:
analysis_df = pd.concat([true_val, predicted_val], axis = 1)

In [309]:
analysis_df.shape
analysis_df.dropna(inplace = True)
analysis_df.head()

Unnamed: 0,theme,fashion
0,1,1.0
1,1,0.9
3,1,1.0
5,1,1.0
7,1,0.9


In [314]:
# Via https://gist.github.com/doraneko94/e24643136cfb8baf03ef8a314ab9615c
# Formula based on these articles:
# Hanley and McNeil, The meaning and use of the area under a receiver operating characteristic (ROC) curve. Radiology (1982) 43 (1) pp. 29-36.
# Fogarty, Baker and Hudson, Case Studies in the use of ROC Curve Analysis for Sensor-Based Estimates in Human Computer Interaction, Proceedings of Graphics Interface (2005) pp. 129-136.

from sklearn.metrics import roc_auc_score
from math import sqrt

def roc_auc_ci(y_true, y_score, positive=1):
    AUC = roc_auc_score(y_true, y_score)
    N1 = sum(y_true == positive)
    N2 = sum(y_true != positive)
    Q1 = AUC / (2 - AUC)
    Q2 = 2*AUC**2 / (1 + AUC)
    SE_AUC = sqrt((AUC*(1 - AUC) + (N1 - 1)*(Q1 - AUC**2) + (N2 - 1)*(Q2 - AUC**2)) / (N1*N2))
    lower = AUC - 1.96*SE_AUC
    upper = AUC + 1.96*SE_AUC
    if lower < 0:
        lower = 0
    if upper > 1:
        upper = 1
    return (lower, upper)

In [317]:
themes = df2.theme.unique()
theme_cols = {'fashion': 'fashion',
              'health': 'lifestyle'}

for theme, theme_col in theme_cols.items():
    print(f"Analyzing {theme}")
    true_val = (df2['theme'] == theme).astype(int)
    print("Theme prevalence (original df):", np.mean(true_val))
    predicted_val = df2[theme_col].astype(float)
    analysis_df = pd.concat([true_val, predicted_val], axis = 1)
    # Drop rows missing the predicted prob or the true label (hopefully none are missing the true label).
    analysis_df.dropna(inplace = True)
    print("Theme prevalence (analysis df):", np.mean(analysis_df['theme'].values))
    print("Analyzed sample size:", analysis_df.shape[0])
    roc_result = sklearn.metrics.roc_auc_score(analysis_df['theme'], analysis_df[theme_col])
    print(roc_result)
    auc_ci = roc_auc_ci(analysis_df['theme'], analysis_df[theme_col])
    print(auc_ci)

Analyzing fashion
Theme prevalence (original df): 0.19607843137254902
Theme prevalence (analysis df): 0.2222222222222222
Analyzed sample size: 63
0.9948979591836735
(0.9682376439787063, 1)
Analyzing health
Theme prevalence (original df): 0.19607843137254902
Theme prevalence (analysis df): 0.16129032258064516
Analyzed sample size: 62
0.9500000000000001
(0.8539780997613369, 1)


In [312]:
analysis_df

Unnamed: 0,theme,lifestyle
0,0,0.10
1,0,0.00
3,0,0.20
5,0,0.10
7,0,0.20
...,...,...
93,0,0.00
94,0,0.00
95,0,0.05
96,0,0.10
