# Qwen-Audio-Chat

Following https://huggingface.co/Qwen/Qwen-Audio-Chat

In [69]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import torch
torch.manual_seed(1234)



<torch._C.Generator at 0x7fee3d2f7430>

In [70]:
# Note: The default behavior now has injection attack prevention off.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-Audio-Chat", trust_remote_code=True)

# use bf16
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-Audio-Chat", device_map="auto", trust_remote_code=True, bf16=True).eval()
# use fp16
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-Audio-Chat", device_map="auto", trust_remote_code=True, fp16=True).eval()
# use cpu only
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-Audio-Chat", device_map="cpu", trust_remote_code=True).eval()
# use cuda device
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-Audio-Chat", device_map="cuda", trust_remote_code=True).eval()

# Specify hyperparameters for generation (No need to do this if you are using transformers>4.32.0)
# model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-Audio-Chat", trust_remote_code=True)

audio_start_id: 155163, audio_end_id: 155164, audio_pad_id: 151851.


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Try importing flash-attention for faster inference...


Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

## Examples

## Our data

In [3]:
query = tokenizer.from_list_format([
    {'audio': 'audio/fatalvapes_asmr_narration.mp4'}, # Either a local path or an url
    {'text': 'what does the person say?'},
])
response, history = model.chat(tokenizer, query=query, history=None)
print(response)

The person says in English: "Masking X-pan Lush Ice".


In [4]:
# 2nd dialogue turn
response, history = model.chat(tokenizer, 'Is there rhythmic tapping or noises in the background?', history=history)
print(response)

Yes, there is rhythmic tapping in the background.


In [10]:
response, history = \
    model.chat(tokenizer,
               #'Is there rhythmic tapping or non-word noises in the background? Limit your response to a single word that is the percentage probability for how certain you are.',
               'Is there rhythmic tapping or non-word noises in the background, excluding vocals or music?' +
               'Limit your response to a single word that is the percentage probability for how certain you are.',
    history = history)
print(response)

Yes, there is rhythmic tapping in the background at a probability of 80%.


In [9]:
# Compare to a non-ASMR audio sample which contains music:
query = tokenizer.from_list_format([
    {'audio': 'videos/vaping_modeling.mp4'}, # Either a local path or an url
    {'text': #'Is there rhythmic tapping or non-word noises in the background? Provide your response as a numeric probability.'
              'Is there rhythmic tapping or non-word noises in the background, ignoring vocals or music?' + 
              'Limit your response to a single word that is the percentage probability for how certain you are.'},
])
response, history = model.chat(tokenizer, query=query, history=None)
print(response)

Yes, there is rhythmic tapping in the background at a probability of 80%.


#### Query function

In [78]:
# Compare to a non-ASMR audio sample which contains music:
def query_file(file_path, query_text, history = None):
    query = tokenizer.from_list_format([
        {'audio': file_path}, # Either a local path or an url
        {'text': query_text},
    ])
    response, history = model.chat(tokenizer, query = query, history = history)
    return(response, history)

prompt_asmr = ('Is there rhythmic tapping or non-word noises in the background, ignoring vocals or music?' + 
              'Limit your response to a single word that is the percentage probability for how certain you are, like 5% or 95%.')

response, history = query_file('videos/vaping_modeling.mp4',
           query_text = prompt_asmr)
response

'Yes, there is rhythmic tapping in the background.'

## Performance evaluation

### Import labels

In [8]:
# pip install "pandas<2.0.0"
import pandas as pd
# https://docs.google.com/spreadsheets/d/1r9IlI6dze2Okh4EQ5v1ZhGut1eTkzkc_/edit#gid=42652425
labels = pd.read_excel("audio/ASMR_data_labeled.xlsx")
labels.rename(columns = {'ASMR (yes/no)': 'is_asmr'}, inplace = True)
labels['shortcode'] = labels.videoURL.str.extract('\/([^/]+)/$')
labels['is_asmr'] = (labels.is_asmr.values == 'yes').astype(int)
labels.head()

Unnamed: 0,profile,videoURL,video_posted_date,is_asmr,shortcode
0,noniquennisa,https://www.instagram.com/reel/CnBxPwxDlG4/,2023-01-05 00:00:00,0,CnBxPwxDlG4
1,noniquennisa,https://www.instagram.com/reel/CnEWAuGjhyg/,2023-01-06 00:00:00,0,CnEWAuGjhyg
2,noniquennisa,https://www.instagram.com/reel/CnG7P4cjdmz/,2023-01-07 00:00:00,0,CnG7P4cjdmz
3,noniquennisa,https://www.instagram.com/reel/CnMEacLjnko/,2023-01-09 00:00:00,1,CnMEacLjnko
4,noniquennisa,https://www.instagram.com/reel/CnOpQ3uDtr5/,2023-01-10 00:00:00,0,CnOpQ3uDtr5


In [55]:
print(labels.isna().sum())
labels.dropna(inplace = True) # 2 blank rows
print(labels.isna().sum())

profile              2
videoURL             2
video_posted_date    2
is_asmr              0
shortcode            2
dtype: int64
profile              0
videoURL             0
video_posted_date    0
is_asmr              0
shortcode            0
dtype: int64


### Import video names

In [56]:
import pickle

# Requires pandas 1 rather than 2, per above pip line.
df = pickle.load(open("audio/vape_file_dictionary.pkl", 'rb'))
df['shortcode'] = df.web_url.str.extract('\/([^/]+)/$')
print(df.shape)
# We drop 3 rows here.
df.dropna(inplace = True)
print(df.shape)
df.head()

(477, 3)
(474, 3)


Unnamed: 0,web_url,file,shortcode
21592,https://www.instagram.com/p/C4KqjIqyt0b/,noniquennisa/reels/2024-03-06 16.49.28 3317651...,C4KqjIqyt0b
21593,https://www.instagram.com/p/C3e5J1pSAvk/,noniquennisa/reels/2024-02-18 16.49.18 3305330...,C3e5J1pSAvk
21594,https://www.instagram.com/p/C3R_pr6yeU-/,noniquennisa/reels/2024-02-13 16.36.54 3301699...,C3R_pr6yeU-
21595,https://www.instagram.com/p/C3PdlEbJiYw/,noniquennisa/reels/2024-02-12 16.59.35 3300987...,C3PdlEbJiYw
21596,https://www.instagram.com/p/C3CkwciS6-Q/,noniquennisa/reels/2024-02-07 16.56.25 3297359...,C3CkwciS6-Q


### Integrate videos and labels

In [63]:
df2 = labels.merge(df, on = "shortcode", how = 'left')
# 437 - this is correct.
print(df2.shape)
print(df2.is_asmr.value_counts())
df2.head()

(437, 7)
0    329
1    108
Name: is_asmr, dtype: int64


Unnamed: 0,profile,videoURL,video_posted_date,is_asmr,shortcode,web_url,file
0,noniquennisa,https://www.instagram.com/reel/CnBxPwxDlG4/,2023-01-05 00:00:00,0,CnBxPwxDlG4,https://www.instagram.com/p/CnBxPwxDlG4/,noniquennisa/reels/2023-01-05 17.00.12 3008902...
1,noniquennisa,https://www.instagram.com/reel/CnEWAuGjhyg/,2023-01-06 00:00:00,0,CnEWAuGjhyg,https://www.instagram.com/p/CnEWAuGjhyg/,noniquennisa/reels/2023-01-06 17.00.17 3009627...
2,noniquennisa,https://www.instagram.com/reel/CnG7P4cjdmz/,2023-01-07 00:00:00,0,CnG7P4cjdmz,https://www.instagram.com/p/CnG7P4cjdmz/,noniquennisa/reels/2023-01-07 17.03.18 3010353...
3,noniquennisa,https://www.instagram.com/reel/CnMEacLjnko/,2023-01-09 00:00:00,1,CnMEacLjnko,https://www.instagram.com/p/CnMEacLjnko/,noniquennisa/reels/2023-01-09 16.59.46 3011801...
4,noniquennisa,https://www.instagram.com/reel/CnOpQ3uDtr5/,2023-01-10 00:00:00,0,CnOpQ3uDtr5,https://www.instagram.com/p/CnOpQ3uDtr5/,noniquennisa/reels/2023-01-10 17.00.18 3012526...


In [64]:
df2.isna().sum()

profile              0
videoURL             0
video_posted_date    0
is_asmr              0
shortcode            0
web_url              3
file                 3
dtype: int64

In [65]:
# 3 labels were not scraped - can just exclude these.
print(df2[df2.web_url.isna()])
df2.dropna(inplace = True)
print(df2.isna().sum())

    profile                                     videoURL    video_posted_date  \
157  a_kidz  https://www.instagram.com/reel/Cn3cF83MVYQ/  2023-01-25 00:00:00   
158  a_kidz  https://www.instagram.com/reel/CoC1t59OwHL/  2023-01-30 00:00:00   
162  a_kidz  https://www.instagram.com/reel/Co7CgpwLU2E/  2023-02-21 00:00:00   

     is_asmr    shortcode web_url file  
157        0  Cn3cF83MVYQ     NaN  NaN  
158        0  CoC1t59OwHL     NaN  NaN  
162        0  Co7CgpwLU2E     NaN  NaN  
profile              0
videoURL             0
video_posted_date    0
is_asmr              0
shortcode            0
web_url              0
file                 0
dtype: int64


### Find videos

In [82]:
from pathlib import Path
from pyprojroot import here

df2['found'] = 0

for i, row in df2.iterrows():
    file_str = row['file']
    file_path = Path(str("audio/" + file_str))
    if file_path.is_file():
        df2.at[i, 'found'] = 1

# Found all files.
print(df2.found.value_counts(), '\n')
print(df2[['profile', 'found']].groupby(['profile', 'found']).value_counts())
# No longer any missing files.
# [missing_files[i] for i in list(missing_files.keys())[:10]]

1    434
Name: found, dtype: int64 

profile       found
a_kidz        1        153
fatal_vapes   1        129
noniquennisa  1        152
dtype: int64


### Analyze videos

In [92]:
# Reset results
df2['response'] = ''
df2['successes'] = 0
df2['attempts'] = 0

In [123]:
%%time

import math, re
min_successes = 3
max_tries = 10
for i, row in df2.iterrows():
    # Allow hotstarting.
    if row['response'] != '':
        continue
    file_path = str("audio/" + row['file'])
    attempts = 0
    successes = 0
    probs = []
    responses = []
    while attempts < max_tries and successes < min_successes:
        # Run and parse simultaneously
        response, _ = query_file(file_path,
               query_text = prompt_asmr)
        responses.append(response)
        # Extract the probabily if included in the response.
        result = re.search('(\d+)%', response)
        if result:
            prob = float(result.group(1)) / 100
            if not math.isnan(prob):
                probs.append(prob)
                successes = successes + 1
        attempts = attempts + 1

    df2.at[i, 'response'] = responses
    df2.at[i, 'pred_asmr'] = np.mean(probs)
    df2.at[i, 'attempts'] = attempts
    df2.at[i, 'successes'] = successes
    print("Completed", i, "with", attempts, "attempts and", successes, f"successes. Prob = {np.mean(probs):0.3f}", "Actual:", row['is_asmr']) 
    

Completed 108 with 6 attempts and 3 successes. Prob = 0.667 Actual: 0
Completed 109 with 4 attempts and 3 successes. Prob = 0.040 Actual: 1
Completed 110 with 10 attempts and 2 successes. Prob = 0.975 Actual: 1


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 111 with 10 attempts and 0 successes. Prob = nan Actual: 1
Completed 112 with 10 attempts and 1 successes. Prob = 0.950 Actual: 1
Completed 113 with 10 attempts and 3 successes. Prob = 0.850 Actual: 1
Completed 114 with 4 attempts and 3 successes. Prob = 0.883 Actual: 0
Completed 115 with 3 attempts and 3 successes. Prob = 0.917 Actual: 1


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 116 with 10 attempts and 0 successes. Prob = nan Actual: 1
Completed 117 with 4 attempts and 3 successes. Prob = 0.850 Actual: 0
Completed 118 with 10 attempts and 3 successes. Prob = 0.833 Actual: 0
Completed 119 with 9 attempts and 3 successes. Prob = 0.300 Actual: 0
Completed 120 with 4 attempts and 3 successes. Prob = 0.883 Actual: 0
Completed 121 with 3 attempts and 3 successes. Prob = 0.590 Actual: 1
Completed 122 with 4 attempts and 3 successes. Prob = 0.867 Actual: 0
Completed 123 with 4 attempts and 3 successes. Prob = 0.567 Actual: 0
Completed 124 with 9 attempts and 3 successes. Prob = 0.350 Actual: 0
Completed 125 with 10 attempts and 1 successes. Prob = 0.700 Actual: 1
Completed 126 with 3 attempts and 3 successes. Prob = 0.883 Actual: 1
Completed 127 with 5 attempts and 3 successes. Prob = 0.650 Actual: 1
Completed 128 with 10 attempts and 2 successes. Prob = 0.950 Actual: 1
Completed 129 with 6 attempts and 3 successes. Prob = 0.833 Actual: 0
Completed 130 with

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 147 with 10 attempts and 0 successes. Prob = nan Actual: 1
Completed 148 with 10 attempts and 1 successes. Prob = 0.700 Actual: 0
Completed 149 with 10 attempts and 1 successes. Prob = 0.900 Actual: 1
Completed 150 with 4 attempts and 3 successes. Prob = 0.850 Actual: 1
Completed 151 with 4 attempts and 3 successes. Prob = 0.200 Actual: 0
Completed 152 with 3 attempts and 3 successes. Prob = 0.833 Actual: 0
Completed 153 with 3 attempts and 3 successes. Prob = 0.950 Actual: 0
Completed 154 with 5 attempts and 3 successes. Prob = 0.350 Actual: 0
Completed 155 with 3 attempts and 3 successes. Prob = 0.867 Actual: 0
Completed 156 with 4 attempts and 3 successes. Prob = 0.917 Actual: 0
Completed 159 with 7 attempts and 3 successes. Prob = 0.850 Actual: 0
Completed 160 with 4 attempts and 3 successes. Prob = 0.900 Actual: 0
Completed 161 with 4 attempts and 3 successes. Prob = 0.267 Actual: 0
Completed 163 with 8 attempts and 3 successes. Prob = 0.867 Actual: 0
Completed 164 with 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 182 with 10 attempts and 0 successes. Prob = nan Actual: 0
Completed 183 with 3 attempts and 3 successes. Prob = 0.570 Actual: 1
Completed 184 with 3 attempts and 3 successes. Prob = 0.950 Actual: 0
Completed 185 with 10 attempts and 2 successes. Prob = 0.800 Actual: 0
Completed 186 with 5 attempts and 3 successes. Prob = 0.637 Actual: 0
Completed 187 with 3 attempts and 3 successes. Prob = 0.650 Actual: 0
Completed 188 with 3 attempts and 3 successes. Prob = 0.583 Actual: 0
Completed 189 with 3 attempts and 3 successes. Prob = 0.767 Actual: 0
Completed 190 with 3 attempts and 3 successes. Prob = 0.617 Actual: 0
Completed 191 with 3 attempts and 3 successes. Prob = 0.933 Actual: 1
Completed 192 with 10 attempts and 1 successes. Prob = 0.700 Actual: 0
Completed 193 with 3 attempts and 3 successes. Prob = 0.600 Actual: 1
Completed 194 with 5 attempts and 3 successes. Prob = 0.500 Actual: 0
Completed 195 with 10 attempts and 1 successes. Prob = 0.950 Actual: 0
Completed 196 with

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 241 with 10 attempts and 0 successes. Prob = nan Actual: 0
Completed 242 with 5 attempts and 3 successes. Prob = 0.617 Actual: 0
Completed 243 with 3 attempts and 3 successes. Prob = 0.833 Actual: 0
Completed 244 with 3 attempts and 3 successes. Prob = 0.637 Actual: 0
Completed 245 with 3 attempts and 3 successes. Prob = 0.320 Actual: 0
Completed 246 with 4 attempts and 3 successes. Prob = 0.620 Actual: 0
Completed 247 with 6 attempts and 3 successes. Prob = 0.273 Actual: 1
Completed 248 with 5 attempts and 3 successes. Prob = 0.587 Actual: 1
Completed 249 with 3 attempts and 3 successes. Prob = 0.883 Actual: 0
Completed 250 with 7 attempts and 3 successes. Prob = 0.817 Actual: 1
Completed 251 with 3 attempts and 3 successes. Prob = 0.783 Actual: 0
Completed 252 with 4 attempts and 3 successes. Prob = 0.317 Actual: 0
Completed 253 with 8 attempts and 3 successes. Prob = 0.573 Actual: 0
Completed 254 with 4 attempts and 3 successes. Prob = 0.900 Actual: 1
Completed 255 with 5 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 273 with 10 attempts and 0 successes. Prob = nan Actual: 0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 274 with 10 attempts and 0 successes. Prob = nan Actual: 0
Completed 275 with 4 attempts and 3 successes. Prob = 0.850 Actual: 0
Completed 276 with 3 attempts and 3 successes. Prob = 0.933 Actual: 0
Completed 277 with 3 attempts and 3 successes. Prob = 0.900 Actual: 1
Completed 278 with 3 attempts and 3 successes. Prob = 0.027 Actual: 0
Completed 279 with 3 attempts and 3 successes. Prob = 0.750 Actual: 0
Completed 280 with 4 attempts and 3 successes. Prob = 0.833 Actual: 0
Completed 281 with 3 attempts and 3 successes. Prob = 0.023 Actual: 0
Completed 282 with 3 attempts and 3 successes. Prob = 0.267 Actual: 0
Completed 283 with 6 attempts and 3 successes. Prob = 0.583 Actual: 0
Completed 284 with 10 attempts and 3 successes. Prob = 0.650 Actual: 0
Completed 285 with 10 attempts and 1 successes. Prob = 0.800 Actual: 0
Completed 286 with 8 attempts and 3 successes. Prob = 0.633 Actual: 0
Completed 287 with 3 attempts and 3 successes. Prob = 0.733 Actual: 1


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 288 with 10 attempts and 0 successes. Prob = nan Actual: 1
Completed 289 with 3 attempts and 3 successes. Prob = 0.917 Actual: 0
Completed 290 with 6 attempts and 3 successes. Prob = 0.617 Actual: 0
Completed 291 with 3 attempts and 3 successes. Prob = 0.540 Actual: 0
Completed 292 with 5 attempts and 3 successes. Prob = 0.833 Actual: 0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 293 with 10 attempts and 0 successes. Prob = nan Actual: 1
Completed 294 with 6 attempts and 3 successes. Prob = 0.750 Actual: 0
Completed 295 with 3 attempts and 3 successes. Prob = 0.550 Actual: 0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 296 with 10 attempts and 0 successes. Prob = nan Actual: 0
Completed 297 with 5 attempts and 3 successes. Prob = 0.900 Actual: 0
Completed 298 with 3 attempts and 3 successes. Prob = 0.733 Actual: 0
Completed 299 with 4 attempts and 3 successes. Prob = 0.800 Actual: 1
Completed 300 with 8 attempts and 3 successes. Prob = 0.967 Actual: 0
Completed 301 with 3 attempts and 3 successes. Prob = 0.367 Actual: 0
Completed 302 with 4 attempts and 3 successes. Prob = 0.767 Actual: 0
Completed 303 with 5 attempts and 3 successes. Prob = 0.867 Actual: 0
Completed 304 with 3 attempts and 3 successes. Prob = 0.883 Actual: 1
Completed 305 with 4 attempts and 3 successes. Prob = 0.817 Actual: 0
Completed 306 with 3 attempts and 3 successes. Prob = 0.833 Actual: 0
Completed 307 with 9 attempts and 3 successes. Prob = 0.313 Actual: 0
Completed 308 with 5 attempts and 3 successes. Prob = 0.817 Actual: 0
Completed 309 with 3 attempts and 3 successes. Prob = 0.600 Actual: 0
Completed 310 with 4 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 330 with 10 attempts and 0 successes. Prob = nan Actual: 0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 331 with 10 attempts and 0 successes. Prob = nan Actual: 0
Completed 332 with 7 attempts and 3 successes. Prob = 0.867 Actual: 0
Completed 333 with 3 attempts and 3 successes. Prob = 0.283 Actual: 0
Completed 334 with 3 attempts and 3 successes. Prob = 0.917 Actual: 0
Completed 335 with 4 attempts and 3 successes. Prob = 0.950 Actual: 0
Completed 336 with 4 attempts and 3 successes. Prob = 0.783 Actual: 1
Completed 337 with 3 attempts and 3 successes. Prob = 0.483 Actual: 0
Completed 338 with 10 attempts and 1 successes. Prob = 0.900 Actual: 0
Completed 339 with 5 attempts and 3 successes. Prob = 0.267 Actual: 1
Completed 340 with 10 attempts and 1 successes. Prob = 0.950 Actual: 0
Completed 341 with 7 attempts and 3 successes. Prob = 0.617 Actual: 1
Completed 342 with 5 attempts and 3 successes. Prob = 0.537 Actual: 0
Completed 343 with 9 attempts and 3 successes. Prob = 0.027 Actual: 1
Completed 344 with 3 attempts and 3 successes. Prob = 0.933 Actual: 1
Completed 345 with 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 364 with 10 attempts and 0 successes. Prob = nan Actual: 0
Completed 365 with 3 attempts and 3 successes. Prob = 0.933 Actual: 0
Completed 366 with 7 attempts and 3 successes. Prob = 0.617 Actual: 0
Completed 367 with 10 attempts and 1 successes. Prob = 0.850 Actual: 1
Completed 368 with 4 attempts and 3 successes. Prob = 0.950 Actual: 1
Completed 369 with 3 attempts and 3 successes. Prob = 0.583 Actual: 0
Completed 370 with 3 attempts and 3 successes. Prob = 0.950 Actual: 0
Completed 371 with 10 attempts and 1 successes. Prob = 0.900 Actual: 0
Completed 372 with 3 attempts and 3 successes. Prob = 0.933 Actual: 0
Completed 373 with 8 attempts and 3 successes. Prob = 0.633 Actual: 0
Completed 374 with 3 attempts and 3 successes. Prob = 0.550 Actual: 0
Completed 375 with 7 attempts and 3 successes. Prob = 0.267 Actual: 0
Completed 376 with 10 attempts and 2 successes. Prob = 0.825 Actual: 0
Completed 377 with 5 attempts and 3 successes. Prob = 0.333 Actual: 0
Completed 378 with

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 381 with 10 attempts and 0 successes. Prob = nan Actual: 0
Completed 382 with 4 attempts and 3 successes. Prob = 0.357 Actual: 1
Completed 383 with 3 attempts and 3 successes. Prob = 0.617 Actual: 0
Completed 384 with 5 attempts and 3 successes. Prob = 0.283 Actual: 1
Completed 385 with 3 attempts and 3 successes. Prob = 0.900 Actual: 0
Completed 386 with 4 attempts and 3 successes. Prob = 0.617 Actual: 1
Completed 387 with 6 attempts and 3 successes. Prob = 0.550 Actual: 0
Completed 388 with 6 attempts and 3 successes. Prob = 0.483 Actual: 0
Completed 389 with 10 attempts and 3 successes. Prob = 1.000 Actual: 0
Completed 390 with 10 attempts and 3 successes. Prob = 0.883 Actual: 0
Completed 391 with 3 attempts and 3 successes. Prob = 0.650 Actual: 0
Completed 392 with 4 attempts and 3 successes. Prob = 0.633 Actual: 0
Completed 393 with 7 attempts and 3 successes. Prob = 0.900 Actual: 0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 394 with 10 attempts and 0 successes. Prob = nan Actual: 0
Completed 395 with 3 attempts and 3 successes. Prob = 0.550 Actual: 0
Completed 396 with 4 attempts and 3 successes. Prob = 0.667 Actual: 0
Completed 397 with 10 attempts and 1 successes. Prob = 0.800 Actual: 0
Completed 398 with 8 attempts and 3 successes. Prob = 0.617 Actual: 0
Completed 399 with 3 attempts and 3 successes. Prob = 0.883 Actual: 0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 400 with 10 attempts and 0 successes. Prob = nan Actual: 1
Completed 401 with 5 attempts and 3 successes. Prob = 0.950 Actual: 1
Completed 402 with 7 attempts and 3 successes. Prob = 0.867 Actual: 0
Completed 403 with 4 attempts and 3 successes. Prob = 0.650 Actual: 1
Completed 404 with 10 attempts and 2 successes. Prob = 0.050 Actual: 0
Completed 405 with 3 attempts and 3 successes. Prob = 0.633 Actual: 1
Completed 406 with 5 attempts and 3 successes. Prob = 0.900 Actual: 0
Completed 407 with 10 attempts and 1 successes. Prob = 0.900 Actual: 0
Completed 408 with 5 attempts and 3 successes. Prob = 0.617 Actual: 0
Completed 409 with 3 attempts and 3 successes. Prob = 0.633 Actual: 0
Completed 410 with 4 attempts and 3 successes. Prob = 0.350 Actual: 0
Completed 411 with 3 attempts and 3 successes. Prob = 0.900 Actual: 0
Completed 412 with 5 attempts and 3 successes. Prob = 0.633 Actual: 0
Completed 413 with 3 attempts and 3 successes. Prob = 0.483 Actual: 0
Completed 414 with 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Completed 417 with 10 attempts and 0 successes. Prob = nan Actual: 0
Completed 418 with 3 attempts and 3 successes. Prob = 0.800 Actual: 1
Completed 419 with 3 attempts and 3 successes. Prob = 0.550 Actual: 0
Completed 420 with 3 attempts and 3 successes. Prob = 0.933 Actual: 0
Completed 421 with 4 attempts and 3 successes. Prob = 0.583 Actual: 0
Completed 422 with 5 attempts and 3 successes. Prob = 0.833 Actual: 1
Completed 423 with 5 attempts and 3 successes. Prob = 0.933 Actual: 0
Completed 424 with 7 attempts and 3 successes. Prob = 0.050 Actual: 1
Completed 425 with 3 attempts and 3 successes. Prob = 0.950 Actual: 0
Completed 426 with 4 attempts and 3 successes. Prob = 0.983 Actual: 0
Completed 427 with 3 attempts and 3 successes. Prob = 0.567 Actual: 0
Completed 428 with 3 attempts and 3 successes. Prob = 0.550 Actual: 0
Completed 429 with 6 attempts and 3 successes. Prob = 0.917 Actual: 0
Completed 430 with 10 attempts and 3 successes. Prob = 0.667 Actual: 0
Completed 431 with 4

In [114]:
# Calculate AUC and accuracy.
# Via https://gist.github.com/doraneko94/e24643136cfb8baf03ef8a314ab9615c
# Formula based on these articles:
# Hanley and McNeil, The meaning and use of the area under a receiver operating characteristic (ROC) curve. Radiology (1982) 43 (1) pp. 29-36.
# Fogarty, Baker and Hudson, Case Studies in the use of ROC Curve Analysis for Sensor-Based Estimates in Human Computer Interaction, Proceedings of Graphics Interface (2005) pp. 129-136.

import sklearn
from sklearn.metrics import roc_auc_score
from math import sqrt

def roc_auc_ci(y_true, y_score, positive=1):
    AUC = roc_auc_score(y_true, y_score)
    N1 = sum(y_true == positive)
    N2 = sum(y_true != positive)
    Q1 = AUC / (2 - AUC)
    Q2 = 2*AUC**2 / (1 + AUC)
    SE_AUC = sqrt((AUC*(1 - AUC) + (N1 - 1)*(Q1 - AUC**2) + (N2 - 1)*(Q2 - AUC**2)) / (N1*N2))
    lower = AUC - 1.96*SE_AUC
    upper = AUC + 1.96*SE_AUC
    if lower < 0:
        lower = 0
    if upper > 1:
        upper = 1
    return (lower, upper)

In [111]:
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 434 entries, 0 to 436
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   profile            434 non-null    object 
 1   videoURL           434 non-null    object 
 2   video_posted_date  434 non-null    object 
 3   is_asmr            434 non-null    int64  
 4   shortcode          434 non-null    object 
 5   web_url            434 non-null    object 
 6   file               434 non-null    object 
 7   found              434 non-null    int64  
 8   response           434 non-null    object 
 9   successes          434 non-null    int64  
 10  attempts           434 non-null    int64  
 11  pred_asmr          102 non-null    float64
dtypes: float64(1), int64(4), object(7)
memory usage: 60.2+ KB


Unnamed: 0,profile,videoURL,video_posted_date,is_asmr,shortcode,web_url,file,found,response,successes,attempts,pred_asmr
0,noniquennisa,https://www.instagram.com/reel/CnBxPwxDlG4/,2023-01-05 00:00:00,0,CnBxPwxDlG4,https://www.instagram.com/p/CnBxPwxDlG4/,noniquennisa/reels/2023-01-05 17.00.12 3008902...,1,[There is a 95% probability of rhythmic tappin...,3,3,0.95
1,noniquennisa,https://www.instagram.com/reel/CnEWAuGjhyg/,2023-01-06 00:00:00,0,CnEWAuGjhyg,https://www.instagram.com/p/CnEWAuGjhyg/,noniquennisa/reels/2023-01-06 17.00.17 3009627...,1,"[There is rhythmic tapping in the background, ...",3,3,0.636667
2,noniquennisa,https://www.instagram.com/reel/CnG7P4cjdmz/,2023-01-07 00:00:00,0,CnG7P4cjdmz,https://www.instagram.com/p/CnG7P4cjdmz/,noniquennisa/reels/2023-01-07 17.03.18 3010353...,1,[There is a 100% chance of rhythmic tapping or...,3,4,0.8
3,noniquennisa,https://www.instagram.com/reel/CnMEacLjnko/,2023-01-09 00:00:00,1,CnMEacLjnko,https://www.instagram.com/p/CnMEacLjnko/,noniquennisa/reels/2023-01-09 16.59.46 3011801...,1,"[There is rhythmic tapping in the background, ...",3,7,0.933333
4,noniquennisa,https://www.instagram.com/reel/CnOpQ3uDtr5/,2023-01-10 00:00:00,0,CnOpQ3uDtr5,https://www.instagram.com/p/CnOpQ3uDtr5/,noniquennisa/reels/2023-01-10 17.00.18 3012526...,1,"[There is rhythmic tapping in the background, ...",3,5,0.8


In [133]:
#####
# Data prep
true_val = df2.is_asmr
print(f"Prevalence (original df): {np.mean(true_val):0.4f}")
predicted_val = df2.pred_asmr
analysis_df = pd.concat([true_val, predicted_val], axis = 1)
# Drop rows missing the predicted prob or the true label (hopefully none are missing the true label).
analysis_df.dropna(inplace = True)


#####
# Analysis
print(f"Prevalence (analysis df): {np.mean(analysis_df['is_asmr'].values):0.4f}")
print("Analyzed sample size:", analysis_df.shape[0])
roc_result = sklearn.metrics.roc_auc_score(analysis_df['is_asmr'], analysis_df['pred_asmr'])
auc_ci = roc_auc_ci(analysis_df['is_asmr'], analysis_df['pred_asmr'])
print(f"AUC: {roc_result:0.4f} {auc_ci[0]:0.4f}-{auc_ci[1]:0.4f}")
accuracy = np.mean((analysis_df['pred_asmr'] > 0.99).astype(int) == analysis_df['is_asmr'])
print(f"Accuracy: {accuracy:.4f}")

Prevalence (original df): 0.2488
Prevalence (analysis df): 0.2457
Analyzed sample size: 411
AUC: 0.5632 0.4976-0.6289
Accuracy: 0.7518
