In [75]:
import pandas as pd
from tqdm import tqdm
from functools import reduce


from src.models.llm_call_helpers import *

# Load data

Load the videos

In [76]:
# Read the video titles. The first CSV column is consumed as the index and then
# thrown away, so `videos` ends up with a fresh default integer index.
videos = (
    pd.read_csv('data/bb_videos_title_around_declines1.csv', index_col=0)
    .reset_index(drop=True)
)

videos

Unnamed: 0,channel_id,week,title
0,UCzVw9odnihM5PgKSv5UnDPA,247,Most Funny Babies and Kids Playing in Water - ...
1,UCzVw9odnihM5PgKSv5UnDPA,246,Kids and Babies Trying to do Exercises Funny...
2,UCzVw9odnihM5PgKSv5UnDPA,246,Cute and Adorable Moments of kids meeting newb...
3,UCzVw9odnihM5PgKSv5UnDPA,245,Cute Baby and His Daughter playing and Laughing
4,UCzVw9odnihM5PgKSv5UnDPA,244,Kids and Babies Meeting Animals in Village and...
...,...,...,...
107612,UCs06q9pyRn8d8xy0NgzKPrA,216,Tay K Allegedly Started a new gang in Jail cal...
107613,UCs06q9pyRn8d8xy0NgzKPrA,216,YNW Melly being investigated for a THIRD Murde...
107614,UCs06q9pyRn8d8xy0NgzKPrA,215,Face To Face: Lil Reese x DJ Akademiks: Talks ...
107615,UCrwmu-gceGOmtZeuTsn7DlQ,193,The Black Eyed Peas - BIG LOVE


Load the results of the LLM

In [77]:
# One CSV per LLM task; each is read with its first column as the index.
# The two tuples below are parallel: the n-th frame comes from the n-th path.
(
    df_apology,
    df_clickbait,
    df_break,
    df_comeback,
    df_featuring,
    df_decline_addressed,
) = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/bb_apology.csv',
        'data/bb_clickbait.csv',
        'data/bb_break.csv',
        'data/bb_comeback.csv',
        'data/bb_featuring.csv',
        'data/bb_decline_addressed.csv',
    )
)

# Merge LLM results together

In [78]:
# Put the video table and one label column per LLM task side by side.
# NOTE(review): concatenating along columns aligns rows on their index labels,
# so this presumably relies on every LLM CSV sharing the 0..n-1 index of
# `videos` — TODO confirm (mismatched labels would silently yield NaN rows).
df_videos_llm = pd.concat(
    objs=[
        videos,
        df_apology['apology'],
        df_clickbait['clickbait'],
        df_break['break'],
        df_comeback['comeback'],
        df_featuring['featuring'],
        df_decline_addressed['decline_addressed'],
    ],
    axis='columns',
)

df_videos_llm

Unnamed: 0,channel_id,week,title,apology,clickbait,break,comeback,featuring,decline_addressed
0,UCzVw9odnihM5PgKSv5UnDPA,247,Most Funny Babies and Kids Playing in Water - ...,False,True,False,False,False,False
1,UCzVw9odnihM5PgKSv5UnDPA,246,Kids and Babies Trying to do Exercises Funny...,False,True,False,False,False,False
2,UCzVw9odnihM5PgKSv5UnDPA,246,Cute and Adorable Moments of kids meeting newb...,False,True,False,False,False,False
3,UCzVw9odnihM5PgKSv5UnDPA,245,Cute Baby and His Daughter playing and Laughing,False,False,False,False,False,False
4,UCzVw9odnihM5PgKSv5UnDPA,244,Kids and Babies Meeting Animals in Village and...,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...
107612,UCs06q9pyRn8d8xy0NgzKPrA,216,Tay K Allegedly Started a new gang in Jail cal...,False,True,False,False,False,False
107613,UCs06q9pyRn8d8xy0NgzKPrA,216,YNW Melly being investigated for a THIRD Murde...,False,True,False,False,False,False
107614,UCs06q9pyRn8d8xy0NgzKPrA,215,Face To Face: Lil Reese x DJ Akademiks: Talks ...,False,True,False,False,True,False
107615,UCrwmu-gceGOmtZeuTsn7DlQ,193,The Black Eyed Peas - BIG LOVE,False,False,False,False,False,False


# Inspect and clean the results

In [79]:
# Distribution of the raw answers in each LLM label column

llm_columns = ['apology', 'clickbait', 'break', 'comeback', 'featuring', 'decline_addressed']

for col in llm_columns:
    # Column name, its value counts, then an empty line as a separator.
    print(col, df_videos_llm[col].value_counts(), '', sep='\n')

apology
apology
False    107419
True        198
Name: count, dtype: int64

clickbait
clickbait
True     78354
False    29262
To           1
Name: count, dtype: int64

break
break
False    107047
True        570
Name: count, dtype: int64

comeback
comeback
False    106714
True        903
Name: count, dtype: int64

featuring
featuring
False    103463
True       4150
I             1
I'm           1
Not           1
To            1
Name: count, dtype: int64

decline_addressed
decline_addressed
False    104940
True       2676
I             1
Name: count, dtype: int64



In [80]:
# Clean the results

def clean_result(result):
    """Convert one raw LLM answer into 1, 0 or None.

    Parameters
    ----------
    result : object
        A single cell of an LLM label column: a boolean, a number, a string
        such as 'True' / 'False', or anything else the model produced.

    Returns
    -------
    int or None
        1 for a positive answer, 0 for a negative answer, and None when the
        value cannot be read as a boolean (e.g. 'To', "I'm", NaN).
    """
    if isinstance(result, str):
        # Textual answers may carry stray whitespace or different casing
        # (' true', 'False\n'); normalise before comparing so that valid
        # answers are not discarded as missing.
        token = result.strip().lower()
        if token == 'true':
            return 1
        if token == 'false':
            return 0
        return None

    # `==` rather than `is` on purpose: it also accepts NumPy booleans and the
    # 0/1 (or 0.0/1.0) values produced by a previous run of this cleaning step,
    # which keeps re-running the cell harmless. NaN compares unequal to both.
    if result == True:
        return 1
    if result == False:
        return 0
    return None

# Map every raw answer to 1, 0 or a missing value, one label column at a time.
for col in llm_columns:
    df_videos_llm[col] = df_videos_llm[col].map(clean_result)

df_videos_llm

Unnamed: 0,channel_id,week,title,apology,clickbait,break,comeback,featuring,decline_addressed
0,UCzVw9odnihM5PgKSv5UnDPA,247,Most Funny Babies and Kids Playing in Water - ...,0,1.0,0,0,0.0,0.0
1,UCzVw9odnihM5PgKSv5UnDPA,246,Kids and Babies Trying to do Exercises Funny...,0,1.0,0,0,0.0,0.0
2,UCzVw9odnihM5PgKSv5UnDPA,246,Cute and Adorable Moments of kids meeting newb...,0,1.0,0,0,0.0,0.0
3,UCzVw9odnihM5PgKSv5UnDPA,245,Cute Baby and His Daughter playing and Laughing,0,0.0,0,0,0.0,0.0
4,UCzVw9odnihM5PgKSv5UnDPA,244,Kids and Babies Meeting Animals in Village and...,0,1.0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
107612,UCs06q9pyRn8d8xy0NgzKPrA,216,Tay K Allegedly Started a new gang in Jail cal...,0,1.0,0,0,0.0,0.0
107613,UCs06q9pyRn8d8xy0NgzKPrA,216,YNW Melly being investigated for a THIRD Murde...,0,1.0,0,0,0.0,0.0
107614,UCs06q9pyRn8d8xy0NgzKPrA,215,Face To Face: Lil Reese x DJ Akademiks: Talks ...,0,1.0,0,0,1.0,0.0
107615,UCrwmu-gceGOmtZeuTsn7DlQ,193,The Black Eyed Peas - BIG LOVE,0,0.0,0,0,0.0,0.0


In [81]:
# Number of missing (unparseable) answers in each LLM label column
null_counts = pd.DataFrame({
    'Column': llm_columns,
    # Column-wise count of missing cells, in the same order as `llm_columns`.
    'None Count': df_videos_llm[llm_columns].isna().sum().to_numpy(),
})
print(null_counts)

              Column  None Count
0            apology           0
1          clickbait           1
2              break           0
3           comeback           0
4          featuring           4
5  decline_addressed           1


Since the number of None values is very low (at most 6 rows out of more than 107,000), we can remove the corresponding videos from the dataset.

In [83]:
# Remove only the videos for which at least one LLM label could not be parsed.
# Restricting the check to `llm_columns` keeps a video from being discarded for
# an unrelated gap, such as a missing value in one of the metadata columns.
df_videos_llm = df_videos_llm.dropna(subset=llm_columns)
df_videos_llm

Unnamed: 0,channel_id,week,title,apology,clickbait,break,comeback,featuring,decline_addressed
0,UCzVw9odnihM5PgKSv5UnDPA,247,Most Funny Babies and Kids Playing in Water - ...,0,1.0,0,0,0.0,0.0
1,UCzVw9odnihM5PgKSv5UnDPA,246,Kids and Babies Trying to do Exercises Funny...,0,1.0,0,0,0.0,0.0
2,UCzVw9odnihM5PgKSv5UnDPA,246,Cute and Adorable Moments of kids meeting newb...,0,1.0,0,0,0.0,0.0
3,UCzVw9odnihM5PgKSv5UnDPA,245,Cute Baby and His Daughter playing and Laughing,0,0.0,0,0,0.0,0.0
4,UCzVw9odnihM5PgKSv5UnDPA,244,Kids and Babies Meeting Animals in Village and...,0,1.0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
107612,UCs06q9pyRn8d8xy0NgzKPrA,216,Tay K Allegedly Started a new gang in Jail cal...,0,1.0,0,0,0.0,0.0
107613,UCs06q9pyRn8d8xy0NgzKPrA,216,YNW Melly being investigated for a THIRD Murde...,0,1.0,0,0,0.0,0.0
107614,UCs06q9pyRn8d8xy0NgzKPrA,215,Face To Face: Lil Reese x DJ Akademiks: Talks ...,0,1.0,0,0,1.0,0.0
107615,UCrwmu-gceGOmtZeuTsn7DlQ,193,The Black Eyed Peas - BIG LOVE,0,0.0,0,0,0.0,0.0


# LLM Results Analysis

Now that we have the clean results of the LLM, we can analyze them statistically.

In [87]:
# Share of positive labels per LLM column, in percent. The labels are 0/1, so
# the column mean is the fraction of positives.
true_proportions = pd.DataFrame({
    'Column': llm_columns,
    'Proportion of True (%)': df_videos_llm[llm_columns].mean().mul(100).to_numpy(),
}).round(2)

print(true_proportions)

              Column  Proportion of True (%)
0            apology                    0.18
1          clickbait                   72.81
2              break                    0.53
3           comeback                    0.84
4          featuring                    3.86
5  decline_addressed                    2.49
