# Part 1: Analysis of Ground Truth Data

# This script performs an analysis of the ground truth data, processes it by dropping irrelevant columns,
# and deduces the majority vote for each question per creative data ID. The final deduced data is saved for further use.


In [2]:
import pandas as pd


In [3]:
# Load the text data from the CSV file
text_data = pd.read_csv('Sample.csv')

In [4]:
# Check for any missing values in the text data
text_data.isnull().sum()

creative_data_id                          0
creative_data_title                       0
creative_data_description                 0
creative_data_duration                    0
creative_data_lifetime_spend_estimated    0
creative_data_lifetime_airings_count      0
creative_data_airing_date_first_et        0
creative_data_airing_date_last_et         0
speech                                    0
dtype: int64

In [5]:
# Load the ground truth data from the CSV file
ground_truth = pd.read_csv('groundtruth.csv')

In [6]:
# Check for any missing values in the ground truth data
ground_truth.isnull().sum()

Timestamp                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         0
creative_data_id                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [7]:
# Check the distribution of values in a specific column of the ground truth data
ground_truth['If yes to the above, did the ad successfully affect you emotionally, as intended?'].value_counts()

If yes to the above, did the ad successfully affect you emotionally, as intended?
Yes    211
No     148
Name: count, dtype: int64

In [8]:
# Define a fill value for missing data (though it isn't used in the code snippet)
fill_value = 'N/A'

In [9]:
# Print the shape (number of rows and columns) of the ground truth DataFrame
ground_truth.shape

(449, 36)

In [10]:
# Print the column names of the ground truth DataFrame
ground_truth.columns

Index(['Timestamp', 'creative_data_id',
       'Is there a call to go online (e.g., shop online, visit the Web)? ',
       'Is there online contact information provided (e.g., URL, website)? ',
       'Is there a visual or verbal call to purchase (e.g., buy now, order now)?',
       'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ',
       'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? ',
       'Is there offline contact information provided (e.g., phone, mail, store location)?',
       'Is there mention of something free? ',
       'Does the ad mention at least one specific product or service (e.g., model, type, item)? ',
       'Is there any verbal or visual mention of the price?',
       'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoe

In [11]:
# Check the distribution of values in the 'creative_data_id' column
ground_truth['creative_data_id'].value_counts()

creative_data_id
2953810    4
1641167    4
1776082    4
1683011    4
1671240    4
          ..
1744482    2
3312710    2
1788954    2
2612386    2
1471363    2
Name: count, Length: 150, dtype: int64

In [12]:
# Display rows in the ground truth data where 'creative_data_id' is 2953810
ground_truth[ground_truth['creative_data_id'] == 2953810]

Unnamed: 0,Timestamp,creative_data_id,"Is there a call to go online (e.g., shop online, visit the Web)?","Is there online contact information provided (e.g., URL, website)?","Is there a visual or verbal call to purchase (e.g., buy now, order now)?","Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?","Is there an incentive to buy (e.g., a discount, a coupon, a sale or ""limited time offer"")?","Is there offline contact information provided (e.g., phone, mail, store location)?",Is there mention of something free?,"Does the ad mention at least one specific product or service (e.g., model, type, item)?",...,Was there a famous person in this ad?,"If yes to the above, write the name of the famous person, if known.",What happened in this ad? (Answer in 2-3 sentences each),What was/were the company's goal(s) with this ad? Choose (potentially multiple) from:,How successful was the ad in achieving its goal(s)?,"How much did you like the ad? (1. Strongly dislike, 2. Dislike, 3. Neither Like or Dislike, 4. Like, 5. Strongly Like)","What was the slogan presented in the ad, if any?","After addressing the specific survey items, write a general description of the ad. You can use answers to the questions above to formulate your answer. Your description should include:\nBrand and Product Identification: \nSpecify the brand and whether a product is being advertised. (1 sentence)\nVisual Elements: Describe what is seen on the screen, including setting, characters, and any text or graphics. (max 2 sentences)\nAuditory Elements: Note what is heard, such as dialogue, voice-over, music, or sound effects. (max 2 sentences)\n",Any additional feedback or things we should be aware of?,Please enter the video identifier one more time (e.g. 123456789.mp4)
342,5/21/2024 1:57:27,2953810,Yes,Yes,Yes,Yes,Yes,Yes,No,Yes,...,No,,You follow multiple people using Ford vehicles...,"Increase awareness of product/brand, Change wh...",3,3,,Ford is advertising their Ford Promise. We see...,,2953810.mp4
343,5/21/2024 2:03:29,2953810,No,No,No,No,No,No,No,Yes,...,No,,We follow a couple and they are trying to find...,"Increase awareness of product/brand, Directly ...",3,3,no,Nissan is advertising their 2021 model. You wa...,,2953810.mp4
344,5/24/2024 16:28:24,2953810,No,Yes,Yes,Yes,No,No,No,No,...,No,na,"In this ad, a voice over talks about the ford ...",Increase awareness of product/brand,5,4,na,"The brand is ford. In this ad, a voice over ta...",na,2953810.mp4
345,5/28/2024 2:25:01,2953810,No,Yes,Yes,Yes,Yes,Yes,No,Yes,...,No,,Ford is offering a Return plan where the buyer...,"Increase awareness of product/brand, Directly ...",4,4,,The brand is Ford and there is a product being...,,2953810.mp4


In [13]:
# Define a list of columns to drop from the ground truth DataFrame
columns_to_drop = ["Timestamp", "If \"yes\" to the above, which of the following emotions is closest to the emotion that the ad was intending the viewer to feel? (Select all that apply.)", "If yes to the above, did the ad successfully affect you emotionally, as intended?", "If yes to the above, was the ad successfully funny, as intended?", "Was there a famous person in this ad? ", "If yes to the above, write the name of the famous person, if known.", "What happened in this ad? (Answer in 2-3 sentences each)", "What was/were the company's goal(s) with this ad? Choose (potentially multiple) from:", "How successful was the ad in achieving its goal(s)?", "How much did you like the ad? (1. Strongly dislike, 2. Dislike, 3. Neither Like or Dislike, 4. Like, 5. Strongly Like)", "What was the slogan presented in the ad, if any?", "After addressing the specific survey items, write a general description of the ad. You can use answers to the questions above to formulate your answer. Your description should include:\nBrand and Product Identification: \nSpecify the brand and whether a product is being advertised. (1 sentence)\nVisual Elements: Describe what is seen on the screen, including setting, characters, and any text or graphics. (max 2 sentences)\nAuditory Elements: Note what is heard, such as dialogue, voice-over, music, or sound effects. (max 2 sentences)\n", "Any additional feedback or things we should be aware of? ", "Please enter the video identifier one more time (e.g. 123456789.mp4)"]

In [14]:
# Print the column names of the ground truth DataFrame
ground_truth.columns

Index(['Timestamp', 'creative_data_id',
       'Is there a call to go online (e.g., shop online, visit the Web)? ',
       'Is there online contact information provided (e.g., URL, website)? ',
       'Is there a visual or verbal call to purchase (e.g., buy now, order now)?',
       'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ',
       'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? ',
       'Is there offline contact information provided (e.g., phone, mail, store location)?',
       'Is there mention of something free? ',
       'Does the ad mention at least one specific product or service (e.g., model, type, item)? ',
       'Is there any verbal or visual mention of the price?',
       'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoe

In [15]:
# Drop the specified columns from the ground truth DataFrame
ground_truth = ground_truth.drop(columns=columns_to_drop, axis=1)

In [16]:
# Print the column names of the updated ground truth DataFrame
ground_truth.columns

Index(['creative_data_id',
       'Is there a call to go online (e.g., shop online, visit the Web)? ',
       'Is there online contact information provided (e.g., URL, website)? ',
       'Is there a visual or verbal call to purchase (e.g., buy now, order now)?',
       'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ',
       'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? ',
       'Is there offline contact information provided (e.g., phone, mail, store location)?',
       'Is there mention of something free? ',
       'Does the ad mention at least one specific product or service (e.g., model, type, item)? ',
       'Is there any verbal or visual mention of the price?',
       'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel

In [17]:
# Print the shape (number of rows and columns) of the updated ground truth DataFrame
ground_truth.shape

(449, 22)

In [18]:
# Check for any missing values in the updated ground truth DataFrame
ground_truth.isnull().sum().sum()

0

In [19]:
# Display rows in the updated ground truth data where 'creative_data_id' is 2953810
ground_truth[ground_truth['creative_data_id'] == 2953810]

Unnamed: 0,creative_data_id,"Is there a call to go online (e.g., shop online, visit the Web)?","Is there online contact information provided (e.g., URL, website)?","Is there a visual or verbal call to purchase (e.g., buy now, order now)?","Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?","Is there an incentive to buy (e.g., a discount, a coupon, a sale or ""limited time offer"")?","Is there offline contact information provided (e.g., phone, mail, store location)?",Is there mention of something free?,"Does the ad mention at least one specific product or service (e.g., model, type, item)?",Is there any verbal or visual mention of the price?,...,"Is the ad intended to affect the viewer emotionally, either with positive emotion (fun, joy), negative emotion (sad, anxious) or another type of emotion? (Note: You may not personally agree, but assess if that was the intention.)",Does the ad give you a positive feeling about the brand?,"Does the ad have a story arc, with a beginning and an end?","Does the ad have a reversal of fortune, where something changes for the better, or changes for the worse?",Does the ad have relatable characters?,Is the ad creative/clever?,"Is the ad intended to be funny? (Note: You may not personally agree, but assess if that was the intention.)","Does this ad provide sensory stimulation (e.g., cool visuals, arousing music, mouth-watering)?",Is the ad visually pleasing?,"Does the ad have cute elements like animals, babies, animated, characters, etc?"
342,2953810,Yes,Yes,Yes,Yes,Yes,Yes,No,Yes,No,...,Yes,Yes,No,No,No,No,No,Yes,Yes,No
343,2953810,No,No,No,No,No,No,No,Yes,No,...,No,Yes,No,No,Yes,Yes,No,Yes,Yes,No
344,2953810,No,Yes,Yes,Yes,No,No,No,No,No,...,Yes,Yes,No,No,No,No,No,No,No,No
345,2953810,No,Yes,Yes,Yes,Yes,Yes,No,Yes,"Yes, both",...,No,Yes,No,Yes,No,Yes,No,Yes,Yes,No


# Check the distribution of values in specific columns of the updated ground truth DataFrame

In [20]:
ground_truth['Is there a call to go online (e.g., shop online, visit the Web)? '].value_counts()

Is there a call to go online (e.g., shop online, visit the Web)? 
No     301
Yes    148
Name: count, dtype: int64

In [21]:
ground_truth['Is there online contact information provided (e.g., URL, website)? '].value_counts()

Is there online contact information provided (e.g., URL, website)? 
No     242
Yes    207
Name: count, dtype: int64

In [22]:
ground_truth['Is there a visual or verbal call to purchase (e.g., buy now, order now)?'].value_counts()

Is there a visual or verbal call to purchase (e.g., buy now, order now)?
No     260
Yes    189
Name: count, dtype: int64

In [23]:
ground_truth['Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? '].value_counts()

Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? 
No     319
Yes    130
Name: count, dtype: int64

In [24]:
ground_truth['Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? '].value_counts()

Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? 
No     269
Yes    180
Name: count, dtype: int64

In [25]:
ground_truth['Is there offline contact information provided (e.g., phone, mail, store location)?'].value_counts()

Is there offline contact information provided (e.g., phone, mail, store location)?
No     350
Yes     99
Name: count, dtype: int64

In [26]:
ground_truth['Is there mention of something free? '].value_counts()

Is there mention of something free? 
No     412
Yes     37
Name: count, dtype: int64

In [27]:
ground_truth['Does the ad mention at least one specific product or service (e.g., model, type, item)? '].value_counts()

Does the ad mention at least one specific product or service (e.g., model, type, item)? 
Yes    365
No      84
Name: count, dtype: int64

In [28]:
ground_truth['Is there any verbal or visual mention of the price?'].value_counts()

Is there any verbal or visual mention of the price?
No             286
Yes, both      127
Yes, visual     31
Yes, verbal      5
Name: count, dtype: int64

In [29]:
ground_truth['Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel worn by celebrity athletes. The "Just Do It" slogan is another Nike trademark frequently included.'].value_counts()

Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel worn by celebrity athletes. The "Just Do It" slogan is another Nike trademark frequently included.
Yes    358
No      91
Name: count, dtype: int64

In [30]:
ground_truth['Does the ad show the brand or trademark exactly once at the end of the ad?'].value_counts()

Does the ad show the brand or trademark exactly once at the end of the ad?
Yes    339
No     110
Name: count, dtype: int64

In [31]:
ground_truth['Is the ad intended to affect the viewer emotionally, either with positive emotion (fun, joy), negative emotion (sad, anxious) or another type of emotion? (Note: You may not personally agree, but assess if that was the intention.)'].value_counts()

Is the ad intended to affect the viewer emotionally, either with positive emotion (fun, joy), negative emotion (sad, anxious) or another type of emotion? (Note: You may not personally agree, but assess if that was the intention.)
Yes    342
No     107
Name: count, dtype: int64

In [32]:
ground_truth['Does the ad give you a positive feeling about the brand? '].value_counts()

Does the ad give you a positive feeling about the brand? 
Yes    352
No      97
Name: count, dtype: int64

In [33]:
ground_truth['Does the ad have a story arc, with a beginning and an end? '].value_counts()

Does the ad have a story arc, with a beginning and an end? 
No     326
Yes    123
Name: count, dtype: int64

In [34]:
ground_truth['Does the ad have a reversal of fortune, where something changes for the better, or changes for the worse?'].value_counts()

Does the ad have a reversal of fortune, where something changes for the better, or changes for the worse?
No     368
Yes     81
Name: count, dtype: int64

In [35]:
ground_truth['Does the ad have relatable characters? '].value_counts()

Does the ad have relatable characters? 
No     228
Yes    221
Name: count, dtype: int64

In [36]:
ground_truth['Is the ad creative/clever?'].value_counts()

Is the ad creative/clever?
Yes    270
No     179
Name: count, dtype: int64

In [37]:
ground_truth['Is the ad intended to be funny? (Note: You may not personally agree, but assess if that was the intention.) '].value_counts()

Is the ad intended to be funny? (Note: You may not personally agree, but assess if that was the intention.) 
No     356
Yes     93
Name: count, dtype: int64

In [38]:
ground_truth['Does this ad provide sensory stimulation (e.g., cool visuals, arousing music, mouth-watering)? '].value_counts()

Does this ad provide sensory stimulation (e.g., cool visuals, arousing music, mouth-watering)? 
Yes    252
No     197
Name: count, dtype: int64

In [39]:
ground_truth['Is the ad visually pleasing?'].value_counts()

Is the ad visually pleasing?
Yes    301
No     148
Name: count, dtype: int64

In [40]:
ground_truth['Does the ad have cute elements like animals, babies, animated, characters, etc?'].value_counts()

Does the ad have cute elements like animals, babies, animated, characters, etc?
No     336
Yes    113
Name: count, dtype: int64

In [41]:
# Print the shape (number of rows and columns) of the updated ground truth DataFrame
ground_truth.shape

(449, 22)

In [42]:
# Function to perform majority vote on binary questions
def majority_vote(series):
    counts = series.value_counts()
    if counts.get('Yes', 0) >= counts.get('No', 0):  # Resolving ties in favor of 'Yes'
        return 'Yes'
    else:
        return 'No'

In [43]:
# Function to perform majority vote on the multi-class question
def majority_vote_special(series):
    counts = series.value_counts()
    if counts.get('Yes, both', 0) > max(counts.get('Yes, visual', 0), counts.get('Yes, verbal', 0), counts.get('No', 0)):
        return 'Yes, both'
    elif counts.get('Yes, visual', 0) > max(counts.get('Yes, both', 0), counts.get('Yes, verbal', 0), counts.get('No', 0)):
        return 'Yes, visual'
    elif counts.get('Yes, verbal', 0) > max(counts.get('Yes, both', 0), counts.get('Yes, visual', 0), counts.get('No', 0)):
        return 'Yes, verbal'
    else:
        return 'No'

In [44]:
# Group the ground truth data by 'creative_data_id'
grouped = ground_truth.groupby('creative_data_id')

In [45]:
# Deduce the majority vote for each group and create a new DataFrame
deduced_rows = []
for creative_data_id, group in grouped:
    deduced_row = {'creative_data_id': creative_data_id}
    for column in ground_truth.columns:
        if column != 'creative_data_id':
            if column == 'Is there any verbal or visual mention of the price?':  # Replace with the actual column name
                deduced_row[column] = majority_vote_special(group[column])
            else:
                deduced_row[column] = majority_vote(group[column])
    deduced_rows.append(deduced_row)

In [46]:
# Create a new DataFrame from the deduced rows
deduced_df = pd.DataFrame(deduced_rows)

In [47]:
# Save the deduced DataFrame to a CSV file
deduced_df.to_csv('deduced_ground_truth.csv', index=False)

In [48]:
# Print the shape (number of rows and columns) of the deduced DataFrame
deduced_df.shape

(150, 22)

In [49]:
# Print the column names of the deduced DataFrame
deduced_df.columns

Index(['creative_data_id',
       'Is there a call to go online (e.g., shop online, visit the Web)? ',
       'Is there online contact information provided (e.g., URL, website)? ',
       'Is there a visual or verbal call to purchase (e.g., buy now, order now)?',
       'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ',
       'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? ',
       'Is there offline contact information provided (e.g., phone, mail, store location)?',
       'Is there mention of something free? ',
       'Does the ad mention at least one specific product or service (e.g., model, type, item)? ',
       'Is there any verbal or visual mention of the price?',
       'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel

In [50]:
# Check the distribution of values in the 'creative_data_id' column of the deduced DataFrame
deduced_df['creative_data_id'].value_counts()

creative_data_id
1471363    1
2592911    1
2597996    1
2612386    1
2620437    1
          ..
2090919    1
2142915    1
2149098    1
2150923    1
3422482    1
Name: count, Length: 150, dtype: int64

In [51]:
# Check the distribution of values in the specific column of the deduced DataFrame
deduced_df['Is there any verbal or visual mention of the price?'].value_counts()

Is there any verbal or visual mention of the price?
No             102
Yes, both       40
Yes, visual      8
Name: count, dtype: int64

# Part 2: Working with Video Data

# This script processes video data by extracting frames and applying OCR to extract text from each frame.
# It also cleans the textual data to prepare it for further analysis.


In [53]:
# Install necessary libraries for working with videos and text extraction
!pip install torch torchvision torchaudio transformers pytesseract opencv-python-headless
!apt-get install tesseract-ocr

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached

In [54]:
import os
print(os.listdir('.'))

['.config', 'drive', 'deduced_ground_truth.csv', 'Sample.csv', 'groundtruth.csv', 'sample_data']


In [55]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure you have the necessary nltk packages
nltk.download('stopwords')
nltk.download('punkt')

# Function to clean text
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    text = ' '.join([word for word in words if word not in stop_words])
    return text

# Apply the function to clean the text columns
text_data['cleaned_description'] = text_data['creative_data_description'].apply(clean_text)
text_data['cleaned_speech'] = text_data['speech'].apply(clean_text)

# Display the cleaned text
print("Cleaned Descriptions:")
print(text_data['cleaned_description'].head())
print("\nCleaned Speech Text:")
print(text_data['cleaned_speech'].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Cleaned Descriptions:
0    kim going state farm drive safe save discount ...
1    uncomfortable shabby apartment roommate seems ...
2    flonase guesses wouldnt accept incomplete job ...
3    progressive box starts vlog takes videos amazi...
4    chevrolet owners tell stories silverados taken...
Name: cleaned_description, dtype: object

Cleaned Speech Text:
0    kim going big drive safe save discount yep usi...
1    check credit scores free learn improve credit ...
2    wouldnt accept incomplete job anyone else acce...
3    subword box 30 savings safe drivers coming bra...
4    got scar tissue thing dings truck story happen...
Name: cleaned_speech, dtype: object


In [56]:
import cv2
import pytesseract
import os
import pandas as pd

In [57]:
# Function to extract frames from a video and apply OCR to extract text from each frame
def extract_frames_and_ocr(video_path):
    cap = cv2.VideoCapture(video_path)
    ocr_text = []
    frame_count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 30 == 0:  # Extract one frame per second (assuming 30 FPS)
            text = pytesseract.image_to_string(frame)
            ocr_text.append(text)
        frame_count += 1

    cap.release()
    return ' '.join(ocr_text)

In [58]:
# Define the directory containing the video files
video_dir = '/content/drive/MyDrive/sample'

In [59]:
# Process each video file in the directory
video_data = []
for video_file in os.listdir(video_dir):
    if video_file.endswith('.mp4'):
        video_path = os.path.join(video_dir, video_file)
        video_text = extract_frames_and_ocr(video_path)
        video_id = os.path.splitext(video_file)[0]
        video_data.append({'video_id': video_id, 'video_text': video_text})


In [1]:
# Mount Google Drive to access video files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [60]:
# Create a DataFrame to store the extracted video text data
video_data_df = pd.DataFrame(video_data)

In [61]:
# Display the shape (number of rows and columns) of the video data DataFrame
video_data_df.shape

(150, 2)

In [62]:
# Display the column names of the video data DataFrame
video_data_df.columns

Index(['video_id', 'video_text'], dtype='object')

In [63]:
# Check the distribution of values in the 'video_id' column of the video data DataFrame
video_data_df['video_id'].value_counts()

video_id
2194673    1
3066063    1
2418354    1
2597996    1
3265336    1
          ..
2544459    1
1676138    1
2507887    1
1788954    1
3326009    1
Name: count, Length: 150, dtype: int64

# Part 3: Merging Video and Textual Data

# This script merges the extracted video data with the cleaned textual data,
# applies zero-shot classification to predict answers to predefined questions,
# and evaluates the predictions against deduced ground truth data.

In [65]:

text_data['creative_data_id']

0      2194673
1      2142915
2      1702851
3      1671980
4      1749291
        ...   
145    2755227
146    2259242
147    3124938
148    3264190
149    3326009
Name: creative_data_id, Length: 150, dtype: int64

In [66]:
# Convert the 'video_id' in video_data_df to integer for merging
video_data_df['video_id'] = video_data_df['video_id'].astype(int)

In [67]:
# Convert the 'creative_data_id' in text_data to integer for merging
text_data['creative_data_id'] = text_data['creative_data_id'].astype(int)

In [68]:
# Merge video data with text data on matching IDs
merged_data = pd.merge(video_data_df, text_data, left_on='video_id', right_on='creative_data_id')

In [69]:
# Display the shape of the merged DataFrame
merged_data.shape

(150, 13)

In [70]:
# Display the first few rows of the merged DataFrame
merged_data.head()

Unnamed: 0,video_id,video_text,creative_data_id,creative_data_title,creative_data_description,creative_data_duration,creative_data_lifetime_spend_estimated,creative_data_lifetime_airings_count,creative_data_airing_date_first_et,creative_data_airing_date_last_et,speech,cleaned_description,cleaned_speech
0,2194673,\n \n \n \n \n \n \n \n \n...,2194673,30s Kim's Discount - 2194673,Kim is going for the State Farm Drive Safe & S...,30,29789808.73,13949,2019-04-06T22:19:06-04:00,2020-08-04T18:42:50-04:00,"So Kim, you going for a big drive safe and sav...",kim going state farm drive safe save discount ...,kim going big drive safe save discount yep usi...
1,2142915,bi\n=\n\ncredit karma\n\n \n \n\ncredit ...,2142915,30s New Flat - 2142915,Uncomfortable with her shabby apartment and ro...,30,5423001.7,10132,2019-03-04T06:49:02-05:00,2021-08-03T11:12:36-04:00,Check your credit scores for free and learn ho...,uncomfortable shabby apartment roommate seems ...,check credit scores free learn improve credit ...
2,1671980,"\n \n \n\na P\n=, IR AC eR TU) accidents...",1671980,30s Box Vlog - 1671980,The Progressive Box starts his own vlog as he ...,30,44909836.61,7766,2018-01-01T00:17:22-05:00,2018-06-10T15:53:15-04:00,Subword. It's the box with 30% savings for saf...,progressive box starts vlog takes videos amazi...,subword box 30 savings safe drivers coming bra...
3,1702851,\n\n>\n\n‘te -\n\nom\n«a\n\n \n \n\n- - * o...,1702851,30s Most Pills Don't Finish the Job - 1702851,Flonase guesses you wouldn't accept an incompl...,30,23072716.78,8491,2018-02-19T07:42:14-05:00,2020-06-01T04:22:16-04:00,You wouldn't accept an incomplete job from any...,flonase guesses wouldnt accept incomplete job ...,wouldnt accept incomplete job anyone else acce...
4,2381477,\n \n \n\n5\n4\n8\n§\n~\nA\nH\n3\nA\n4\nE...,2381477,30s Matching Socks - 2381477,GEICO Customer Erin M. switched to GEICO Car I...,30,11445395.8,4530,2019-08-12T03:49:10-04:00,2019-10-11T12:50:11-04:00,I saved hundreds when I switched my car insura...,geico customer erin switched geico car insuran...,saved hundreds switched car insurance geico ma...


In [71]:
# Display the column names of the merged DataFrame
merged_data.columns

Index(['video_id', 'video_text', 'creative_data_id', 'creative_data_title',
       'creative_data_description', 'creative_data_duration',
       'creative_data_lifetime_spend_estimated',
       'creative_data_lifetime_airings_count',
       'creative_data_airing_date_first_et',
       'creative_data_airing_date_last_et', 'speech', 'cleaned_description',
       'cleaned_speech'],
      dtype='object')

In [72]:
# Drop the 'creative_data_id' column as it is redundant after merging
merged_data = merged_data.drop(columns=['creative_data_id'])

In [73]:
# Display the column names of the updated merged DataFrame
merged_data.columns

Index(['video_id', 'video_text', 'creative_data_title',
       'creative_data_description', 'creative_data_duration',
       'creative_data_lifetime_spend_estimated',
       'creative_data_lifetime_airings_count',
       'creative_data_airing_date_first_et',
       'creative_data_airing_date_last_et', 'speech', 'cleaned_description',
       'cleaned_speech'],
      dtype='object')

In [74]:
# Drop the original text columns as we will use the cleaned versions
columns = ['creative_data_description', 'speech']
merged_data = merged_data.drop(columns=columns)

In [75]:
# Display the column names of the updated merged DataFrame
merged_data.columns

Index(['video_id', 'video_text', 'creative_data_title',
       'creative_data_duration', 'creative_data_lifetime_spend_estimated',
       'creative_data_lifetime_airings_count',
       'creative_data_airing_date_first_et',
       'creative_data_airing_date_last_et', 'cleaned_description',
       'cleaned_speech'],
      dtype='object')

In [76]:
# Combine text from video OCR, cleaned description, and cleaned speech into a single column
merged_data['combined_text'] = merged_data.apply(lambda row: ' '.join([
    str(row['video_text']),
    str(row['cleaned_description']),
    str(row['cleaned_speech'])
]), axis=1)

In [77]:
# Drop the individual text columns as they are now combined
columns = ['video_text', 'cleaned_description', 'cleaned_speech']

In [78]:
merged_data = merged_data.drop(columns=columns)

In [79]:
# Display the column names of the updated merged DataFrame
merged_data.columns

Index(['video_id', 'creative_data_title', 'creative_data_duration',
       'creative_data_lifetime_spend_estimated',
       'creative_data_lifetime_airings_count',
       'creative_data_airing_date_first_et',
       'creative_data_airing_date_last_et', 'combined_text'],
      dtype='object')

In [80]:
# Display the first few rows of the combined text column
print(merged_data['combined_text'].head())

0      \n  \n  \n  \n  \n  \n  \n  \n  \n...
1     bi\n=\n\ncredit karma\n\n \n    \n\ncredit ...
2     \n  \n   \n\na P\n=, IR AC eR TU) accidents...
3     \n\n>\n\n‘te -\n\nom\n«a\n\n \n  \n\n- - * o...
4     \n  \n  \n\n5\n4\n8\n§\n~\nA\nH\n3\nA\n4\nE...
Name: combined_text, dtype: object


In [81]:
# Install the necessary libraries for zero-shot classification
!pip install transformers
!pip install torch



In [82]:
# Load the zero-shot classification pipeline
from transformers import pipeline

# Load the zero-shot classification pipeline
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [83]:
# Define the questions to be used for zero-shot classification
questions = [
       "Is there a call to go online (e.g., shop online, visit the Web)? ",
       "Is there online contact information provided (e.g., URL, website)? ",
       "Is there a visual or verbal call to purchase (e.g., buy now, order now)?",
       "Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ",
       "Is there an incentive to buy (e.g., a discount, a coupon, a sale or \"limited time offer\")? ",
       "Is there offline contact information provided (e.g., phone, mail, store location)?",
       "Is there mention of something free? ",
       "Does the ad mention at least one specific product or service (e.g., model, type, item)? ",
       "Is there any verbal or visual mention of the price?",
       "Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the \"swoosh\" logo prominently displayed on shoes and apparel worn by celebrity athletes. The \"Just Do It\" slogan is another Nike trademark frequently included.",
       "Does the ad show the brand or trademark exactly once at the end of the ad?",
       "Is the ad intended to affect the viewer emotionally, either with positive emotion (fun, joy), negative emotion (sad, anxious) or another type of emotion? (Note: You may not personally agree, but assess if that was the intention.)",
       "Does the ad give you a positive feeling about the brand? ",
       "Does the ad have a story arc, with a beginning and an end? ",
       "Does the ad have a reversal of fortune, where something changes for the better, or changes for the worse?",
       "Does the ad have relatable characters? ", "Is the ad creative/clever?",
       "Is the ad intended to be funny? (Note: You may not personally agree, but assess if that was the intention.) ",
       "Does this ad provide sensory stimulation (e.g., cool visuals, arousing music, mouth-watering)? ",
       "Is the ad visually pleasing?",
       "Does the ad have cute elements like animals, babies, animated, characters, etc?"]

In [84]:
# Function to get zero-shot predictions for each question
def get_zero_shot_predictions(text, questions):
    predictions = {}
    for question in questions:
        result = classifier(text, candidate_labels=[question], multi_label=True)
        predictions[question] = 'yes' if result['scores'][0] > 0.5 else 'no'
    return predictions


In [85]:
# Apply the zero-shot predictions to the combined text column
merged_data['predictions'] = merged_data['combined_text'].apply(lambda x: get_zero_shot_predictions(x, questions))

In [86]:
# Display the predictions
merged_data['predictions']

0      {'Is there a call to go online (e.g., shop onl...
1      {'Is there a call to go online (e.g., shop onl...
2      {'Is there a call to go online (e.g., shop onl...
3      {'Is there a call to go online (e.g., shop onl...
4      {'Is there a call to go online (e.g., shop onl...
                             ...                        
145    {'Is there a call to go online (e.g., shop onl...
146    {'Is there a call to go online (e.g., shop onl...
147    {'Is there a call to go online (e.g., shop onl...
148    {'Is there a call to go online (e.g., shop onl...
149    {'Is there a call to go online (e.g., shop onl...
Name: predictions, Length: 150, dtype: object

In [87]:
deduced_df['Is there any verbal or visual mention of the price?'].value_counts()

Is there any verbal or visual mention of the price?
No             102
Yes, both       40
Yes, visual      8
Name: count, dtype: int64

In [88]:
# Convert the predictions column from dictionary format to DataFrame
predictions_df = pd.DataFrame(merged_data['predictions'].tolist())

In [89]:
# Add the 'creative_data_id' column to the predictions DataFrame
predictions_df['creative_data_id'] = merged_data['video_id']

In [90]:
# Reorder the columns to place 'creative_data_id' at the beginning
columns = ['creative_data_id'] + [col for col in predictions_df if col != 'creative_data_id']
predictions_df = predictions_df[columns]

In [91]:
# Display the column names of the predictions DataFrame
predictions_df.columns


Index(['creative_data_id',
       'Is there a call to go online (e.g., shop online, visit the Web)? ',
       'Is there online contact information provided (e.g., URL, website)? ',
       'Is there a visual or verbal call to purchase (e.g., buy now, order now)?',
       'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ',
       'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? ',
       'Is there offline contact information provided (e.g., phone, mail, store location)?',
       'Is there mention of something free? ',
       'Does the ad mention at least one specific product or service (e.g., model, type, item)? ',
       'Is there any verbal or visual mention of the price?',
       'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel

In [92]:
# Display the shape of the predictions DataFrame
predictions_df.shape

(150, 22)

In [93]:
# Display the first few rows of the predictions DataFrame
predictions_df.head()

Unnamed: 0,creative_data_id,"Is there a call to go online (e.g., shop online, visit the Web)?","Is there online contact information provided (e.g., URL, website)?","Is there a visual or verbal call to purchase (e.g., buy now, order now)?","Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?","Is there an incentive to buy (e.g., a discount, a coupon, a sale or ""limited time offer"")?","Is there offline contact information provided (e.g., phone, mail, store location)?",Is there mention of something free?,"Does the ad mention at least one specific product or service (e.g., model, type, item)?",Is there any verbal or visual mention of the price?,...,"Is the ad intended to affect the viewer emotionally, either with positive emotion (fun, joy), negative emotion (sad, anxious) or another type of emotion? (Note: You may not personally agree, but assess if that was the intention.)",Does the ad give you a positive feeling about the brand?,"Does the ad have a story arc, with a beginning and an end?","Does the ad have a reversal of fortune, where something changes for the better, or changes for the worse?",Does the ad have relatable characters?,Is the ad creative/clever?,"Is the ad intended to be funny? (Note: You may not personally agree, but assess if that was the intention.)","Does this ad provide sensory stimulation (e.g., cool visuals, arousing music, mouth-watering)?",Is the ad visually pleasing?,"Does the ad have cute elements like animals, babies, animated, characters, etc?"
0,2194673,yes,yes,yes,no,yes,yes,yes,yes,yes,...,yes,yes,no,yes,yes,yes,yes,yes,yes,yes
1,2142915,yes,yes,yes,yes,yes,yes,yes,yes,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes
2,1671980,yes,yes,yes,yes,yes,yes,yes,yes,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes
3,1702851,no,yes,yes,yes,yes,no,yes,yes,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes
4,2381477,yes,yes,yes,no,yes,yes,yes,yes,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes


In [94]:
deduced_df.columns[9]

'Is there any verbal or visual mention of the price?'

In [95]:
predictions_df.columns[9]

'Is there any verbal or visual mention of the price?'

In [96]:
predictions_df['Is there any verbal or visual mention of the price?'].value_counts()

Is there any verbal or visual mention of the price?
yes    104
no      46
Name: count, dtype: int64

In [97]:
# Map the predicted values for 'Is there any verbal or visual mention of the price?' to binary values
deduced_df['Is there any verbal or visual mention of the price?'].value_counts()

Is there any verbal or visual mention of the price?
No             102
Yes, both       40
Yes, visual      8
Name: count, dtype: int64

In [98]:
# Map the deduced values for 'Is there any verbal or visual mention of the price?' to binary values
deduced_df['Is there any verbal or visual mention of the price?'] = deduced_df['Is there any verbal or visual mention of the price?'].map({
    'No': 0,
    'Yes, visual': 1,
    'Yes, both': 1
})


In [99]:
deduced_df['Is there any verbal or visual mention of the price?'].value_counts()

Is there any verbal or visual mention of the price?
0    102
1     48
Name: count, dtype: int64

In [100]:
predictions_df['Is there any verbal or visual mention of the price?'] = predictions_df['Is there any verbal or visual mention of the price?'].map({
    'no': 0,
    'yes': 1
})

In [101]:
predictions_df['Is there any verbal or visual mention of the price?'].value_counts()

Is there any verbal or visual mention of the price?
1    104
0     46
Name: count, dtype: int64

In [103]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [104]:
# Extract the true and predicted values for evaluation
y_true = deduced_df['Is there any verbal or visual mention of the price?']
y_pred = predictions_df['Is there any verbal or visual mention of the price?']

# Calculate the metrics
f1 = f1_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)

# Display the results
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")

F1 Score: 0.42105263157894735
Precision: 0.3076923076923077
Recall: 0.6666666666666666
Accuracy: 0.41333333333333333


In [105]:
# Identify the binary columns for metric calculations
binary_columns = deduced_df.columns[:9].tolist() + deduced_df.columns[10:].tolist()

In [106]:
binary_columns

['creative_data_id',
 'Is there a call to go online (e.g., shop online, visit the Web)? ',
 'Is there online contact information provided (e.g., URL, website)? ',
 'Is there a visual or verbal call to purchase (e.g., buy now, order now)?',
 'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ',
 'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? ',
 'Is there offline contact information provided (e.g., phone, mail, store location)?',
 'Is there mention of something free? ',
 'Does the ad mention at least one specific product or service (e.g., model, type, item)? ',
 'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel worn by celebrity athletes. The "Just Do It" slogan is another Nike trademark frequently included.',
 'Does the ad show t

In [107]:
# Remove 'creative_data_id' from the list of binary columns
binary_columns.remove('creative_data_id')

In [108]:
binary_columns

['Is there a call to go online (e.g., shop online, visit the Web)? ',
 'Is there online contact information provided (e.g., URL, website)? ',
 'Is there a visual or verbal call to purchase (e.g., buy now, order now)?',
 'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ',
 'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? ',
 'Is there offline contact information provided (e.g., phone, mail, store location)?',
 'Is there mention of something free? ',
 'Does the ad mention at least one specific product or service (e.g., model, type, item)? ',
 'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel worn by celebrity athletes. The "Just Do It" slogan is another Nike trademark frequently included.',
 'Does the ad show the brand or trademark

In [109]:
# Map binary values to 0 and 1 for the deduced and predicted DataFrames
for column in binary_columns:
    deduced_df[column] = deduced_df[column].replace({'Yes': 1, 'No': 0})
    predictions_df[column] = predictions_df[column].replace({'yes': 1, 'no': 0})

In [110]:
# Initialize lists to store metrics for each question
f1_scores = []
precisions = []
recalls = []
accuracies = []

# Calculate metrics for binary classification columns
for question in binary_columns:
    y_true = deduced_df[question]
    y_pred = predictions_df[question]

    if y_true.nunique() > 2 or y_pred.nunique() > 2:
        print(f"Non-binary values found in question: {question}")
        continue

    f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
    precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
    recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)

    f1_scores.append(f1)
    precisions.append(precision)
    recalls.append(recall)
    accuracies.append(accuracy)

In [111]:
# Combine results into a DataFrame
metrics_df = pd.DataFrame({
    'Question': binary_columns,
    'F1 Score': f1_scores,
    'Precision': precisions,
    'Recall': recalls,
    'Accuracy': accuracies
})

In [112]:
# Print each question along with its metrics
for index, row in metrics_df.iterrows():
    print(f"Question: {row['Question']}")
    print(f"F1 Score: {row['F1 Score']:.2f}")
    print(f"Precision: {row['Precision']:.2f}")
    print(f"Recall: {row['Recall']:.2f}")
    print(f"Accuracy: {row['Accuracy']:.2f}")
    print("-" * 30)

Question: Is there a call to go online (e.g., shop online, visit the Web)? 
F1 Score: 0.36
Precision: 0.27
Recall: 0.53
Accuracy: 0.46
------------------------------
Question: Is there online contact information provided (e.g., URL, website)? 
F1 Score: 0.58
Precision: 0.48
Recall: 0.74
Accuracy: 0.52
------------------------------
Question: Is there a visual or verbal call to purchase (e.g., buy now, order now)?
F1 Score: 0.50
Precision: 0.36
Recall: 0.84
Accuracy: 0.39
------------------------------
Question: Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? 
F1 Score: 0.39
Precision: 0.29
Recall: 0.57
Accuracy: 0.52
------------------------------
Question: Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? 
F1 Score: 0.57
Precision: 0.45
Recall: 0.77
Accuracy: 0.49
------------------------------
Question: Is there offline contact information provided (e.g., phone, mail, store location)?
F1 Score: 0

In [113]:
predictions_df.columns

Index(['creative_data_id',
       'Is there a call to go online (e.g., shop online, visit the Web)? ',
       'Is there online contact information provided (e.g., URL, website)? ',
       'Is there a visual or verbal call to purchase (e.g., buy now, order now)?',
       'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ',
       'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? ',
       'Is there offline contact information provided (e.g., phone, mail, store location)?',
       'Is there mention of something free? ',
       'Does the ad mention at least one specific product or service (e.g., model, type, item)? ',
       'Is there any verbal or visual mention of the price?',
       'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel

In [114]:
# Map 1 to 'yes' and 0 to 'no' for all columns except video_id
predictions_mapped_df = predictions_df.copy()
for column in predictions_mapped_df.columns:
    if column != 'creative_data_id':
        predictions_mapped_df[column] = predictions_mapped_df[column].map({1: 'yes', 0: 'no'})

# Save the mapped predictions dataframe to a CSV file
predictions_mapped_df.to_csv('predictions_mapped.csv', index=False)

print("CSV file 'predictions_mapped.csv' has been created with yes/no responses.")


CSV file 'predictions_mapped.csv' has been created with yes/no responses.
