In [4]:
import os
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from transformers import pipeline

In [None]:
def load_metadata_videos(file_path):
    """Read a video-metadata CSV and keep only the fully populated rows.

    Args:
        file_path: Anything ``pd.read_csv`` accepts (path string or
            file-like object).

    Returns:
        pd.DataFrame: The CSV contents without the ``'Unnamed: 0'`` column
        (when the file has one) and without any row that contains a missing
        value. Row labels are not reset, so the index can have gaps.
    """
    metadata = pd.read_csv(file_path)
    # 'Unnamed: 0' is the name pandas assigns to a header-less first column
    # (presumably an index written out by an earlier to_csv). Files saved
    # without that column must load too, so a missing column is not an error.
    return metadata.drop(columns='Unnamed: 0', errors='ignore').dropna()

# Zero-shot pipeline shared by all bart_classification calls; built on first use
# because constructing it loads the whole model.
_ZERO_SHOT_CLASSIFIER = None


def _get_zero_shot_classifier():
    """Return the shared BART-MNLI zero-shot pipeline, creating it once."""
    global _ZERO_SHOT_CLASSIFIER
    if _ZERO_SHOT_CLASSIFIER is None:
        _ZERO_SHOT_CLASSIFIER = pipeline(
            "zero-shot-classification", model="facebook/bart-large-mnli"
        )
    return _ZERO_SHOT_CLASSIFIER


def bart_classification(text, candidate_labels, multi_label=True, plot=False, classifier=None):
    """Pick the best-matching label(s) for ``text`` with zero-shot classification.

    Args:
        text: The text to classify.
        candidate_labels: Labels the classifier scores ``text`` against.
        multi_label: Forwarded to the classifier. It also controls whether
            two or three near-tied labels may be returned together.
        plot: When true, draw the score bar chart via ``plot_scores``.
        classifier: Optional callable used instead of the shared pipeline.
            It is called as ``classifier(text, candidate_labels,
            multi_label=...)`` and must return a mapping with ``'scores'``
            and ``'labels'`` sequences of equal length.

    Returns:
        list[str]: One of
        - ``["misc"]`` when the best score is below 0.3;
        - the single best label when no other score is within 90% of it;
        - the two or three best labels (best first) when exactly that many
          scores are within 90% of the best and ``multi_label`` is true;
        - ``["uncertain"]`` otherwise.
    """
    if classifier is None:
        # Reuse one pipeline across calls instead of reloading the model each time.
        classifier = _get_zero_shot_classifier()

    result = classifier(text, candidate_labels, multi_label=multi_label)
    # Sort explicitly (highest score first) rather than relying on the
    # order in which the classifier reports its results.
    ranked = sorted(
        zip(result['scores'], result['labels']),
        key=lambda pair: pair[0],
        reverse=True,
    )
    scores, labels = zip(*ranked)

    max_score = scores[0]
    # Labels scoring at least 90% of the best score count as tied with it.
    threshold = max_score * 0.9
    top_count = sum(score >= threshold for score in scores)

    if plot:
        plot_scores(scores, labels)

    if max_score < 0.3:
        return ["misc"]
    if top_count == 1:
        return [labels[0]]
    if multi_label and top_count in (2, 3):
        return list(labels[:top_count])
    return ["uncertain"]


def plot_scores(scores, labels):
    """Show a bar chart of per-label scores with the highest bar highlighted.

    Args:
        scores: Numeric scores; must support ``.index`` (e.g. list or tuple).
        labels: Label names, in the same order as ``scores``.

    The highest-scoring bar (the first one on ties) is green and the others
    grey. Each bar is annotated with its score to two decimals. The figure is
    displayed with ``plt.show()`` and nothing is returned.
    """
    best_position = scores.index(max(scores))
    bar_colors = [
        'green' if position == best_position else 'grey'
        for position, _label in enumerate(labels)
    ]

    fig, ax = plt.subplots(figsize=(10, 4))
    bars = ax.bar(labels, scores, color=bar_colors)

    # Write each score just above its bar, centred horizontally.
    for rect, value in zip(bars, scores):
        x_centre = rect.get_x() + rect.get_width() / 2
        ax.text(
            x_centre,
            rect.get_height() + 0.01,
            f'{value:.2f}',
            ha='center',
            va='bottom',
            fontsize=10,
        )

    ax.set_title('Probability of Each Label for the Video', fontsize=14)
    ax.set_xlabel('Labels', fontsize=12)
    ax.set_ylabel('Probability', fontsize=12)
    # Leave headroom above the tallest bar for its score annotation.
    ax.set_ylim(0, max(scores) + 0.1)

    # Slanted tick labels keep long label names from overlapping.
    plt.setp(ax.get_xticklabels(), rotation=20, ha='right')

    ax.grid(axis='y', linestyle='--', alpha=0.7)

    fig.tight_layout()
    plt.show()

In [54]:
# Location of the video-metadata CSV. The default is the path this notebook
# was originally run with; set the EDUCATION_VIDEOS_CSV environment variable
# to point at the file on another machine without editing the notebook.
file_path = os.environ.get(
    "EDUCATION_VIDEOS_CSV",
    r"C:\Users\Flori\Docs\Python\M2_S3_ADA\Project - data\Education_videos_5.csv",
)
df_education = load_metadata_videos(file_path)

In [83]:
# Candidate labels for the "purpose" zero-shot classification.
# Disabled candidate, kept for reference: "study-tips or test preparation"
purpose_labels = [
    "lecture or academic course",  # exercise
    "hacks",
    "conference",
    "tutorial or DIY",
    "interview or Q&A or review",  # TODO: find a better label
    "kids content",
    "entertaining explanation or science popularization",
    "documentary",  # research based
]

# Candidate labels for the "level" zero-shot classification.
level_labels = ["beginner", "intermediate", "advanced"]

# Candidate labels for the "content" (topic) zero-shot classification.
content_labels = [
    "science or technology",
    "music or art",
    # NOTE(review): "filmaking" is misspelled ("filmmaking"); left as-is because
    # code outside this cell may match on the exact label text.
    "photography or videography or filmaking",
    "gaming",
    "chess or puzzles or logic",  # riddles
    "religion or spirituality",
    # NOTE(review): "phylosophy" is misspelled ("philosophy"); left as-is for
    # the same reason.
    "phylosophy or ethics",
    "history or politics",
    "economics or business",
    "financial education",
    "cryptocurrency",
    "food or cooking",
    "sport",
    "health or medicine",
    "travel",
    # The comma after the next entry was missing, which silently concatenated it
    # with "home repair or renovation" into a single bogus label.
    "motivational or personal development",
    "home repair or renovation",
    "beauty or fashion",
    "programming tools or coding",
    "foreign language",
    "sociology or culture",
    "psychology",
    "climate or environment",
    "wildlife or animals or nature",  # segment?
]

# Spot-check the three zero-shot classifications on 10 randomly drawn videos.
print('################################################')
random.seed(352)  # fixed seed so the same rows are drawn on every run
# Build the list of row labels once: rebuilding it inside the loop copied the
# whole index on every iteration. The draws themselves are unchanged.
candidate_rows = df_education.index.to_list()
for i in range(10):
    # random.choice samples with replacement, so a row may appear more than once.
    row = random.choice(candidate_rows)
    title = df_education.loc[row, 'title']
    tags = df_education.loc[row, 'tags']
    # Classifier input: the title followed by the tags, with a space added
    # after each comma in the tag list.
    combined_text = f"{title} {tags.replace(',', ', ')}"
    print('Row:', row)
    print('Title:', title)
    print('Tags:', tags)
    purpose = bart_classification(combined_text, purpose_labels, multi_label=True, plot=False)
    print("--> Purpose:", purpose)
    # Level is classified with multi_label=False, so it yields at most one label.
    level = bart_classification(combined_text, level_labels, multi_label=False, plot=False)
    print("--> Level:  ", level)
    content = bart_classification(combined_text, content_labels, multi_label=True, plot=False)
    print("--> Content:", content)
    print('################################################')

################################################
Row: 185983
Title: Singular and Non-Singular Matrices
Tags: singular matrix,non-singular matrix,matrix,matrices,matrices class 12,matrices engineering mathematics,matrices and determinants,matrices class 11,matrices for gate,matrices iit jee,matrices videos,matrices video lectures
--> Purpose: ['lecture or academic course']
--> Level:   ['advanced']
--> Content: ['science or technology']
################################################
Row: 88835
Title: Wednesday 11/26: Plastic Surgery to Prevent Infidelity? Teen Girl Impaled by Deer Antlers - Promo
Tags: the drs,Travis,ordon,Ross,Dr. Rachael,Health,Dr. Berman,Advice,mental,Physical,doctors,Jennifer,wellness,Daytime,Medicine,Berman,dr. ashton,Ordon,TV,tips,Mental,Recipe,travis,Investigate,dr. travis,dr. ordon,Andrew,the doctors cbs,Doctors,Dr. Ashton,Jim,dr. berman,The Doctors,Tips,Dr. Travis,dr. rachael,News,Dr. Sears,Stork,Beauty,Show,Talk,Rachael,Wellness,Ashton,health,advice,healthy,