In [1]:
import pandas as pd
import re
import numpy as np
import random
import matplotlib.pyplot as plt
from transformers import pipeline
import torch

  from .autonotebook import tqdm as notebook_tqdm


device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Example: moving a tensor to the MPS device
tensor = torch.randn(2, 2).to(device)
print(tensor)

In [2]:
def load_metadata_videos(file_path):
    return pd.read_csv(file_path).drop(columns='Unnamed: 0').dropna()

def bart_classification(text, candidate_labels, multi_label=True, plot=False):
    # Choose device based on availability (Apple Silicon or CUDA)
    if torch.backends.mps.is_available():  # Apple Silicon GPUs
        device = "mps"
    elif torch.cuda.is_available():        # Other supported GPUs (CUDA)
        device = "cuda"
    else:
        device = "cpu"

    # Initialize the zero-shot classifier with GPU support
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)
    result = classifier(text, candidate_labels, multi_label=multi_label)
    
    scores, labels = result['scores'], result['labels']
    sorted_pairs = sorted(zip(scores, labels), key=lambda x: x[0], reverse=True)
    scores, labels = zip(*sorted_pairs)

    max_score = scores[0]
    threshold = max_score * 0.9
    top_count = len([i for i, score in enumerate(scores) if score >= threshold])

    if plot:
        plot_scores(scores, labels)

    if max_score < 0.3:
        return ["misc"]
    elif top_count == 1:
        return [labels[0]]
    elif top_count == 2 and multi_label:
        return [labels[0], labels[1]]
    elif top_count == 3 and multi_label:
        return [labels[0], labels[1], labels[2]]
    else:
        return ["uncertain"]

def plot_scores(scores, labels):
    # Identify the index of the maximum score
    max_index = scores.index(max(scores))

    # Define colors: green for the max score, grey for others
    colors = ['green' if i == max_index else 'grey' for i in range(len(labels))]

    # Create the bar chart
    plt.figure(figsize=(10, 4))
    bars = plt.bar(labels, scores, color=colors)

    # Add score labels above each bar
    for bar, score in zip(bars, scores):
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            height + 0.01,  # Slightly above the bar
            f'{score:.2f}',
            ha='center',
            va='bottom',
            fontsize=10
        )

    # Add titles and labels
    plt.title('Probability of Each Label for the Video', fontsize=14)
    plt.xlabel('Labels', fontsize=12)
    plt.ylabel('Probability', fontsize=12)
    plt.ylim(0, max(scores) + 0.1)  # Add some space on top for labels

    # Optional: Rotate x-axis labels if they are too long
    plt.xticks(rotation=20, ha='right')

    # Show grid for better readability
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Display the plot
    plt.tight_layout()
    plt.show()


In [3]:
file_path = "data/Education_videos_7.csv"
df_education = load_metadata_videos(file_path)

In [6]:
purpose_labels = [
    "lecture or academic course", #exercise
    #"study-tips or test preparation",
    "hacks", 
    "conference",
    "tutorial or DIY",
    "interview or Q&A or review", #FIND BETTER
    "kids content",
    "entertaining explanation or science popularization",
    "documentary" #research based
]

level_labels = [
    "beginner",
    "intermediate",
    "advanced",
]

content_labels = [
    "science or technology",
    "music or art",
    "photography or videography or filmaking",
    "gaming",
    "chess or puzzles or logic", #riddles
    "religion or spirituality",
    "phylosophy or ethics",
    "history or politics",
    "economics or business",
    "financial education",
    "cryptocurrency",
    "food or cooking",
    "sport",
    "health or medicine",
    "travel",
    "motivational or personal development",
    "home repair or renovation",  
    "beauty or fashion",
    "programming tools or coding",
    "foreign language",
    "sociology or culture",
    "psychology",
    "climate or environment",
    "wildlife or animals or nature" #segment?
]
"""
print('################################################')
random.seed(352)
for i in range(10):
    row = random.choice(df_education.index.to_list())
    title = df_education.loc[row, 'title']
    tags = df_education.loc[row, 'tags']
    combined_text = f"{title} {tags.replace(',', ', ')}"
    print('Row:', row)
    print('Title:', title)
    print('Tags:', tags)
    purpose = bart_classification(combined_text, purpose_labels, multi_label=True, plot=False)
    print("--> Purpose:", purpose)
    level = bart_classification(combined_text, level_labels, multi_label=False, plot=False)
    print("--> Level:  ", level)
    content = bart_classification(combined_text, content_labels, multi_label=True, plot=False)
    print("--> Content:", content)
    print('################################################')
"""

'\nprint(\'################################################\')\nrandom.seed(352)\nfor i in range(10):\n    row = random.choice(df_education.index.to_list())\n    title = df_education.loc[row, \'title\']\n    tags = df_education.loc[row, \'tags\']\n    combined_text = f"{title} {tags.replace(\',\', \', \')}"\n    print(\'Row:\', row)\n    print(\'Title:\', title)\n    print(\'Tags:\', tags)\n    purpose = bart_classification(combined_text, purpose_labels, multi_label=True, plot=False)\n    print("--> Purpose:", purpose)\n    level = bart_classification(combined_text, level_labels, multi_label=False, plot=False)\n    print("--> Level:  ", level)\n    content = bart_classification(combined_text, content_labels, multi_label=True, plot=False)\n    print("--> Content:", content)\n    print(\'################################################\')\n'

In [9]:
def classify_row(row):
    index = row.name
    
    if index % 1 == 0:
        print(f"Processing row {index}:")
        print(row)
    
    combined_text = f"{row['title']} {row['tags'].replace(',', ', ')}"
    purpose = bart_classification(combined_text, purpose_labels, multi_label=True, plot=False)
    level = bart_classification(combined_text, level_labels, multi_label=False, plot=False)
    content = bart_classification(combined_text, content_labels, multi_label=True, plot=False)
    
    return pd.Series({'purpose': purpose, 'level': level, 'content': content})

df_education[['purpose', 'level', 'content']] = df_education.apply(classify_row, axis=1)

print("Classification complete.")

file_path = "data/Education_videos_7_BART.csv"
df_education.to_csv(file_path, index=False)

print(f"Data saved to {file_path}.")

def classify_broad _dataset (batch, categories, on, classifier):
texts = batch[on]


broad_categories_labels = list (categories.keys())

results = classifier (texts, broad _categories_labels)

batch["broad _category"] = [res["labels"][©] for res in results]

batch "broad _confidence"] = [res["scores"][0] for res in results]
return batch

def classify_broad(data, categories, on, classifier, batch _size-16):
print ("Converting to dataset...")
dataset = Dataset. from _pandas(data)
print("Processing...")
dataset = dataset.map
classify_broad _dataset, batched-True, batch_size=batch_size,
fn_kwargs-("classifier": classifier, "categories": categories, "on": on},
print("Converting back to dataframe...")
Final data - dataset. to pandas return final data


Processing row 0:
categories                                               Education
channel_id                                UC4avQL0dz2oTQtJqXyVfe8g
crawl_date                              2019-11-17 12:43:19.304589
description      http://bit.ly/km-Kabbalah-Course / Excerpt fro...
dislike_count                                                  0.0
display_id                                             oG4rYTF7WYQ
duration                                                        70
like_count                                                     8.0
tags             daily,kabbalah,lesson,moments,michael,laitman,...
title            That's It We Are Ready To Bond - Kabbalah Mome...
upload_date                                    2010-11-17 00:00:00
view_count                                                   444.0
Name: 0, dtype: object
Processing row 1:
categories                                               Education
channel_id                                UC4avQL0dz2oTQtJqXyVfe8g
cra

KeyboardInterrupt: 

In [11]:
from datasets import Dataset

# Initialize BART classification pipeline (Assume classifier is a Hugging Face pipeline)
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define categories
purpose_labels = [
    "lecture or academic course", #exercise
    #"study-tips or test preparation",
    "hacks", 
    "conference",
    "tutorial or DIY",
    "interview or Q&A or review", #FIND BETTER
    "kids content",
    "entertaining explanation or science popularization",
    "documentary" #research based
]

level_labels = [
    "beginner",
    "intermediate",
    "advanced",
]

content_labels = [
    "science or technology",
    "music or art",
    "photography or videography or filmaking",
    "gaming",
    "chess or puzzles or logic", #riddles
    "religion or spirituality",
    "phylosophy or ethics",
    "history or politics",
    "economics or business",
    "financial education",
    "cryptocurrency",
    "food or cooking",
    "sport",
    "health or medicine",
    "travel",
    "motivational or personal development",
    "home repair or renovation",  
    "beauty or fashion",
    "programming tools or coding",
    "foreign language",
    "sociology or culture",
    "psychology",
    "climate or environment",
    "wildlife or animals or nature" #segment?
]
# Custom function to classify each row in a batch
def classify_row_batch(batch, classifier, purpose_labels, level_labels, content_labels):
    texts = [f"{title} {tags.replace(',', ', ')}" for title, tags in zip(batch['title'], batch['tags'])]
    
    purpose_results = classifier(texts, purpose_labels, multi_label=True)
    level_results = classifier(texts, level_labels, multi_label=False)
    content_results = classifier(texts, content_labels, multi_label=True)

    # Extract and structure classification results
    purposes = [res['labels'] for res in purpose_results]
    levels = [res['labels'][0] for res in level_results]  # single label, take first
    contents = [res['labels'] for res in content_results]

    batch['purpose'] = purposes
    batch['level'] = levels
    batch['content'] = contents
    
    return batch

# Process function to handle batching in `datasets`
def classify_data(data, classifier, batch_size=32):
    print("Converting to Dataset...")
    dataset = Dataset.from_pandas(data)
    
    print("Processing in batches...")
    dataset = dataset.map(
        lambda batch: classify_row_batch(
            batch, classifier, purpose_labels, level_labels, content_labels
        ),
        batched=True,
        batch_size=batch_size
    )
    
    print("Converting back to DataFrame...")
    return dataset.to_pandas()

# Load data
df_education = pd.read_csv("data/Education_videos_7.csv")

# Run classification
df_education = classify_data(df_education, classifier, batch_size=32)

# Save results
file_path = "data/Education_videos_7_BART.csv"
df_education.to_csv(file_path, index=False)
print(f"Data saved to {file_path}.")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Converting to Dataset...


ArrowTypeError: ('Did not pass numpy.dtype object', 'Conversion failed for column Unnamed: 0 with type int64')

In [12]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import pipeline
from datasets import Dataset

# Function to load and preprocess metadata
def load_metadata_videos(file_path):
    # Load CSV, drop unnecessary column, and remove rows with missing data
    return pd.read_csv(file_path).drop(columns=['Unnamed: 0'], errors='ignore').dropna()

# Function to classify text using BART and zero-shot classification
def bart_classification(text, candidate_labels, multi_label=True, plot=False):
    # Determine device (Apple Silicon or CUDA for GPU if available, else CPU)
    if torch.backends.mps.is_available():
        device = "mps"  # Apple Silicon GPU
    elif torch.cuda.is_available():
        device = "cuda"  # CUDA GPU
    else:
        device = "cpu"

    # Initialize the classifier with GPU support
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)
    result = classifier(text, candidate_labels, multi_label=multi_label)
    
    # Sort results by score
    scores, labels = result['scores'], result['labels']
    sorted_pairs = sorted(zip(scores, labels), key=lambda x: x[0], reverse=True)
    scores, labels = zip(*sorted_pairs)

    # Thresholding logic to select top labels based on scores
    max_score = scores[0]
    threshold = max_score * 0.9
    top_count = len([score for score in scores if score >= threshold])

    if plot:
        plot_scores(scores, labels)

    # Classification decisions based on scores and multi-label setting
    if max_score < 0.3:
        return ["misc"]
    elif top_count == 1:
        return [labels[0]]
    elif top_count == 2 and multi_label:
        return [labels[0], labels[1]]
    elif top_count == 3 and multi_label:
        return [labels[0], labels[1], labels[2]]
    else:
        return ["uncertain"]

# Define labels for different categories
purpose_labels = [
    "lecture or academic course", "hacks", "conference", "tutorial or DIY",
    "interview or Q&A or review", "kids content", 
    "entertaining explanation or science popularization", "documentary"
]

level_labels = [
    "beginner", "intermediate", "advanced"
]

content_labels = [
    "science or technology", "music or art", "photography or videography or filmaking",
    "gaming", "chess or puzzles or logic", "religion or spirituality",
    "philosophy or ethics", "history or politics", "economics or business",
    "financial education", "cryptocurrency", "food or cooking", "sport",
    "health or medicine", "travel", "motivational or personal development",
    "home repair or renovation", "beauty or fashion", "programming tools or coding",
    "foreign language", "sociology or culture", "psychology", "climate or environment",
    "wildlife or animals or nature"
]

# Function to classify a batch of rows
def classify_row_batch(batch, classifier, purpose_labels, level_labels, content_labels):
    texts = [f"{title} {tags.replace(',', ', ')}" for title, tags in zip(batch['title'], batch['tags'])]
    
    # Classify texts for each category
    purpose_results = classifier(texts, purpose_labels, multi_label=True)
    level_results = classifier(texts, level_labels, multi_label=False)
    content_results = classifier(texts, content_labels, multi_label=True)

    # Extract labels and store results
    purposes = [res['labels'] for res in purpose_results]
    levels = [res['labels'][0] for res in level_results]  # Single label
    contents = [res['labels'] for res in content_results]

    batch['purpose'] = purposes
    batch['level'] = levels
    batch['content'] = contents
    
    return batch

# Function to handle classification on the entire dataset in batches
def classify_data(data, classifier, batch_size=32):
    print("Converting to Dataset...")
    dataset = Dataset.from_pandas(data)
    
    print("Processing in batches...")
    dataset = dataset.map(
        lambda batch: classify_row_batch(
            batch, classifier, purpose_labels, level_labels, content_labels
        ),
        batched=True,
        batch_size=batch_size
    )
    
    print("Converting back to DataFrame...")
    return dataset.to_pandas()

# Load and preprocess data
file_path = "data/Education_videos_7.csv"
df_education = load_metadata_videos(file_path)

# Initialize classifier pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Run classification
df_education = classify_data(df_education, classifier, batch_size=32)

# Save the classified data
output_file_path = "data/Education_videos_7_BART.csv"
df_education.to_csv(output_file_path, index=False)
print(f"Data saved to {output_file_path}.")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Converting to Dataset...


ArrowTypeError: ('Did not pass numpy.dtype object', 'Conversion failed for column duration with type int64')

In [13]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import pipeline
from datasets import Dataset

# Function to load and preprocess metadata
def load_metadata_videos(file_path):
    # Load CSV, drop unnecessary column, and remove rows with missing data
    return pd.read_csv(file_path).drop(columns=['Unnamed: 0'], errors='ignore').dropna()

# Function to classify text using BART and zero-shot classification
def bart_classification(text, candidate_labels, multi_label=True, plot=False):
    # Determine device (Apple Silicon or CUDA for GPU if available, else CPU)
    device = 0 if torch.cuda.is_available() else -1  # CUDA for GPU if available, else CPU

    # Initialize the classifier with GPU support
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)
    result = classifier(text, candidate_labels, multi_label=multi_label)
    
    # Sort results by score
    scores, labels = result['scores'], result['labels']
    sorted_pairs = sorted(zip(scores, labels), key=lambda x: x[0], reverse=True)
    scores, labels = zip(*sorted_pairs)

    # Thresholding logic to select top labels based on scores
    max_score = scores[0]
    threshold = max_score * 0.9
    top_count = len([score for score in scores if score >= threshold])

    # Classification decisions based on scores and multi-label setting
    if max_score < 0.3:
        return ["misc"]
    elif top_count == 1:
        return [labels[0]]
    elif top_count == 2 and multi_label:
        return [labels[0], labels[1]]
    elif top_count == 3 and multi_label:
        return [labels[0], labels[1], labels[2]]
    else:
        return ["uncertain"]

# Define labels for different categories
purpose_labels = [
    "lecture or academic course", "hacks", "conference", "tutorial or DIY",
    "interview or Q&A or review", "kids content", 
    "entertaining explanation or science popularization", "documentary"
]

level_labels = [
    "beginner", "intermediate", "advanced"
]

content_labels = [
    "science or technology", "music or art", "photography or videography or filmmaking",
    "gaming", "chess or puzzles or logic", "religion or spirituality",
    "philosophy or ethics", "history or politics", "economics or business",
    "financial education", "cryptocurrency", "food or cooking", "sport",
    "health or medicine", "travel", "motivational or personal development",
    "home repair or renovation", "beauty or fashion", "programming tools or coding",
    "foreign language", "sociology or culture", "psychology", "climate or environment",
    "wildlife or animals or nature"
]

# Function to classify a batch of rows
def classify_row_batch(batch, classifier, purpose_labels, level_labels, content_labels):
    texts = [f"{title} {tags.replace(',', ', ')}" for title, tags in zip(batch['title'], batch['tags'])]
    
    # Classify texts for each category
    purpose_results = classifier(texts, purpose_labels, multi_label=True)
    level_results = classifier(texts, level_labels, multi_label=False)
    content_results = classifier(texts, content_labels, multi_label=True)

    # Extract labels and store results
    purposes = [res['labels'] for res in purpose_results]
    levels = [res['labels'][0] for res in level_results]  # Single label
    contents = [res['labels'] for res in content_results]

    batch['purpose'] = purposes
    batch['level'] = levels
    batch['content'] = contents
    
    return batch

# Function to handle classification on the entire dataset in batches
def classify_data(data, classifier, batch_size=32):
    # Check for dtype issues and convert incompatible columns
    for col in data.select_dtypes(include=['int64', 'float64']).columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    print("Converting to Dataset...")
    dataset = Dataset.from_pandas(data)
    
    print("Processing in batches...")
    dataset = dataset.map(
        lambda batch: classify_row_batch(
            batch, classifier, purpose_labels, level_labels, content_labels
        ),
        batched=True,
        batch_size=batch_size
    )
    
    print("Converting back to DataFrame...")
    return dataset.to_pandas()

# Load and preprocess data
file_path = "data/Education_videos_7.csv"
df_education = load_metadata_videos(file_path)

# Initialize classifier pipeline
device = 0 if torch.cuda.is_available() else -1  # Ensure GPU use if available
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Run classification
df_education = classify_data(df_education, classifier, batch_size=32)

# Save the classified data
output_file_path = "data/Education_videos_7_BART.csv"
df_education.to_csv(output_file_path, index=False)
print(f"Data saved to {output_file_path}.")


Converting to Dataset...


ArrowTypeError: ('Did not pass numpy.dtype object', 'Conversion failed for column duration with type int64')

In [14]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import pipeline
from datasets import Dataset

# Load CSV and drop unnamed or null columns
def load_metadata_videos(file_path):
    df = pd.read_csv(file_path).dropna()
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]  # Drop unnamed columns
    return df

# Check data types and convert incompatible types for `Dataset`
def clean_data_types(df):
    # Convert integer columns to `float` to avoid Arrow errors
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = df[col].astype('float')
        elif pd.api.types.is_object_dtype(df[col]):
            df[col] = df[col].astype('string')  # Ensure all text columns are strings
    return df

# Function to classify text using BART and zero-shot classification
def bart_classification(text, candidate_labels, multi_label=True):
    device = 0 if torch.cuda.is_available() else -1  # Use GPU if available

    # Initialize the classifier with GPU support
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)
    result = classifier(text, candidate_labels, multi_label=multi_label)
    
    # Sort and threshold scores
    scores, labels = result['scores'], result['labels']
    sorted_pairs = sorted(zip(scores, labels), key=lambda x: x[0], reverse=True)
    scores, labels = zip(*sorted_pairs)
    
    max_score = scores[0]
    threshold = max_score * 0.9
    top_count = len([score for score in scores if score >= threshold])

    # Classification logic based on score thresholds
    if max_score < 0.3:
        return ["misc"]
    elif top_count == 1:
        return [labels[0]]
    elif top_count == 2 and multi_label:
        return [labels[0], labels[1]]
    elif top_count == 3 and multi_label:
        return [labels[0], labels[1], labels[2]]
    else:
        return ["uncertain"]

# Define labels for different categories
purpose_labels = ["lecture or academic course", "hacks", "conference", "tutorial or DIY",
                  "interview or Q&A or review", "kids content", "entertaining explanation or science popularization",
                  "documentary"]
level_labels = ["beginner", "intermediate", "advanced"]
content_labels = ["science or technology", "music or art", "photography or videography or filmmaking", "gaming",
                  "chess or puzzles or logic", "religion or spirituality", "philosophy or ethics", "history or politics",
                  "economics or business", "financial education", "cryptocurrency", "food or cooking", "sport",
                  "health or medicine", "travel", "motivational or personal development", "home repair or renovation",
                  "beauty or fashion", "programming tools or coding", "foreign language", "sociology or culture",
                  "psychology", "climate or environment", "wildlife or animals or nature"]

# Classify each batch row by row
def classify_row_batch(batch, classifier, purpose_labels, level_labels, content_labels):
    texts = [f"{title} {tags.replace(',', ', ')}" for title, tags in zip(batch['title'], batch['tags'])]
    
    purpose_results = classifier(texts, purpose_labels, multi_label=True)
    level_results = classifier(texts, level_labels, multi_label=False)
    content_results = classifier(texts, content_labels, multi_label=True)

    # Store results
    batch['purpose'] = [res['labels'] for res in purpose_results]
    batch['level'] = [res['labels'][0] for res in level_results]  # single label
    batch['content'] = [res['labels'] for res in content_results]
    
    return batch

# Full classification process with data type cleaning
def classify_data(data, classifier, batch_size=32):
    # Clean data types for compatibility with Dataset
    data = clean_data_types(data)
    
    print("Converting to Dataset...")
    dataset = Dataset.from_pandas(data)  # Should now handle all columns
    print("Processing in batches...")
    
    # Apply classification in batches
    dataset = dataset.map(
        lambda batch: classify_row_batch(batch, classifier, purpose_labels, level_labels, content_labels),
        batched=True,
        batch_size=batch_size
    )
    
    print("Converting back to DataFrame...")
    return dataset.to_pandas()

# Load and preprocess data
file_path = "data/Education_videos_7.csv"
df_education = load_metadata_videos(file_path)

# Initialize classifier pipeline with device specified
device = 0 if torch.cuda.is_available() else -1  # Ensure GPU use if available
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Run classification
df_education = classify_data(df_education, classifier, batch_size=32)

# Save the classified data
output_file_path = "data/Education_videos_7_BART.csv"
df_education.to_csv(output_file_path, index=False)
print(f"Data saved to {output_file_path}.")


Converting to Dataset...


ArrowTypeError: ('Did not pass numpy.dtype object', 'Conversion failed for column None with type int64')

In [15]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import pipeline
from datasets import Dataset

# Load CSV and drop unnamed or null columns
def load_metadata_videos(file_path):
    df = pd.read_csv(file_path).dropna()
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]  # Drop unnamed columns
    return df

# Drop unnecessary columns to avoid Arrow errors (like `duration`)
def preprocess_data(df, columns_to_drop):
    df = df.drop(columns=columns_to_drop, errors='ignore')  # Drop specified columns if they exist
    return df

# Function to classify text using BART and zero-shot classification
def bart_classification(text, candidate_labels, multi_label=True):
    device = 0 if torch.cuda.is_available() else -1  # Use GPU if available

    # Initialize the classifier with GPU support
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)
    result = classifier(text, candidate_labels, multi_label=multi_label)
    
    # Sort and threshold scores
    scores, labels = result['scores'], result['labels']
    sorted_pairs = sorted(zip(scores, labels), key=lambda x: x[0], reverse=True)
    scores, labels = zip(*sorted_pairs)
    
    max_score = scores[0]
    threshold = max_score * 0.9
    top_count = len([score for score in scores if score >= threshold])

    # Classification logic based on score thresholds
    if max_score < 0.3:
        return ["misc"]
    elif top_count == 1:
        return [labels[0]]
    elif top_count == 2 and multi_label:
        return [labels[0], labels[1]]
    elif top_count == 3 and multi_label:
        return [labels[0], labels[1], labels[2]]
    else:
        return ["uncertain"]

# Define labels for different categories
purpose_labels = ["lecture or academic course", "hacks", "conference", "tutorial or DIY",
                  "interview or Q&A or review", "kids content", "entertaining explanation or science popularization",
                  "documentary"]
level_labels = ["beginner", "intermediate", "advanced"]
content_labels = ["science or technology", "music or art", "photography or videography or filmmaking", "gaming",
                  "chess or puzzles or logic", "religion or spirituality", "philosophy or ethics", "history or politics",
                  "economics or business", "financial education", "cryptocurrency", "food or cooking", "sport",
                  "health or medicine", "travel", "motivational or personal development", "home repair or renovation",
                  "beauty or fashion", "programming tools or coding", "foreign language", "sociology or culture",
                  "psychology", "climate or environment", "wildlife or animals or nature"]

# Classify each batch row by row
def classify_row_batch(batch, classifier, purpose_labels, level_labels, content_labels):
    texts = [f"{title} {tags.replace(',', ', ')}" for title, tags in zip(batch['title'], batch['tags'])]
    
    purpose_results = classifier(texts, purpose_labels, multi_label=True)
    level_results = classifier(texts, level_labels, multi_label=False)
    content_results = classifier(texts, content_labels, multi_label=True)

    # Store results
    batch['purpose'] = [res['labels'] for res in purpose_results]
    batch['level'] = [res['labels'][0] for res in level_results]  # single label
    batch['content'] = [res['labels'] for res in content_results]
    
    return batch

# Full classification process with data type cleaning
def classify_data(data, classifier, batch_size=32):
    print("Dropping unnecessary columns and converting to Dataset...")
    data = preprocess_data(data, columns_to_drop=['duration'])  # Drop `duration` or any other columns causing errors
    
    # Convert to Dataset
    dataset = Dataset.from_pandas(data)
    print("Processing in batches...")
    
    # Apply classification in batches
    dataset = dataset.map(
        lambda batch: classify_row_batch(batch, classifier, purpose_labels, level_labels, content_labels),
        batched=True,
        batch_size=batch_size
    )
    
    print("Converting back to DataFrame...")
    return dataset.to_pandas()

# Load and preprocess data
file_path = "data/Education_videos_7.csv"
df_education = load_metadata_videos(file_path)

# Initialize classifier pipeline with device specified
device = 0 if torch.cuda.is_available() else -1  # Ensure GPU use if available
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Run classification
df_education = classify_data(df_education, classifier, batch_size=32)

# Save the classified data
output_file_path = "data/Education_videos_7_BART.csv"
df_education.to_csv(output_file_path, index=False)
print(f"Data saved to {output_file_path}.")


Dropping unnecessary columns and converting to Dataset...


ArrowTypeError: ('Did not pass numpy.dtype object', 'Conversion failed for column None with type int64')

In [16]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import pipeline
from datasets import Dataset

# Load CSV and select only necessary columns
def load_metadata_videos(file_path):
    df = pd.read_csv(file_path, usecols=['title', 'tags']).dropna()  # Only load 'title' and 'tags' columns
    return df

# Function to classify text using BART and zero-shot classification
def bart_classification(text, candidate_labels, multi_label=True):
    device = 0 if torch.cuda.is_available() else -1  # Use GPU if available

    # Initialize the classifier with GPU support
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)
    result = classifier(text, candidate_labels, multi_label=multi_label)
    
    # Sort and threshold scores
    scores, labels = result['scores'], result['labels']
    sorted_pairs = sorted(zip(scores, labels), key=lambda x: x[0], reverse=True)
    scores, labels = zip(*sorted_pairs)
    
    max_score = scores[0]
    threshold = max_score * 0.9
    top_count = len([score for score in scores if score >= threshold])

    # Classification logic based on score thresholds
    if max_score < 0.3:
        return ["misc"]
    elif top_count == 1:
        return [labels[0]]
    elif top_count == 2 and multi_label:
        return [labels[0], labels[1]]
    elif top_count == 3 and multi_label:
        return [labels[0], labels[1], labels[2]]
    else:
        return ["uncertain"]

# Define labels for different categories
purpose_labels = ["lecture or academic course", "hacks", "conference", "tutorial or DIY",
                  "interview or Q&A or review", "kids content", "entertaining explanation or science popularization",
                  "documentary"]
level_labels = ["beginner", "intermediate", "advanced"]
content_labels = ["science or technology", "music or art", "photography or videography or filmmaking", "gaming",
                  "chess or puzzles or logic", "religion or spirituality", "philosophy or ethics", "history or politics",
                  "economics or business", "financial education", "cryptocurrency", "food or cooking", "sport",
                  "health or medicine", "travel", "motivational or personal development", "home repair or renovation",
                  "beauty or fashion", "programming tools or coding", "foreign language", "sociology or culture",
                  "psychology", "climate or environment", "wildlife or animals or nature"]

# Classify each batch row by row
def classify_row_batch(batch, classifier, purpose_labels, level_labels, content_labels):
    texts = [f"{title} {tags.replace(',', ', ')}" for title, tags in zip(batch['title'], batch['tags'])]
    
    purpose_results = classifier(texts, purpose_labels, multi_label=True)
    level_results = classifier(texts, level_labels, multi_label=False)
    content_results = classifier(texts, content_labels, multi_label=True)

    # Store results
    batch['purpose'] = [res['labels'] for res in purpose_results]
    batch['level'] = [res['labels'][0] for res in level_results]  # single label
    batch['content'] = [res['labels'] for res in content_results]
    
    return batch

# Full classification process with only selected columns
def classify_data(data, classifier, batch_size=32):
    print("Converting to Dataset...")
    
    # Select only 'title' and 'tags' columns and convert to Dataset
    dataset = Dataset.from_pandas(data[['title', 'tags']])
    print("Processing in batches...")
    
    # Apply classification in batches
    dataset = dataset.map(
        lambda batch: classify_row_batch(batch, classifier, purpose_labels, level_labels, content_labels),
        batched=True,
        batch_size=batch_size
    )
    
    print("Converting back to DataFrame...")
    return dataset.to_pandas()

# Load and preprocess data
file_path = "data/Education_videos_7.csv"
df_education = load_metadata_videos(file_path)

# Initialize classifier pipeline with device specified
device = 0 if torch.cuda.is_available() else -1  # Ensure GPU use if available
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Run classification
df_education = classify_data(df_education, classifier, batch_size=32)

# Save the classified data
output_file_path = "data/Education_videos_7_BART.csv"
df_education.to_csv(output_file_path, index=False)
print(f"Data saved to {output_file_path}.")


Converting to Dataset...


ArrowTypeError: ('Did not pass numpy.dtype object', 'Conversion failed for column None with type int64')

In [17]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import pipeline
from datasets import Dataset

# Load only the necessary columns ('title' and 'tags') and ensure they are strings
def load_metadata_videos(file_path):
    df = pd.read_csv(file_path, usecols=['title', 'tags']).dropna()
    df['title'] = df['title'].astype(str)  # Convert to string to avoid type issues
    df['tags'] = df['tags'].astype(str)    # Convert to string to avoid type issues
    return df

# Define labels for different categories
purpose_labels = ["lecture or academic course", "hacks", "conference", "tutorial or DIY",
                  "interview or Q&A or review", "kids content", "entertaining explanation or science popularization",
                  "documentary"]
level_labels = ["beginner", "intermediate", "advanced"]
content_labels = ["science or technology", "music or art", "photography or videography or filmmaking", "gaming",
                  "chess or puzzles or logic", "religion or spirituality", "philosophy or ethics", "history or politics",
                  "economics or business", "financial education", "cryptocurrency", "food or cooking", "sport",
                  "health or medicine", "travel", "motivational or personal development", "home repair or renovation",
                  "beauty or fashion", "programming tools or coding", "foreign language", "sociology or culture",
                  "psychology", "climate or environment", "wildlife or animals or nature"]

# Classify each row in batches
def classify_row_batch(batch, classifier, purpose_labels, level_labels, content_labels):
    texts = [f"{title} {tags.replace(',', ', ')}" for title, tags in zip(batch['title'], batch['tags'])]
    
    purpose_results = classifier(texts, purpose_labels, multi_label=True)
    level_results = classifier(texts, level_labels, multi_label=False)
    content_results = classifier(texts, content_labels, multi_label=True)

    # Store results
    batch['purpose'] = [res['labels'] for res in purpose_results]
    batch['level'] = [res['labels'][0] for res in level_results]  # single label
    batch['content'] = [res['labels'] for res in content_results]
    
    return batch

# Full classification process with only selected columns
def classify_data(data, classifier, batch_size=32):
    print("Converting to Dataset...")
    
    # Select only 'title' and 'tags' columns and convert to Dataset
    dataset = Dataset.from_pandas(data[['title', 'tags']])
    print("Processing in batches...")
    
    # Apply classification in batches
    dataset = dataset.map(
        lambda batch: classify_row_batch(batch, classifier, purpose_labels, level_labels, content_labels),
        batched=True,
        batch_size=batch_size
    )
    
    print("Converting back to DataFrame...")
    return dataset.to_pandas()

# Load and preprocess data
file_path = "data/Education_videos_7.csv"
df_education = load_metadata_videos(file_path)

# Initialize classifier pipeline with device specified
device = 0 if torch.cuda.is_available() else -1  # Ensure GPU use if available
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Run classification
df_education = classify_data(df_education, classifier, batch_size=32)

# Save the classified data
output_file_path = "data/Education_videos_7_BART.csv"
df_education.to_csv(output_file_path, index=False)
print(f"Data saved to {output_file_path}.")


Converting to Dataset...


ArrowTypeError: ('Did not pass numpy.dtype object', 'Conversion failed for column None with type int64')

In [18]:
import pandas as pd
from datasets import Dataset
from transformers import pipeline

# Read in the data
df_education = pd.read_csv("data/Education_videos_7.csv")

# Clean the dataset (remove unnecessary columns if needed)
df_education = df_education.drop(columns=['Unnamed: 0', 'duration'], errors='ignore')

# Ensure text columns are strings
df_education['title'] = df_education['title'].astype(str)
df_education['tags'] = df_education['tags'].astype(str)

# Convert the pandas DataFrame to a HuggingFace Dataset
dataset = Dataset.from_pandas(df_education)

# Instantiate the zero-shot classification model
device = 0  # Set to 0 if using GPU, or -1 for CPU
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Define label sets for classification
purpose_labels = ['education', 'entertainment', 'news', 'tutorial']
level_labels = ['beginner', 'intermediate', 'advanced']
content_labels = ['theory', 'practical', 'case_study', 'review']

# Function to classify a batch of data
def classify_batch(batch):
    # Combine the title and tags for classification
    combined_texts = [title + " " + tags for title, tags in zip(batch['title'], batch['tags'])]
    
    # Perform classification for each label in batch
    purpose_results = classifier(combined_texts, candidate_labels=purpose_labels)
    level_results = classifier(combined_texts, candidate_labels=level_labels)
    content_results = classifier(combined_texts, candidate_labels=content_labels)
    
    # Extract the top label for each classification
    batch['purpose'] = [result['labels'][0] for result in purpose_results]
    batch['level'] = [result['labels'][0] for result in level_results]
    batch['content'] = [result['labels'][0] for result in content_results]
    
    return batch

# Use the map function to apply classify_batch in parallel (default batch_size is large enough for parallel processing)
dataset = dataset.map(classify_batch, batched=True)

# Convert the dataset back to a pandas DataFrame
df_classified = dataset.to_pandas()

# Save the final result to a CSV file
output_file_path = "data/Education_videos_7_BART_map.csv"
df_classified.to_csv(output_file_path, index=False)

print(f"Batch classification complete. Results saved to {output_file_path}")


ArrowTypeError: ('Input object was not a NumPy array', 'Conversion failed for column categories with type object')

In [19]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import pipeline

# Read in the data
df_education = pd.read_csv("data/Education_videos_7.csv")

# Clean the dataset (remove unnecessary columns if needed)
df_education = df_education.drop(columns=['Unnamed: 0', 'duration'], errors='ignore')

# Ensure text columns are strings
df_education['title'] = df_education['title'].astype(str)
df_education['tags'] = df_education['tags'].astype(str)

# Ensure there are no complex or mixed data types
df_education['tags'] = df_education['tags'].apply(lambda x: str(x))

# Convert the pandas DataFrame to a HuggingFace Dataset
dataset = Dataset.from_pandas(df_education)

# Instantiate the zero-shot classification model
device = 0  # Set to 0 if using GPU, or -1 for CPU
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Define label sets for classification
purpose_labels = ['education', 'entertainment', 'news', 'tutorial']
level_labels = ['beginner', 'intermediate', 'advanced']
content_labels = ['theory', 'practical', 'case_study', 'review']

# Function to classify a batch of data
def classify_batch(batch):
    # Combine the title and tags for classification
    combined_texts = [title + " " + tags for title, tags in zip(batch['title'], batch['tags'])]
    
    # Perform classification for each label in batch
    purpose_results = classifier(combined_texts, candidate_labels=purpose_labels)
    level_results = classifier(combined_texts, candidate_labels=level_labels)
    content_results = classifier(combined_texts, candidate_labels=content_labels)
    
    # Extract the top label for each classification
    batch['purpose'] = [result['labels'][0] for result in purpose_results]
    batch['level'] = [result['labels'][0] for result in level_results]
    batch['content'] = [result['labels'][0] for result in content_results]
    
    return batch

# Use the map function to apply classify_batch in parallel (default batch_size is large enough for parallel processing)
dataset = dataset.map(classify_batch, batched=True)

# Convert the dataset back to a pandas DataFrame
df_classified = dataset.to_pandas()

# Save the final result to a CSV file
output_file_path = "data/Education_videos_7_BART_map.csv"
df_classified.to_csv(output_file_path, index=False)

print(f"Batch classification complete. Results saved to {output_file_path}")


ArrowTypeError: ('Input object was not a NumPy array', 'Conversion failed for column categories with type object')