In [1]:
import os
import re

import numpy as np
import pandas as pd
from transformers import pipeline

# Gzipped JSON Lines metadata file read by the loading cells of this notebook.
DATA_FILE = "data/yt_metadata.jsonl.gz"
# Local directory for the zero-shot classification model files.
MODEL_PATH = "src/models/zero-shot-classification"

In [2]:
def process_data(file_path, chunk_size, preprocess_func, output_path):
    """
    Stream a gzipped JSONL file in chunks, preprocess each chunk and write the
    results to a single JSONL output file.

    Args:
        file_path (str): Path to the gzipped JSONL file.
        chunk_size (int): Number of rows to process per chunk.
        preprocess_func (callable): Function applied to each chunk; it receives a
            Pandas DataFrame and must return a Pandas DataFrame.
        output_path (str): Path of the JSONL file to write. An existing file is
            overwritten.

    Returns:
        None
    """
    with pd.read_json(
        file_path,
        lines=True,
        compression="gzip",
        chunksize=chunk_size,
    ) as reader, open(output_path, "w", encoding="utf-8") as outfile:
        for chunk_df in reader:
            # Apply preprocessing function to the chunk
            processed_df = preprocess_func(chunk_df)
            # Skip chunks that were filtered down to nothing so that no blank
            # record lines end up in the output (pandas may emit a bare newline
            # when serialising an empty frame with lines=True).
            if processed_df.empty:
                continue
            # Append the processed chunk to the output file
            processed_df.to_json(outfile, orient="records", lines=True)

In [3]:
# Collaboration indicators: "feat", "ft", "featuring", a standalone "x", or a
# literal "+". Compiled once here instead of on every call.
_COLLABORATION_PATTERN = re.compile(
    r'\bfeat\b|\bft\b|\bfeaturing\b|\bx\b|\+',
    flags=re.IGNORECASE,
)


def detect_collaboration(text):
    """
    Detect if the text indicates a collaboration.

    The check is a case-insensitive search for any of the collaboration
    indicators anywhere in the text.

    Args:
        text (str): Text to analyze. Non-string values (e.g. None or NaN from
            a missing cell) are treated as containing no indicator.

    Returns:
        bool: True if collaboration is detected, False otherwise.
    """
    # Missing cells come through as None/NaN; searching them would raise TypeError.
    if not isinstance(text, str):
        return False
    return _COLLABORATION_PATTERN.search(text) is not None

In [4]:
def preprocess_collaborations(chunk_df):
    """
    Reduce a chunk to complete Music/Entertainment rows and the columns of interest.

    Args:
        chunk_df (pd.DataFrame): Chunk of data

    Returns:
        pd.DataFrame: Rows without any missing value whose category is
            'Music' or 'Entertainment', restricted to the selected columns.
    """
    selected_columns = [
        'categories',
        'title',
        'description',
        'tags',
        'view_count',
        'like_count',
        'dislike_count',
        'channel_id',
    ]

    # Rows with a missing value in ANY column of the chunk are discarded first
    complete_rows = chunk_df.dropna(how='any')

    # Restrict to the two categories of interest and the selected columns
    in_scope = complete_rows['categories'].isin(['Music', 'Entertainment'])
    result = complete_rows.loc[in_scope, selected_columns]

    # NOTE: a collaboration filter is currently disabled:
    # result = result[result['tags'].apply(detect_collaboration)]

    return result

In [5]:
# Load only the first 100000 records of the metadata file to try the preprocessing on
test_data = pd.read_json(
    DATA_FILE,
    lines=True,
    compression="gzip",
    nrows=100000,
)

In [6]:
# Run the preprocessing on the loaded sample
processed_df = preprocess_collaborations(test_data)

# Preview up to 50 random rows. The sample size is capped at the number of
# available rows (sampling more rows than exist raises ValueError) and the
# seed is fixed so the preview is the same on every run.
processed_df[['title', 'categories', 'description', 'tags']].sample(
    n=min(50, len(processed_df)),
    random_state=42,
)

Unnamed: 0,title,categories,description,tags
40243,Tina y Tin cumple Camila🌸 👩🏾 (Canciones Infan...,Music,Feliz cumpleaños Calvin 😊\n🎁 ►¡De todo para lo...,"Tina y Tin cumple Camila,Tina y Tin,cumple,Cam..."
98785,Ranveer Singh HILARIOUS On Screen Kissing Demo...,Entertainment,In the second part of their Bollywood Hungama ...,"ranveer singh,vaani kapoor,befikre trailer,Bef..."
92324,UNCUT | Super Fight League Season 2 Launch | T...,Entertainment,The launch of Super fight league season 2 happ...,"bollywood hungama interviews,exclusive intervi..."
88311,Red carpet event of awards at Jagran film fest...,Entertainment,"Also, do not forget to subscribe to Bollywood ...","bollywood hungama interviews,exclusive intervi..."
35217,ED SHEERAN - THE A TEAM - Easy Piano Tutorial,Music,Sheet music: https://mnot.es/2PuTy2u\nSUBSCRIB...,"Easy Piano,Piano Tutorial,Ed Sheeran The A Tea..."
40257,tina y tin + lorea (Personalized Songs For Kid...,Music,Escucha el album completo de tina y tin + lore...,"tina,tin,lorea"
16299,The Script - You Won't Feel A Thing - live,Entertainment,"The Script, concert at the Ziggo Dome in Amste...",M4H03939
86029,Star Studded Red Carpet of Times Auto Awards 2...,Entertainment,"Also, do not forget to subscribe to Bollywood ...","bollywood hungama interviews,exclusive intervi..."
13289,Mezco Toyz One:12 Collective Deluxe Joker A.C....,Entertainment,Order your Mezco Popeye figure here:\nhttp://b...,"A.C.B.A.,Articulated Comic Book Art,Art,Toys,A..."
99651,Remo D'Souza | Tiger Shroff | A Flying Jatt |G...,Entertainment,"Remo D'Souza, Tiger Shroff, COO Of Hungama Dig...","a flying jatt,tiger shroff,a flying jatt trail..."


In [7]:
# Directory holding the saved pipeline; kept under the lower-case name as well
# so it stays in sync with the MODEL_PATH constant.
model_path = MODEL_PATH

if os.path.isfile(os.path.join(model_path, "config.json")):
    # A copy was saved by an earlier run: load it instead of downloading again
    classifier = pipeline("zero-shot-classification", model=model_path)
else:
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

    # Save the model to disk
    classifier.save_pretrained(model_path)

In [8]:
def is_significant(collab_score, non_collab_score, threshold=0.1):
    """
    Tell whether two scores are far enough apart to be considered significant.

    Args:
        collab_score (float): Score for the collaboration label.
        non_collab_score (float): Score for the non-collaboration label.
        threshold (float): The absolute difference must be strictly greater
            than this value for the scores to count as significant.

    Returns:
        bool: True if the scores are significant, False otherwise.
    """
    score_gap = abs(collab_score - non_collab_score)
    return score_gap > threshold

In [9]:
# Candidate labels for the zero-shot classifier, grouped by the side they count for
music_collab_labels = ['multiple artists']
music_non_collab_labels = ['a single artist']
music_labels = music_collab_labels + music_non_collab_labels

# Template the classifier fills with each candidate label
music_hypothesis_template = "This music is with {}."

texts = [
    "Radiohead live at Glastonbury 2003",
    "The Beatles - Hey Jude",
    "Best of Debussy",
    "HAYATO SUMINO – third round (18th Chopin Competition, Warsaw)",
    "LL COOL J - Murdergram Deux ft. Eminem",
    "Eminem - Love The Way You Lie ft. Rihanna",
]


def _scores_for(result, wanted_labels):
    """Return the scores in `result` whose label is one of `wanted_labels`."""
    label_score_pairs = zip(result['labels'], result['scores'])
    return np.array([score for label, score in label_score_pairs if label in wanted_labels])


for text in texts:
    result = classifier(
        text,
        candidate_labels=music_labels,
        hypothesis_template=music_hypothesis_template,
        multi_label=False,
    )

    # Total score of each side (collaboration vs. non-collaboration labels)
    collab_scores = _scores_for(result, music_collab_labels)
    non_collab_scores = _scores_for(result, music_non_collab_labels)
    total_collab_score = collab_scores.sum()
    total_non_collab_score = non_collab_scores.sum()

    # Texts whose two totals do not differ by more than 0.5 are skipped without output
    if not is_significant(total_collab_score, total_non_collab_score, threshold=0.5):
        continue

    # Decide the category from the larger of the two totals
    prediction = (
        "Collaborative"
        if total_collab_score > total_non_collab_score
        else "Non-Collaborative"
    )

    print(f"Text: {text}. "
          f"\n    - Predicted category: {prediction}, "
          f"\n    - collab_score: {total_collab_score}, "
          f"\n    - non_collab_score: {total_non_collab_score}")

Text: The Beatles - Hey Jude. 
    - Predicted category: Non-Collaborative, 
    - collab_score: 0.121126189827919, 
    - non_collab_score: 0.8788738250732422
Text: Best of Debussy. 
    - Predicted category: Non-Collaborative, 
    - collab_score: 0.06659115850925446, 
    - non_collab_score: 0.9334088563919067
Text: HAYATO SUMINO – third round (18th Chopin Competition, Warsaw). 
    - Predicted category: Non-Collaborative, 
    - collab_score: 0.17599695920944214, 
    - non_collab_score: 0.8240030407905579
Text: LL COOL J - Murdergram Deux ft. Eminem. 
    - Predicted category: Collaborative, 
    - collab_score: 0.9665012955665588, 
    - non_collab_score: 0.03349871188402176
Text: Eminem - Love The Way You Lie ft. Rihanna. 
    - Predicted category: Collaborative, 
    - collab_score: 0.9764209389686584, 
    - non_collab_score: 0.0235790703445673
