In [1]:
import pandas as pd
import re
from datetime import datetime
import sys

def analyze_video_quality(csv_file):
    """
    Analyze videos and score them based on quality indicators.
    Returns a scored and sorted DataFrame.
    """
    # Read the CSV
    df = pd.read_csv(csv_file)

    # Initialize quality score
    df['quality_score'] = 0
    df['score_breakdown'] = ''

    # 1. Title Quality (max 25 points)
    # Penalize clickbait indicators, reward descriptive titles
    def score_title(title):
        score = 15  # base score
        breakdown = []

        if pd.isna(title):
            return 0, "No title"

        title_lower = title.lower()

        # Penalize excessive punctuation/caps
        if title.count('!') > 2 or title.count('?') > 2:
            score -= 5
            breakdown.append("-5 excessive punctuation")

        # Penalize all caps words (clickbait indicator)
        caps_words = sum(1 for word in title.split() if word.isupper() and len(word) > 2)
        if caps_words > 2:
            score -= 5
            breakdown.append("-5 too many caps")

        # Reward reasonable length (40-80 chars is optimal)
        title_len = len(title)
        if 40 <= title_len <= 80:
            score += 10
            breakdown.append("+10 good length")
        elif title_len < 20:
            score -= 5
            breakdown.append("-5 too short")

        return score, '; '.join(breakdown)

    df['title_score'], df['title_notes'] = zip(*df['title'].apply(score_title))
    df['quality_score'] += df['title_score']

    # 2. Description Quality (max 25 points)
    def score_description(desc):
        score = 0
        breakdown = []

        if pd.isna(desc):
            return 0, "No description"

        desc_len = len(desc)

        # Reward substantial descriptions
        if desc_len > 200:
            score += 15
            breakdown.append("+15 detailed desc")
        elif desc_len > 100:
            score += 10
            breakdown.append("+10 decent desc")
        elif desc_len > 50:
            score += 5
            breakdown.append("+5 basic desc")

        # Check for hashtag spam
        hashtag_count = desc.count('#')
        if hashtag_count > 10:
            score -= 10
            breakdown.append("-10 hashtag spam")
        elif hashtag_count > 5:
            score -= 5
            breakdown.append("-5 many hashtags")

        # Reward informative content
        info_keywords = ['learn', 'guide', 'tutorial', 'explain', 'how to', 'tips']
        if any(keyword in desc.lower() for keyword in info_keywords):
            score += 10
            breakdown.append("+10 educational")

        return score, '; '.join(breakdown)

    df['desc_score'], df['desc_notes'] = zip(*df['description'].apply(score_description))
    df['quality_score'] += df['desc_score']

    # 3. Channel Quality (max 20 points)
    def score_channel(channel):
        score = 10  # base score
        breakdown = []

        if pd.isna(channel):
            return 0, "No channel"

        # Reward established naming patterns
        if len(channel) > 5:
            score += 10
            breakdown.append("+10 established channel")

        return score, '; '.join(breakdown)

    df['channel_score'], df['channel_notes'] = zip(*df['channelTitle'].apply(score_channel))
    df['quality_score'] += df['channel_score']

    # 4. Recency (max 15 points)
    def score_recency(publish_time):
        if pd.isna(publish_time):
            return 0, "No date"

        try:
            pub_date = pd.to_datetime(publish_time)
            now = datetime.now()
            days_old = (now - pub_date).days

            if days_old < 30:
                return 15, "+15 very recent"
            elif days_old < 90:
                return 12, "+12 recent"
            elif days_old < 180:
                return 8, "+8 somewhat recent"
            elif days_old < 365:
                return 5, "+5 within year"
            else:
                return 2, "+2 older content"
        except:
            return 0, "Invalid date"

    df['recency_score'], df['recency_notes'] = zip(*df['publishTime'].apply(score_recency))
    df['quality_score'] += df['recency_score']

    # 5. Language Consistency (max 15 points)
    def score_language(title, desc, channel):
        score = 15  # base score
        breakdown = []

        # Check if content is primarily in one language
        # This is a simple heuristic - mix of languages might indicate lower quality
        has_english = any(pd.notna(x) and bool(re.search('[a-zA-Z]', str(x)))
                         for x in [title, desc, channel])
        has_arabic = any(pd.notna(x) and bool(re.search('[\u0600-\u06FF]', str(x)))
                        for x in [title, desc, channel])
        has_other = any(pd.notna(x) and bool(re.search('[^\u0000-\u007F\u0600-\u06FF]', str(x)))
                       for x in [title, desc, channel])

        lang_count = sum([has_english, has_arabic, has_other])

        if lang_count > 1:
            score -= 5
            breakdown.append("-5 mixed languages")

        return score, '; '.join(breakdown) if breakdown else "+15 consistent"

    df['lang_score'], df['lang_notes'] = zip(*df.apply(
        lambda x: score_language(x['title'], x['description'], x['channelTitle']),
        axis=1
    ))
    df['quality_score'] += df['lang_score']

    # Combine all breakdown notes
    df['score_breakdown'] = (
        df['title_notes'] + ' | ' +
        df['desc_notes'] + ' | ' +
        df['channel_notes'] + ' | ' +
        df['recency_notes'] + ' | ' +
        df['lang_notes']
    )

    # Sort by quality score (highest first)
    df = df.sort_values('quality_score', ascending=False)

    return df

def display_results(df, top_n=10):
    """Display top N videos with their scores"""
    print(f"\n{'='*100}")
    print(f"TOP {top_n} HIGHEST QUALITY VIDEOS")
    print(f"{'='*100}\n")

    for idx, row in df.head(top_n).iterrows():
        print(f"Rank: {list(df.index).index(idx) + 1}")
        print(f"Quality Score: {row['quality_score']:.0f}/100")
        print(f"Title: {row['title'][:80]}...")
        print(f"Channel: {row['channelTitle']}")
        print(f"Published: {row['publishTime'][:10]}")
        print(f"URL: {row['url']}")
        print(f"Score Breakdown: {row['score_breakdown']}")
        print(f"{'-'*100}\n")

def save_results(df, output_file='video_analysis_results.csv'):
    """Save scored results to a new CSV"""
    # Select relevant columns
    output_cols = ['videoId', 'title', 'channelTitle', 'publishTime',
                   'url', 'searchTerm', 'quality_score', 'score_breakdown']

    df[output_cols].to_csv(output_file, index=False)
    print(f"\nResults saved to: {output_file}")

if __name__ == "__main__":
    # Usage
    csv_file = '/content/videos.csv'  # Change this to your file path

    try:
        print("Analyzing videos...")
        df_scored = analyze_video_quality(csv_file)

        # Display top 10 videos
        display_results(df_scored, top_n=100)

        # Save all results
        save_results(df_scored)

        # Display statistics
        print(f"\n{'='*100}")
        print("QUALITY DISTRIBUTION")
        print(f"{'='*100}")
        print(f"Average Score: {df_scored['quality_score'].mean():.1f}")
        print(f"Median Score: {df_scored['quality_score'].median():.1f}")
        print(f"Highest Score: {df_scored['quality_score'].max():.1f}")
        print(f"Lowest Score: {df_scored['quality_score'].min():.1f}")
        print(f"\nTotal Videos Analyzed: {len(df_scored)}")

    except FileNotFoundError:
        print(f"Error: Could not find '{csv_file}'")
        print("Please make sure the file exists in the current directory.")
    except Exception as e:
        print(f"Error: {e}")

Analyzing videos...

TOP 100 HIGHEST QUALITY VIDEOS

Rank: 1
Quality Score: 80/100
Title: Sunset River Reflection STEP by STEP Acrylic Painting Tutorial...
Channel: ColorByFeliks
Published: 2025-05-29
URL: https://www.youtube.com/watch?v=Yqvawojuwu4
Score Breakdown: +10 good length | +10 decent desc; +10 educational | +10 established channel | Invalid date | +15 consistent
----------------------------------------------------------------------------------------------------

Rank: 2
Quality Score: 80/100
Title: The mindset of a 3D Artist... How to not &quot;lose it&quot; ( LIVESTREAM )...
Channel: MH Tutorials
Published: 2021-05-05
URL: https://www.youtube.com/watch?v=n4wwCSJWjE4
Score Breakdown: +10 good length | +10 decent desc; +10 educational | +10 established channel | Invalid date | +15 consistent
----------------------------------------------------------------------------------------------------

Rank: 3
Quality Score: 80/100
Title: Palette knife OIL demo | 5 TIPS for finding your