In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import string

In [None]:
# Number of synthetic rows the notebook generates.
num_rows = 200_000

# The notebook draws from two independent generators -- the stdlib `random`
# module and NumPy's global RNG -- so both are seeded to keep reruns
# reproducible.
random.seed(42)
np.random.seed(42)

In [None]:
# Lookup pools that the categorical columns are sampled from.

# Content category assigned to each video.
categories = [
    'Music',
    'Education',
    'Gaming',
    'Technology',
    'News',
    'Entertainment',
    'Sports',
    'Film',
    'Comedy',
]

# How a copyright flag was detected, and which kind of match it was.
methods = [
    'Automated (Content ID)',
    'Manual',
    'User Report',
]
match_types = [
    'audio',
    'video',
    'audio+video',
    'lyrics',
    'metadata',
]

# Enforcement and dispute vocabulary.
# NOTE(review): 'None' in `actions` and `disputes` is a string label, while
# `resolutions` ends with the Python None object. After a CSV round trip the
# two may become indistinguishable (pandas.read_csv's default NA markers
# appear to include the text "None") -- confirm this is intended.
actions = [
    'Blocked',
    'Muted (audio)',
    'Monetized by claimant',
    'Takedown',
    'Age-Restricted',
    'Warning',
    'None',
]
disputes = [
    'None',
    'Pending',
    'Filed by uploader',
    'Resolved - in favor of claimant',
    'Resolved - in favor of uploader',
]
resolutions = [
    'Claim upheld',
    'Claim withdrawn',
    'Settlement',
    'Uploader win',
    'Manual override',
    None,
]

# Kind of entity making the claim.
claimant_types = [
    'Label',
    'Publisher',
    'Individual',
    'MCN',
    'Platform',
]

# States a row can be attributed to.
indian_states = [
    'Maharashtra',
    'Karnataka',
    'Tamil Nadu',
    'Uttar Pradesh',
    'Delhi',
    'Gujarat',
    'West Bengal',
    'Madhya Pradesh',
    'Rajasthan',
    'Bihar',
    'Telangana',
    'Haryana',
    'Kerala',
    'Punjab',
    'Odisha',
]

In [None]:
def random_video_id(length=11):
    """Return a random YouTube-like video ID.

    Characters are drawn one at a time with ``random.choice`` from ASCII
    letters, digits, ``_`` and ``-``, so the result is reproducible once the
    ``random`` module has been seeded.

    Args:
        length: Number of characters to generate. Defaults to 11, so existing
            zero-argument calls produce exactly what they did before.

    Returns:
        A string of exactly ``length`` characters.

    Raises:
        ValueError: If ``length`` is negative.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")
    chars = string.ascii_letters + string.digits + "_-"
    return ''.join(random.choice(chars) for _ in range(length))

In [None]:
def random_video_title():
    """Build a synthetic title shaped like '<adjective> <topic> <number>'.

    Draws, in this order, from the ``random`` module: a topic, an adjective,
    and an integer between 1 and 999 inclusive.
    """
    subject_pool = (
        "Music Mix", "Gaming Stream", "Tutorial", "Movie Scene", "News Update",
        "Vlog", "Comedy Skit", "Tech Review", "Live Performance", "Motivational Talk",
    )
    descriptor_pool = (
        "Epic", "Funny", "Best", "Top", "Latest", "Crazy", "Exclusive", "Viral",
    )

    # The draw order is kept fixed so a seeded run yields the same titles.
    subject = random.choice(subject_pool)
    descriptor = random.choice(descriptor_pool)
    serial = random.randint(1, 999)
    return " ".join((descriptor, subject, str(serial)))

In [None]:
# Small private helpers for the column generators below. Each one produces
# `num_rows` values (or, for `_date_in_2024`, a single value).

def _prefixed_ids(prefix, low, high):
    """Return num_rows strings: `prefix` followed by random.randint(low, high)."""
    return [f"{prefix}{random.randint(low, high)}" for _ in range(num_rows)]


def _date_in_2024():
    """Return 2024-01-01 plus a day offset drawn with random.randint(0, 364)."""
    return datetime(2024, 1, 1) + timedelta(days=random.randint(0, 364))


def _pick(options, weights=None):
    """Sample num_rows entries from `options` via np.random.choice (optional probabilities)."""
    return np.random.choice(options, size=num_rows, p=weights)


def _ints(low, high):
    """Return num_rows integers from np.random.randint(low, high); `high` is exclusive."""
    return np.random.randint(low, high, size=num_rows)


def _rounded_uniform(low, high, decimals):
    """Return num_rows np.random.uniform(low, high) draws rounded to `decimals` places."""
    return np.round(np.random.uniform(low, high, size=num_rows), decimals)


# Columns are listed in the order they are generated. That order fixes the
# sequence of draws from each RNG, so keep it stable to reproduce the same
# dataset for a given seed.
df = pd.DataFrame({
    # --- video / channel metadata ---
    "video_id": [random_video_id() for _ in range(num_rows)],
    "video_title": [random_video_title() for _ in range(num_rows)],
    "channel_id": _prefixed_ids("CH_", 1000, 9999),
    "channel_name": _prefixed_ids("Channel_", 1000, 9999),
    "upload_date": [_date_in_2024() for _ in range(num_rows)],
    "video_duration_seconds": _ints(30, 3600),
    "views": _ints(100, 5000000),
    "likes": _ints(0, 100000),
    "comments_count": _ints(0, 5000),
    "content_category": _pick(categories),
    # --- detection ---
    "detected_copyright_flag": _pick([True, False], [0.4, 0.6]),
    "flag_detection_method": _pick(methods),
    "match_type": _pick(match_types),
    # --- claimant ---
    "claimant_id": _prefixed_ids("CLM_", 10000, 99999),
    "claimant_name": _prefixed_ids("Claimant_", 100, 999),
    "claimant_type": _pick(claimant_types),
    # A claim date is present only when random.random() falls below 0.4; it is
    # drawn independently of the other columns.
    "claim_date": [
        _date_in_2024() if random.random() < 0.4 else None
        for _ in range(num_rows)
    ],
    # --- enforcement / dispute ---
    "action_taken": _pick(actions),
    "enforcement_scope": _pick(['State-specific', 'National'], [0.6, 0.4]),
    "state_affected": _pick(indian_states),
    "notice_id": _prefixed_ids("NOTICE_", 100000, 999999),
    "dispute_status": _pick(disputes),
    "resolution_outcome": _pick(resolutions),
    "strike_count_on_channel": _ints(0, 3),
    "repeat_infringer": _pick([True, False], [0.1, 0.9]),
    # --- scores / metrics ---
    "similarity_score": _rounded_uniform(0.5, 1.0, 2),
    "detection_confidence": _ints(50, 100),
    "resolution_time_days": _rounded_uniform(1, 60, 1),
    "estimated_revenue_impact_usd": _rounded_uniform(0, 500, 2),
    "manual_review_required": _pick([True, False], [0.3, 0.7]),
})

In [None]:
# Write the generated frame to CSV (index column omitted) and confirm the path.
output_file = "youtube_copyright_india_2024.csv"
df.to_csv(output_file, index=False)
print(f"✅ Dataset generated successfully: {output_file}")

# Preview the first rows. Leaving the frame as the cell's final expression
# lets the notebook render it as a table instead of print()'s wrapped
# plain-text dump.
df.head()

✅ Dataset generated successfully: youtube_copyright_india_2024.csv
      video_id                  video_title channel_id  channel_name  \
0  odJFCrnl2ed  Latest Live Performance 630    CH_6199  Channel_6595   
1  lBDdz1C5Jau      Crazy Gaming Stream 119    CH_7195  Channel_9140   
2  2RJtBRnlWmT      Crazy Gaming Stream 504    CH_3072  Channel_2094   
3  SHf6pWkLUyi               Funny Vlog 521    CH_4453  Channel_6900   
4  fDLkDmWJ6Uu    Best Live Performance 431    CH_2733  Channel_8012   

  upload_date  video_duration_seconds    views  likes  comments_count  \
0  2024-07-07                    3204  3481360  56547            4672   
1  2024-09-15                    3537  4682686  82839            1085   
2  2024-11-12                     890  1182519  67219            1627   
3  2024-02-11                    1324   391026  94084            1064   
4  2024-01-30                    1160  1567379  94226             744   

  content_category  ...      notice_id                   disp

In [None]:
# Offer the generated CSV as a browser download when running on Google Colab.
# The import stays in this cell because `google.colab` is only importable
# inside a Colab runtime; elsewhere the import fails, so report where the file
# is instead of aborting a full notebook run.
try:
    from google.colab import files
except ImportError:
    print(f"Not running on Google Colab; {output_file} was saved to the working directory.")
else:
    # Reuse the path defined by the export cell rather than repeating the literal.
    files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>