In [None]:
# Install the Hugging Face `datasets` package into the environment of the running kernel.
# The explicit `%pip` magic is used instead of a bare `pip ...` line, which only works
# while IPython's automagic is enabled and no variable named `pip` exists.
%pip install datasets

In [None]:
from datasets import load_dataset

# Identifier handed to `load_dataset`; also echoed in the confirmation message below.
dataset_name = "The-data-company/TikTok-10M"

# `dataset` is indexed by split name later in the notebook (e.g. dataset['train']).
dataset = load_dataset(path=dataset_name)

# One call, two lines of output: the confirmation message, then the dataset summary.
print(f"Dataset '{dataset_name}' loaded successfully", dataset, sep="\n")

In [None]:
import re

# 1. EXCLUSION KEYWORDS (The "Anti-List")
# A description containing ANY of these (as a whole word/phrase, case-insensitive)
# is rejected outright, regardless of how many medical keywords it also contains.
exclusion_keywords = [
    # --- Category: Tech/Gaming ---
    'computer', 'software', 'gaming', 'gameplay', 'fortnite', 'tech', 'glitch',

    # --- Category: Beauty/Fashion/Salon/MedSpa (Added MedSpa terms) ---
    'makeup', 'lipstick', 'fashion', 'outfit', 'dress', 'salon', 'stylist',
    'barber', 'haircut', 'locs', 'braids', 'hair transformation', 'hairtok',
    'lash tech', 'nails', 'nail art', 'haircare', 'scalp care',
    'medspa', 'facial', 'esthetician', 'aesthetics', 'pamper', 'botox', 'filler',

    # --- Category: Spam/Merch/Keyword Stuffing (Added for Entry 164) ---
    'link in bio', 'link in my bio', 'small business', 'smallbusiness',
    'shop my', 'merch', 'discount code', 'use code', 'sticker', 'hello kitty',
    'growmyaccount', 'grow my account', 'follow back', 'check bio',

    # --- Category: Violence/Politics/Crime (Expanded for Entry 199) ---
    'weapon', 'gun', 'military', 'vote', 'politics',
    'murder', 'crime', 'arrested', 'charged', 'prison', 'police', 'cops',
    'lawsuit', 'true crime', 'serial killer', 'courtroom', 'guilty plea',
    'sentenced', 'misdemeanor',

    # --- Category: Animals ---
    'veterinarian', 'vet nurse', 'veterinary',
    'dog', 'cat', 'puppy', 'kitten', 'animal',
    'turtle', 'wildlife', 'zoo', 'aquarium', 'chimp', 'monkey',
    'feline', 'canine', 'pet', 'meow', 'bark',
    'savannah cat', 'wildcat', 'dogtok', 'cattok', 'fostercat',

    # --- Category: TV/Entertainment/ASMR/Party (Added Halloween/Fantasy) ---
    'meme', 'johnny sins', 'funny video', 'comedy', 'prank', 'pov',
    'relatable', 'blowthisup', 'trend',
    'netflix', 'series', 'tv show', 'actor', 'actress', 'drama',
    'asmr', 'satisfying', 'does anybody else',
    'halloween', 'jello shot', 'party', 'harry potter', 'muggle', 'hogwarts', # Entry 113
    'spongemike', 'spongebob', 'ice spice',

    # --- Category: Food Jokes (Stop "This coffee is diabetes" jokes) ---
    'iced coffee', 'starbucks', 'dunkin', 'chickfila', 'chick-fil-a',
    'milkshake', 'frappuccino', 'whipped cream',

    # --- TV SHOWS ---
    'greys anatomy', 'greysanatomy', 'grey\'s anatomy', 'meredith grey', 'shonda rhimes',
    'greysabc', 'housemd', 'house md', 'drhouse', 'dr house', 'gregory house', 'hugh laurie',

    # --- Category: Alternative Medicine / Social ---
    'baby name', 'babyname', 'name reveal', 'pronounce this name',
    'chiropractor', 'chiropractic', 'chiro',
    'horoscope', 'zodiac'
]

# 2. STRONG KEYWORDS (Score = 3)
# Any one of these is enough on its own to clear the score threshold.
strong_keywords = [
    'cardiology', 'oncology', 'pediatrics', 'dermatology', 'radiology',
    'prescription', 'symptoms', 'vaccine', 'epidemic',
    'pathology', 'clinical', 'dosage', 'surgeon', 'diabetes', 'cancer',
    'melanoma', 'biopsy', 'anatomy', 'physiology', 'resident', 'medschool',
    'tongue tie', 'procedure', 'physical therapy', 'psychotherapy',
    'respiratory', 'infection', 'disease', 'triage', 'medical history',
    'pelvic floor', 'obgyn', 'gestational diabetes', 'miscarriage',
    'pcos', 'endometriosis', 'infertility', 'meningitis', 'sepsis',
    'hyperkalemia', 'potassium' # Added from Entry 160
]

# 3. WEAK KEYWORDS (Score = 1)
# Each DISTINCT weak keyword found contributes one point.
weak_keywords = [
    'doctor', 'nurse', 'medicine', 'hospital', 'health', 'med',
    'care', 'sick', 'pain', 'recovery', 'checkup', 'patient',
    'scrubs', 'white coat', 'treatment', 'therapist', 'exam',
    'insurance', 'copay', 'diagnosis'
]


def compile_keyword_pattern(keywords):
    """Compile one case-insensitive regex that matches any of `keywords`.

    Every keyword is escaped so it is matched literally, and the alternation is
    anchored with a word-boundary assertion on each side so that e.g. 'dog' does
    not match inside 'dogma'. The alternatives sit in a single capturing group,
    so `findall` yields the matched keyword text.

    Args:
        keywords: Iterable of keyword/phrase strings.

    Returns:
        A compiled `re.Pattern`. If `keywords` contains no non-empty string, the
        returned pattern never matches anything.
    """
    # An empty alternative would make the group match the empty string, i.e. the
    # pattern would "hit" at every word boundary of any text; drop such entries.
    escaped = [re.escape(word) for word in keywords if word]
    if not escaped:
        # Empty negative lookahead: fails at every position.
        return re.compile(r'(?!)')
    return re.compile(r'\b(' + '|'.join(escaped) + r')\b', re.IGNORECASE)


# Compile each keyword list once so the per-row filter only runs pre-built patterns.
excl_pattern = compile_keyword_pattern(exclusion_keywords)
strong_pattern = compile_keyword_pattern(strong_keywords)
weak_pattern = compile_keyword_pattern(weak_keywords)

def medical_score_filter(text):
    """Decide whether a description looks like medical content.

    Rules, applied in order:
      * anything that is not a `str` is rejected;
      * any hit of `excl_pattern` rejects the text immediately;
      * otherwise a score is built: 3 points if `strong_pattern` hits at least
        once, plus 1 point per distinct (lower-cased) `weak_pattern` hit.

    Returns:
        True when the score is at least 2, False otherwise.
    """
    if isinstance(text, str) and excl_pattern.search(text) is None:
        # Strong keywords are all-or-nothing: repeated hits do not add more points.
        strong_points = 3 if strong_pattern.search(text) else 0

        # Lower-case before matching so "Doctor" and "doctor" count as one keyword.
        distinct_weak_hits = {hit for hit in weak_pattern.findall(text.lower())}

        return strong_points + len(distinct_weak_hits) >= 2

    return False

# Keep only the training rows whose description passes the keyword score filter.
# `input_columns` makes `filter` pass the `desc` value straight to the predicate as
# a positional argument, instead of building a dict of every column for each row
# just to read one field from it.
medical_content_dataset = dataset['train'].filter(
    medical_score_filter,
    input_columns=['desc'],
)

# Row count after keyword filtering (duplicates are removed in a later cell).
print(f"Final Dataset Size: {len(medical_content_dataset)}")

In [None]:
import hashlib

def deduplicate_dataset(dataset, column="desc"):
    """
    Drop rows whose text in `column` duplicates that of an earlier row.

    Two values count as duplicates when they are equal after stripping leading and
    trailing whitespace and lower-casing, so " Hello " and "hello" collapse into a
    single row. The first occurrence is kept and row order is preserved.

    `None` is treated as the empty string. Any other non-string value is compared
    through its `str()` form rather than raising on `.strip()`.

    Only the raw 32-byte SHA-256 digest of each distinct text is remembered, not
    the text itself (and not the twice-as-long hex form of the digest).

    Args:
        dataset: Object supporting `len()`, iteration over rows that can be indexed
            by column name, and `select(indices)`.
        column: Name of the field to compare. Defaults to "desc".

    Returns:
        The result of `dataset.select(...)` called with the indices of the
        first occurrence of each distinct text.
    """
    seen_digests = set()
    unique_indices = []

    print(f"Starting deduplication on {len(dataset)} rows...")

    for i, example in enumerate(dataset):
        text = example[column]
        if text is None:
            text = ""
        elif not isinstance(text, str):
            # e.g. a numeric or NaN cell: compare by its textual form instead of crashing.
            text = str(text)

        # Normalise so differences in surrounding whitespace or letter case are ignored.
        normalized = text.strip().lower()
        digest = hashlib.sha256(normalized.encode('utf-8')).digest()

        if digest not in seen_digests:
            seen_digests.add(digest)
            unique_indices.append(i)

    deduped_dataset = dataset.select(unique_indices)

    print(f"Removed {len(dataset) - len(deduped_dataset)} duplicates.")
    print(f"Final unique count: {len(deduped_dataset)}")

    return deduped_dataset

# Collapse rows of the keyword-filtered set that share the same description.
final_dataset = deduplicate_dataset(dataset=medical_content_dataset)

# Report how many rows survived deduplication.
print("Dataset", final_dataset.num_rows)

In [None]:
# Write the deduplicated rows to a CSV file in the current working directory.
final_dataset.to_csv(path_or_buf="tiktok_medical_dataset.csv", index=False)

In [None]:
# Minimum number of comments a row needs to be kept in the high-engagement subset.
MIN_COMMENT_COUNT = 1000


def has_enough_comments(comment_count):
    """Return True when `comment_count` is present and at least MIN_COMMENT_COUNT.

    A missing count (None) fails the threshold instead of raising TypeError on the
    `None >= int` comparison.
    """
    return comment_count is not None and comment_count >= MIN_COMMENT_COUNT


# `input_columns` passes only the `comment_count` value of each row to the predicate.
final_dataset_2 = final_dataset.filter(has_enough_comments, input_columns=['comment_count'])
final_dataset_2.num_rows

In [None]:
# Write the rows that passed the comment-count filter to their own CSV file.
final_dataset_2.to_csv(path_or_buf="tiktok_medical_dataset_with_1000+_comments.csv", index=False)