In [None]:
# TODO catégories très grandes, histograme de quels mots ont le plus recruté de vidéos

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd 
import numpy as np 
import os 
import os.path as op 
import sys
from tqdm.notebook import tqdm

import re

sys.path.insert(0, '..') # project folder
from config import * # all paths to files and constants

In [2]:
edu = pd.DataFrame()
for i in range(N_BATCHES):
    batch = pd.read_csv(path_edu.format(i), index_col=0)
    edu = pd.concat([edu, batch])
edu = edu.rename(columns={'channel_id': 'channel', 'categories': 'vid_category'})
channels = pd.read_csv(path_channels, compression="infer", sep="\t")
channels = channels[['channel', 'name_cc', 'category_cc']]
edu = edu[['display_id', 'vid_category', 'channel', 'title', 'tags']]
edu = edu.merge(channels, on='channel', how='left')
#edu.head()

In [3]:
del channels, batch
edu['text'] = edu['title'].fillna('') + ' ' + edu['tags'].fillna('')
edu = edu.drop(columns=['title', 'tags'])

This notebook extracts the subcategories by keyword search using a human and bottom up approach. Since looking for single expressions is way faster than combinations we start by looking at very specific words in the biggest categories (eg. gaming). We proceed with other niche categories that are heavily represented in our observations. Then we group with more complex regular expressions but hopefully on a much smaller dataset. The order of classification is thus important, even crucial. We used personal knowledge, chatGPT, the web, youtube, twitch and copilot to curate the lists of expressions for the categories. Instead of applying complicated patterns with regexp it is preferable to do a big search and then sort out outliers (eg. search for 'python' then sort out all that contain 'animal' and 'snake'). After having run many specific keywords, run the general on the remaining data and try to identify what other keywords one could add.

# Pipeline for text processing

In [4]:
def clean_non_word(text): #punctuation, underscores seem to evade this regex so add it
        return  re.sub(r'[^\w\s]|_+', ' ', text)

def clean_non_ascii(text): # indian symbols that might still be left
        return  re.sub(r'[^\x00-\x7F]+', ' ', text)

In [5]:
edu['text'] = edu['text'].apply(clean_non_word).apply(clean_non_ascii)

In [None]:
""" would take 13h
import spacy

tqdm.pandas()
nlp = spacy.load("en_core_web_sm")
def lemmatize(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(lemmas)
edu['lemma'] = edu.text.progress_apply(lemmatize)"""

In [6]:
edu['category'] = 'unclass'

# Testing

In [None]:
# Sample for testing
df = edu.sample(frac=0.1, random_state=42, axis =0, ignore_index=True)
len(df)

In [12]:
keyword = 'audiobook\w*'
case = False
pattern = rf'{keyword}' # r"(?=.*\bpython\b)(?=.*\b(programming|program|code|coding)\b)"  
keyword = df[df.text.str.contains(pattern, case = case, na ='', regex = True)]

In [None]:
keyword.text.values

In [105]:
(keyword[['name_cc', 'text']]).to_csv(op.join('keys', 'python.csv'))

# Classsification

In [38]:
import time
def assign_category(keywords:list, case:bool = False, category:str = 'unclass', edu = edu, pre_b = True, post_b = True, bar_text ='', verbose = False):
    pre = ''; post = ''
    if pre_b:  pre = r'\b'
    if post_b: post = r'\b'

    keywords = [pre+k+r'\w*'+post for k in keywords]
    keywords = '|'.join(keywords)
    pattern = rf'{keywords}'
    
    start = time.time()
    if verbose: print(pattern)
    edu.loc[(edu.category == 'unclass') & edu.text.str.contains(pattern, case = case, na ='', regex = True) , 'category'] = category
    stop = time.time()
    print(f'Category {bar_text} now has {len(edu[edu.category == category])} videos, ellapsed time: {stop-start:.2f} s')

def detect_left(keywords, edu = edu, case:bool = False, pre_b = True, post_b = True, cat = 'unclass'):
    df = edu[edu.category == cat]
    pre = ''; post = ''
    if pre_b:  pre = r'\b'
    if post_b: post = r'\b' 
    keywords = [pre+k+r'\w*'+post for k in keywords]
    keywords = '|'.join(keywords)
    pattern = rf'{keywords}'
    df = df.loc[df.text.str.contains(pattern, case = case, na ='', regex = True)]
    return df
"""
for keyword in tqdm(keywords, total = len(keywords), desc = bar_text):
    pattern = rf'{keyword}'
    edu.loc[(edu.category == 'unclass') & edu.text.str.contains(pattern, case = case, na ='', regex = True) , 'category'] = category
print(f'This category now has {len(edu[edu.category == category])} videos')
"""

"\nfor keyword in tqdm(keywords, total = len(keywords), desc = bar_text):\n    pattern = rf'{keyword}'\n    edu.loc[(edu.category == 'unclass') & edu.text.str.contains(pattern, case = case, na ='', regex = True) , 'category'] = category\nprint(f'This category now has {len(edu[edu.category == category])} videos')\n"

In [None]:
regexp_dict = {
    '4': r'\bcoding\b|\bprogramming\b|\bpython\b|\bhtml\b|\bjavascript\b|\bweb development\b|\bweb design',
    '9': r'\bgaming|\besports\b',
    '91': r'roblox',
    '92' : r'minecraft',
}

In [None]:
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']

In [None]:
# Check classification
key = '91'
edu[edu.category == key].sample(10).text.values

In [37]:
edu.category.value_counts() 

category
unclass    3000980
8           232214
20          123783
86          104118
4            81481
q            54677
87           43236
7            39087
49           31709
android      21923
79           14804
81            8848
9             7350
a             4781
72            3840
92            3542
82            3095
41            2314
96            1774
74            1772
98            1674
71            1628
73            1308
91            1266
97            1246
83            1215
94             637
95             569
85             313
75             308
84              72
Name: count, dtype: int64

In [None]:
# reset a category
# edu.loc[edu.category == '20', 'category'] = 'Ole'

### TODO before others

In [8]:
kids = [r"nursery rhyme",r"sing.along",r"Twinkle Twinkle",r"Row Row Row Your Boat",r"Old MacDonald Had a Farm",r"Baa Baa Black Sheep",r"The Wheels on the Bus",r"If You.re Happy and You Know It",r"Itsy Bitsy Spider",r"Five Little Monkey",r"Baby Shark",r"Johnny Johnny Yes Papa",r"London Bridge is Falling Down",r"Head Shoulders Knees",
    r"learn color",r"learn shapes",r"learn numbers",r"alphabet songs",r"learn ABC",r"counting songs",r"preschool learning",r"early learning",r"kindergarten",r"how to count",r"learning to read",
    r"Cocomelon",r"Peppa Pig",r"Blippi",r"Baby Shark",r"Paw Patrol",r"Bluey",r"Dora the Explorer",r"Thomas and Friend",r"Masha and the Bear",r"Barney and Friend",r"Teletubb",
    r"kids storytelling",r"family-friendly",r"playtime",r"DIY for kid",r"finger painting",r"pretend play",r"puppet show",
    r"learn animal",r"learn vehicle", r"learn the days",r"learn the months", 'learn animal']
tricky = [r"kid\w* educational song",r"kid\w* song", r"kid\w* dance",r"educational video\w* for kid",r"cartoon\w* for kid", r"bedtime stor\w*",r"kid\w* craft",
          r"kid\w* sing.along",r"kid\w* comedy",r"kid\w* yoga",r"kid\w* exercise",r"kid\w* party song",r"animal sound\w* for kid",r"chil\w+ song",r"bab\w+ song",r"lullab\w+"]
assign_category(kids, case = False, category = '20')
assign_category(tricky, case = False, category = '20')

Category  now has 101329 videos, ellapsed time: 271.19 s
Category  now has 116459 videos, ellapsed time: 101.88 s


In [None]:
conspiracy = ['conspiracy', 'Qanon', "hoax", "false flag", "deep state", "new world order",     "shadow government", "illuminati",  "psyop", "plandemic",     
              "government mind control","rigged system", "big brother", "surveillance state", "censorship", "black budget", "Area 51",  "crop circles", "ancient aliens",    
              "reptilian", "hollow Earth", "interdimensional beings",  "Bermuda Triangle", "vaccine hoax", "anti-vax", "chemtrails", "fluoride",  "pandemic hoax", "climate hoax", "genetic engineering", "eugenics", "population control", "artificial intelligence takeover", "fake science", 
            "JFK assassination", "9.11 inside job", "moon landing hoax", "flat Earth",'flatist', "holocaust denial", "Freemasons", "Knights Templar", "Zionist conspiracy", "Operation Northwoods",
            "pizzagate", "mass media manipulation", "big tech", "mass surveillance", "AI singularity", "robot uprising", 
            "occult", "satanic ritual", "luciferian", "black magic", "symbolism", "numerology", "astrology conspiracy", "esoteric knowledge", "deep state"
            "occult rituals", "secret worship", "paganism cover-up", "Epstein island","Q drop", "Q clearance", "The Storm", "The Great Awakening","WWG1WGA","elite cabal", "pedo ring", "adrenochrome",  "Great Reset",
    "Hollywood pedophiles","media lie", "sheeple", "Guantanamo Bay", "traitors", "red October"]

assign_category(conspiracy, case = False, category = 'q', bar_text = 'Conspiracy')

In [9]:
assign_category(['roblox'], category = '91', bar_text='Roblox')
assign_category(['minecraft'], category = '92', bar_text='Minecraft')
assign_category(['pubg'], category = '94', bar_text='PUBG')
assign_category(['league of legend'], category = '95', bar_text='LoL')
assign_category(['call of duty'], category = '96', bar_text='CoD')
assign_category(['super mario', 'mario kart'], category = '97', bar_text='Super Mario')
assign_category(['pokemon'], category = '98', bar_text='Pokemon')

Category Roblox now has 1274 videos, ellapsed time: 28.78 s
Category Minecraft now has 3553 videos, ellapsed time: 28.05 s
Category PUBG now has 639 videos, ellapsed time: 28.20 s
Category LoL now has 581 videos, ellapsed time: 26.55 s
Category CoD now has 1786 videos, ellapsed time: 26.59 s
Category Super Mario now has 1251 videos, ellapsed time: 36.68 s
Category Pokemon now has 1709 videos, ellapsed time: 27.11 s


In [10]:
assign_category(['audiobook'], case = False, category = 'a')
assign_category(['wwii', 'wwi'], case = False, category = '20')

Category  now has 4975 videos, ellapsed time: 26.89 s
Category  now has 123981 videos, ellapsed time: 29.73 s


In [None]:
edu.category.value_counts()

### Programming

In [11]:
ml = [r'neural network', r'Deep Learning', r'Support Vector Machine', r'K.Nearest', r'K.mean', r'Random Forest', r'Markov Chain', r'AdaBoost', r'Hyperparameter', r'Overfitting', r'Gradient Descent', r'Loss Function']
#python = r"(?=.*\bpython\b)(?=.*\b(programming|program|code|coding)\b)" # very time consuming 1h30 so no go 
keywords = [r'Javascript', r'HTML', r'CSS', r'C\+\+', r'matlab', r'web development', r'software development', r'github', r'http', r'Docker', r'Vscode', r'numpy', r'matplotlib', r'scikit.learn', r'ggplot', r'plotly', r'TensorFlow', r'PyTorch', r'Jupyter']
case_sensitive = [r'AWS', r'OpenCV', r'SQL']

assign_category(ml, case = False, category = '41', bar_text = 'ML')  
assign_category(keywords, case = False, category = '4', bar_text = 'All programming')
assign_category(case_sensitive, case = True, category = '4', bar_text = 'Case sensitive')
assign_category(['SQL'], case = False, category = '4', bar_text = 'SQL', pre_b=False, post_b=False)

Category ML now has 2317 videos, ellapsed time: 87.49 s
Category All programming now has 33937 videos, ellapsed time: 120.24 s
Category Case sensitive now has 37762 videos, ellapsed time: 28.74 s
Category SQL now has 40911 videos, ellapsed time: 23.71 s


In [12]:
extra = ['command prompt', 'codeblocks', 'program[a-zA-Z]* tutorial', 'mysql', 'unreal engine', 'blender', 'VBscript', 'VB net', 'c sharp', 'for loop', 'if clause',
         r'python program[a-zA-Z]*', r'c program[a-zA-Z]*', r'java program[a-zA-Z]*', r'cnc program[a-zA-Z]*','unity3d', 'unity2d', 'unity c', 'raspberry pi', 'arduino', 'visual studio code', 'geeksforgeeks','PERL', 
         'wordpress','PHP','scala'] + ['xml', 'gamedev', r'vmc program[a-zA-Z]*', 'clever programmer','codecademy', r'R program[a-zA-Z]*', 'install python', 'wxpython', 'javafx', 'cherrypy' ]
assign_category(extra, case = False, category = '4', bar_text = 'Extra programming')
# excel, threads, pointer, natural language processing, scala tutorial

Category Extra programming now has 81711 videos, ellapsed time: 198.75 s


In [None]:
# left_programing = detect_left(['programming', 'coding','python'], case = False)
assign_category(['android'], case = False, category = 'android', bar_text = 'android', pre_b=False, post_b=False)
assign_category(['programming', 'coding','python'], case = False, category = '49', bar_text = 'Extra programming')
# left_programing.to_csv(op.join('keys', 'left_programing.csv'))

Category android now has 21956 videos, ellapsed time: 15.51 s
Category Extra programming now has 23655 videos, ellapsed time: 38.10 s


# Gaming

In [14]:
games = ['tetris', 'Grand Theft Auto', 'Epic Games',  'stardew valley' 'Riot Games', 'terraria', 'Elden Ring', 'zelda', 'overwatch', 'Counter.Strike', 'elder scrolls', 'skyrim', 'World of Warcraft', 'apex legends', 'Dead by Daylight', 'EA SPORTS', 'Hearthstone', 'Rainbow Six' ]

brands = ['ASUS ROG', 'Razer', 'SteelSeries', 'Corsair', 'HyperX', 'Logitech', 'nvidia','Thrustmaster', 'Blizzard Entertainment', 'xbox', 'Nintendo','Ubisoft']

case_sensitive = ['PvP', 'MMO', 'MOBA','DOTA', 'AMD']

assign_category(games, case = False, category = '9', bar_text = 'Games')
assign_category(brands, case = False, category = '9', bar_text = 'Brands', post_b= False)
assign_category(case_sensitive, case = True, category = '9', bar_text = 'Case sensitive')

Category Games now has 2075 videos, ellapsed time: 116.57 s
Category Brands now has 5807 videos, ellapsed time: 87.19 s
Category Case sensitive now has 6552 videos, ellapsed time: 29.85 s


In [33]:
# left = detect_left(['wii'], case = False, pre_b=False, post_b=False)
# left.to_csv(op.join('keys', 'left_wii.csv'))
# print('the witcher')
# display(detect_left(['the witcher'], case = False, pre_b=False, post_b=False))
# left = detect_left(['playstation'], case = False, pre_b=False, post_b=False)
# left.to_csv(op.join('keys', 'left_playstation.csv'))

In [15]:
extra = ['witcher 2', 'witcher ii', 'BenQ zowie', 'gameboy', 'witcher III', 'witcher 3', 'emulator', 'playstation 4', 'playstation 3','playstation 2', 'playstation 1', 'gta v', 'gta 5', 'gta iv','gta 4', 'gta 3', 'gta iii', 'playstation vita' 'bandicoot', 'assassin.s creed']
assign_category(extra, case = False, category = '9', bar_text = 'Extra gaming')

Category Extra gaming now has 7432 videos, ellapsed time: 124.43 s


In [17]:
general = ['gaming', r'esport\w*', 'gameplay', 'playstation']
# left_gaming = detect_left(general, case = False, )
# left_gaming.to_csv(op.join('keys', 'left_gaming.csv'))

In [19]:
assign_category(general, case = False, category = '49', bar_text = 'Gaming', post_b = False)

Category Gaming now has 32628 videos, ellapsed time: 40.59 s


# Music

In [21]:
popular_instruments = ['piano', 'keyboard', 'guitar', 'violin', 'drums', 'ukulele', 'uke']
instruments = ['flute', 'bass','saxophone', 'trumpet', 'clarinet', 'cello', 'trombone' , 'harp', 'sing', 'Accordion', 'bagpipes', 'Mandolin', 'marimba']
composers = ['beethoven', 'mozart', 'haydn', 'bach', 'chopin', 'tchaikovsky', 'handel', 'vivaldi', 'rachmaninoff', 'Mendelssohn', 'debussy', 'stravinsky', 'shostakovich', 'prokofiev', 'sibelius', 'mahler', 'brahms', 'schubert', 'schumann', 'liszt', 'wagner', 'verdi', 'puccini', 'rossini', 'bellini', 'donizetti', 'offenbach', 'meyerbeer', 'gounod', 'massenet', 'saint saens', 'faure', 'debussy', 'poulenc', 'milhaud', 'honneger', 'messiaen', 'dutilleux', 'ravel', 'stravinsky', 'bartok', 'shostakovich', 'prokofiev', 'schnittke', 'ligeti']
classical = ['symphony', 'Sonata', 'cantata', 'Arpeggio', 'perfect pitch', 'syncopation', 'fugue', 'opera', 'operetta',  'cantata', 'lied', 'aria','chorale',  'concerto', 'intermezzo']
styles = ['jazz', 'rock', 'reggae', 'R&B', 'heavy metal', 'hard metal', 'hip.hop', 'rap', 'disco', 'techno', 'dubstep', 'drum and bass', 'rave', 'gregorian chant', 'madrigal', 'rhapsody', 'mazurka', 'polonaise', 'waltz', 'tango', 'rumba', 'samba', 'bossa nova', 'cha cha', 'merengue',  'reggaeton', 'dancehall', 'ragga']
festivals = ['Coachella', 'Glastonbury', 'Tomorrowland', 'Lollapalooza', 'Woodstock', 'Montreux Jazz', 'Rock in Rio', ]
artists_2010s = ['Drake', 'Rihanna', 'Bruno Mars', 'Justin Bieber', 'Nicki Minaj', 'Taylor Swift', 
    'Post Malone', 'Ariana Grande', 'Maroon 5', 'Katy Perry', 'Lady Gaga', 'Lil Wayne', 
    'Ed Sheeran', 'Imagine Dragons', 'Cardi B', 'the Weeknd', 'Pitbull', 'Eminem', 
    'Chris Brown', 'One Direction', 'Selena Gomez', 'Justin Timberlake', 
    'Kesha', 'Kendrick Lamar',  'Lady Antebellum', 'Beyonce', 'Jason Aldean', 
    'Sam Smith', 'Khalid', 'Blake Shelton', 'Travis Scott', 
    'Shawn Mendes', 'Usher', 
    'Mumford...Sons', 'Travi. Scott', 'Jay.Z', 
    'Meghan Trainor', 'Big Sean',  'Coldplay', 
    'Florida Georgia Line',  'DJ Snake', 
    'Flo Rida',  'Michael Buble',  'Zac Brown Band', 'Beyonc.', 
    'Jason Derulo', 'The Chainsmokers', 'Halsey', 
    'Ludacris', 'Kanye West', 'Swae Lee', 'Kenny Chesney', 'Miley Cyrus', 
    'Macklemore', 'Carrie Underwood',  'Wiz Khalifa',  
    'Migos', 'Twenty One Pilots', 'Charlie Puth', 'Fetty Wap', 
    'The Black Eyed Peas', 'XXXTentacion', 'Eric Church', 
    'OneRepublic', 'Juice WRLD', '21 Savage', 'Young Thug', 'Billie Eilish', 
    'Ty Dolla .ign', 'LMFAO', 'Bebe Rexha', 'Britney Spears', 'Chris Stapleton', 
    'Iggy Azalea', 'Calvin Harris', '2 Chainz', 'Britney Spears',  
     'Sia', 'Ne.Yo', 'Pentatonix', 'Christina Aguilera', 'Kidz Bop Kidz', 'Juicy J', 
    'David Guetta', 'Quavo', 'U2', 'Lil Nas X', 'Ellie Goulding', 'Bastille', 'The Lumineers', 
    'Daft Punk', 'Pharrell Williams', 'The Rolling Stones', 'Marshmello', 'Train', 
    'Luke Bryan', 'Trey Songz', 'Snoop Dogg', 'Demi Lovato',  'Sam Hunt', 
    'Enrique Iglesias',  'Taio Cruz', 'Camila Cabello', 'Tyga', 'Lil Uzi Vert', 
    'Alessia Cara', 'Panic. at the Disco', 'will.i.am', 'Miranda Lambert', 
    'Bruce Springsteen', 'Kelly Clarkson', 'Jeremih',  'Charli XCX', 
    'Lil Baby', 'Thomas Rhett', 'John Legend', 'Meek Mill',  'Keith Urban', 
    'Lorde', 'Bon Jovi', 'Carly Rae Jepsen', '5 Seconds of Summer', 'Paul McCartney', 'G.Eazy']
# dances 
others = ['concert', 'karaoke', 'musical', 'garage band', 'sheet music', 'Soundtrack','songwriter', 'DJ'] # festival cooccuring with music # chord

from config import content_categories, inverted_categories 

for instr in popular_instruments:
    pattern = rf'\b{instr} tutorial\b'
    ins = instr if instr != 'uke' else 'ukulele'
    ins = ins if ins != 'keyboard' else 'piano'
    assign_category([pattern], case = False, category = inverted_categories[f'{ins} tutorial'], bar_text = instr, pre_b=False, post_b=False)

assign_category(composers, case = False, category = '87', bar_text = 'Composers')
assign_category(classical, case = False, category = '87', bar_text = 'Classical')
assign_category(popular_instruments, case = False, category = '8', bar_text = 'Popular instruments')
assign_category(instruments, case = False, category = '8', bar_text = 'Instruments')
assign_category(styles, case = False, category = '8', bar_text = 'Styles') 
assign_category(festivals, case = False, category = '8', bar_text = 'Festivals')
assign_category(artists_2010s, case = False, category = '86', bar_text = 'Artists')
assign_category(others, case = False, category = '8', bar_text = 'Terms')

Category piano now has 8784 videos, ellapsed time: 26.90 s
Category keyboard now has 8853 videos, ellapsed time: 25.30 s
Category guitar now has 3098 videos, ellapsed time: 25.40 s
Category violin now has 1215 videos, ellapsed time: 25.09 s
Category drums now has 72 videos, ellapsed time: 26.11 s
Category ukulele now has 309 videos, ellapsed time: 24.06 s
Category uke now has 313 videos, ellapsed time: 25.44 s
Category Composers now has 13953 videos, ellapsed time: 269.65 s
Category Classical now has 44518 videos, ellapsed time: 109.28 s
Category Music now has 43911 videos, ellapsed time: 61.53 s
Category Instruments now has 96742 videos, ellapsed time: 104.92 s
Category Styles now has 203782 videos, ellapsed time: 163.30 s
Category Festivals now has 204970 videos, ellapsed time: 63.29 s
Category Artists now has 104999 videos, ellapsed time: 577.20 s
Category Terms now has 231334 videos, ellapsed time: 74.90 s


In [None]:
# assign_category(['music'], case = False, category = '89', bar_text = 'music rest', backslah_b_after = False)
left_music = detect_left(['music'], case = False)
left_music.to_csv(op.join('keys', 'left_music.csv'))
#still too general

# Chess

In [22]:
# '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''
chess = ['chess opening', 'chess board', 'chess engine','chess olympiad','world chess', 'chess move','chess tournament', 'pawn promotion', 'chess composition', 'chess theory','Stalemate', 'Zugzwang', 'En passant', 'Castling', 'Gambit', 'Pawn structure', 'Sicilian Defense', 'Caro-Kann', 'King\S* Indian Defense', 'Queen\S* Gambit', 'queen sacrifice', 'rook sacrifice', 'bishop sacrifice',
         'deep blue','Ruy Lopez', 'Scandinavian Defense','Gukesh Dommaraju', 'World Chess Championship','chaturanga', 'Magnus Carlsen','José Capablanca','Vasily Smyslov','Viktor Korchnoi','Alexander Alekhine', 'Magnus Carlsen', , 'Hikaru', 'Hikaru Nakamura', 'Mikhail Botvinnik', 'Emanuel Lasker', 'Garry Kasparov', 'Maxime Vachier-Lagrave','Kasparov', 'Anatoly Karpov','Bobby Fischer', 'Shakhriyar Mamedyarov','Judit Polgar','Fabiano Caruana', 'Wesley So', 'Vishy Anand', 'Levon Aronian',
         'International Chess Federation', 'perpetual check', 'pawnless', 'Grandmaster','International Master', 'International Grandmaster','H.J.R. Murray', 'pawn structure','Lichess', 'Chess.com','Tata Steel Chess','Sinquefield Cup',
         'FIDE World', 'Bullet chess', 'blitz chess', 'rapid chess', 'fischer random', 'Stockfish', 'AlphaZero', 'Leela Chess Zero', 'Komodo','Chess puzzle']
case_sensitive = ['FIDE', 'ELO'] # 'Carlsen' tout seul peut etre pas bonne idee 
assign_category(chess, case = False, category = '8', bar_text = 'Chess')
assign_category(case_sensitive, case = True, category = '8', bar_text = 'Chesss +')

Category Chess now has 237927 videos, ellapsed time: 386.57 s
Category Chesss + now has 238094 videos, ellapsed time: 28.74 s


In [None]:
#'chess', Blunder # maybe chess lingo Nf4, checkmate, elo, pawn, middlegame, chess engine

# sports

In [31]:
football = ["goalkeeper",'uefa','cross bar challenge', "corner kick","free kick","penalty shootout","offside","video assistant referee","hat-trick","Champions League","FIFA World Cup","La Liga","Premier League","Serie A", 'Serie B'"Bundesliga","goal line technology",'cr7', 
"Lionel Messi", "Cristiano Ronaldo", "Neymar", "Kylian Mbapp.", 'Mbappe', 'Mbappé', "Zlatan","Ibrahimovic", "Luka Modric", "Karim Benzema", 'benzema', "Mohamed Salah", 
    "van Dijk", "Lewandowski", "Haaland", "Kevin De Bruyne", "Marta Vieira da Silva", "Megan Rapinoe", "Alex Morgan", "Sam Kerr", "Vivianne Miedema", "Lucy Bronze", "Ada Hegerberg", "Amandine Henry",
    "Manchester United", "Manchester City", "Arsenal", "Tottenham Hotspur", "Newcastle United", "Leeds United", "Aston Villa", "West Ham United", "Brighton & Hove Albion","Wolverhampton Wanderers", "Leicester City",  "Crystal Palace", "Brentford", "Sheffield United"
    "Real Madrid", "Barca", "Atletico Madrid", "Real Betis", "Villarreal", "Real Sociedad",  "Athletic Bilbao", "Celta Vigo", "Rayo Vallecano", "Espanyol", 
    'calcio', "AC Milan", "Inter Milan", "Juventus", "ssc Napoli", "As Roma", "ss Lazio", "Atalanta bc", "acf Fiorentina", "Torino fc", "ac Monza"
    "Bayern Munich", "Borussia Dortmund", "RB Leipzig", "Bayer Leverkusen", "Eintracht", "Borussia M.nchengladbach", "Schalke 04", "Union Berlin", "Hertha Berlin","Augsburg", "Werder Bremen", "Hamburger SV"
    "Paris Saint.Germain", "Olympique Marseille", "Olympique Lyon", "as Monaco", 
    'benfica', 'futebol clube porto',"Sporting CP", "Ajax", "PSV Eindhoven","Galatasaray", "Fenerbah.e", "Be.ikta.",
    "LA Galaxy", "Los Angeles FC", "New York City FC", "Seattle Sounders", "Atlanta United", "Inter Miami", "Orlando City SC",
]
# beware of those that are also cities 
# "Everton", "Chelsea" ,'Liverpool', "Sevilla", "Valencia" ,'napoli'
# VAR

american_football = ["quarterback","linebacker","wide receiver","running back","field goal","Super Bowl","college football","two-point conversion","yard line", "Tom Brady", "Patrick Mahomes", "Aaron Rodgers", "Peyton Manning", 
            "Joe Montana", "Drew Brees", "Russell Wilson", "Odell Beckham Jr",
            "Ezekiel Elliott", "J.J. Watt", "Travis Kelce", "Saquon Barkley",
            "Lamar Jackson", "DeAndre Hopkins", "Tyreek Hill", "Von Miller",
             "Rob Gronkowski", "Walter Payton", "Jerry Rice", "Jim Brown", "Dallas Cowboys", "New England Patriots", "Green Bay Packers", 
            "Kansas City Chiefs", "Pittsburgh Steelers", "San Francisco 49ers", 
            "Chicago Bears", "Seattle Seahawks", "New York Giants", "Buffalo Bills",
         "Los Angeles Rams", "Philadelphia Eagles", "Denver Broncos", 
          "Baltimore Ravens", "Indianapolis Colts", "Tampa Bay Buccaneers"]
#"NFL"


cricket = ['cricket',"Indian Premier League","The Ashes", "Virat Kohli", "Sachin Tendulkar", "MS Dhoni", "Steve Smith",
            "Kane Williamson", "Joe Root", "Ben Stokes", "AB de Villiers","Jacques Kalli", "Ricky Ponting", "Muttiah Muralitharan", "Shane Warne",
            "Lasith Malinga", "Brett Lee", "Ellyse Perry", "Mithali Raj","Harmanpreet Kaur", "Jhulan Goswami", "Meg Lanning", "Heather Knight",
            "Mumbai Indian", "Chennai Super King", "Kolkata Knight Rider","Sydney Sixer", "Perth Scorcher", "Surrey County Cricket", 
            "Mumbai Cricket", "Guyana Amazon Warrior", "Barbados Royal", "Karachi King", "Islamabad United", 
            "Melbourne Star", "Hampshire County Cricket", "Central Stag", "Delhi Capital"
]

basketball = ['basketball', "March Madness",
                "Los Angeles Lakers", "Boston Celtics", "Golden State Warriors","Chicago Bulls", "Miami Heat", "San Antonio Spurs", "Brooklyn Nets", "Phoenix Suns", "Dallas Mavericks", "Philadelphia 76ers", "Milwaukee Bucks", "New York Knicks", "Utah Jazz", "Denver Nuggets", 
                "Toronto Raptors", "Houston Rockets","Michael Jordan", "LeBron James",'lebron', "Kobe Bryant", "Shaquille O'Neal", "Steph Curry", "Kevin Durant", "Tim Duncan", "Magic Johnson", "Larry Bird", "Wilt Chamberlain", "Kareem Abdul-Jabbar", "Giannis Antetokounmpo","Dirk Nowitzki", "Scottie Pippen", "Sue Bird", "Diana Taurasi", "Maya Moore", "Breanna Stewart", "Candace Parker", "Lisa Leslie"
            ]
# "NBA","WNBA"

baseball = ["Babe Ruth", "Jackie Robinson", "Hank Aaron", "Willie Mays",  "Derek Jeter", "Albert Pujols", "Barry Bonds", "Clayton Kershaw",
    "Mike Trout", "Nolan Ryan", "Ted Williams", "Ken Griffey Jr.",  "Alex Rodriguez", "Joe DiMaggio", "Sandy Koufax", "Mariano Rivera", "Mookie Betts", "Shohei Ohtani", "Ichiro Suzuki", "Cy Young",
    "New York Yankees", "Los Angeles Dodgers", "Boston Red Sox", "Chicago Cubs", "San Francisco Giants", "Houston Astros", "Atlanta Braves", "St. Louis Cardinals", "Oakland Athletics", 
    "Philadelphia Phillies", "Detroit Tigers", "Cleveland Guardians", "San Diego Padres", "Minnesota Twins", "Toronto Blue Jays", 
    "Seattle Mariners", # baseball
]

sports =["athletics","goal","training","halftime","sportsmanship","medal", 'olympic', 'Jeux olympiques', "Archery", "Athletics", "Badminton", "Baseball", 'Slalom',"Boxing", r"Canoe\w*",'BMX', 'bicycle',"Cycling", "Mountain Bike", "Road Cycling", 
    "Diving", "Golf", "Gymnastics", "Trampoline", "Handball", "Hockey", "Judo", "Karate", "Pentathlon", "Rowing", "Rugby", "Skateboard", "Sport Climbing", "Surfing", "Swimming", "Table Tennis", 'ping.pong', "Taekwondo", "Tennis", "Triathlon", "Volleyball", 
    "Polo", "Weightlift", "Wrestling", "Biathlon", "Bobsleigh", "Cross-Country Skiing", "Curling", "Skating", "Freestyle Skiing", "Luge", "Ski Jump", "Snowboard",
    "Kabaddi", "Floorball", "Lacrosse", "Futsal","Kickboxing", "Muay Thai", "Jiu-Jitsu", "Kung Fu",  "Aikido", "Jeet Kune Do", "Capoeira", "Savate",  "Krav Maga", "Wushu", "Wakeboard"
    "jetski", r"Freediv\w+", r"Spearfish\w*","Squash", "Padel", "Pickleball", "Racquetball"
    "Parkour", "Bouldering", "Wingsuit", "Bungee Jumping", "Motocross", "Skating", "Bowling", "Darts",  "Sepak Takraw", "Teqball", "Petanque",  "Cheerleading",  "Boccia", "Goalball", "Archery", "Athletics", 
    "Powerlifting","Fencing","Sumo", "Kabaddi", "Pencak Silat", "Sepak Takraw", "Dragon Boat Racing", "Buzkashi", "Kitesurf",'Bodyboard', 'windsurf',"Shinty", "Basque Pelota",'Jet.Ski', 'Kayak', 'sailing' , 'Stand.Up Paddle'
]


assign_category(football, case = False, category = '71', bar_text = 'Football')
assign_category(american_football, case = False, category = '73', bar_text = 'American football')
assign_category(cricket, case = False, category = '74', bar_text = 'Cricket')
assign_category(basketball, case = False, category = '72', bar_text = 'Basketball')
assign_category(baseball, case = False, category = '75', bar_text = 'Baseball')
assign_category(sports, case = False, category = '7', bar_text = 'Sports')


Category Football now has 1644 videos, ellapsed time: 555.22 s
Category American football now has 1349 videos, ellapsed time: 249.56 s
Category Cricket now has 1780 videos, ellapsed time: 231.09 s
Category Basketball now has 3880 videos, ellapsed time: 229.56 s
Category Baseball now has 313 videos, ellapsed time: 222.81 s
Category Sports now has 39428 videos, ellapsed time: 579.06 s


In [39]:
assign_category(['sport'], case = False, category = '79', bar_text = 'Sport', post_b = False, pre_b = False)

Category Sport now has 20814 videos, ellapsed time: 26.08 s


# History

In [40]:
history = [
    "Babylon", "Sumer", "Carthage", "Assyria", "Phoenicia", "Byzantium", "Hittite", "Persian", "Mesopotamia", "Gaul", "Akkadia",  "Etruria", "Elam", "Lydia", "Urartu", "Minoa","Hammurabi", "Nebuchadnezzar", "Gilgamesh", "Ashoka", "Cleopatra",  "Ramses", "Hatshepsut", "Akhenaten", "Tutankhamun", "Charlemagne", 
    "Attila", "Genghis Khan", "Kublai Khan", "Alexander the Great", "Julius Caesar", "Marcus Aurelius", "Constantine", "Harun al-Rashid", "Alaric", "Boudica", "Joan of Arc", "Leonidas", "Pericles", "Theodora", "Hypatia", "Suleiman the Magnificent", "Tamerlane",
    'olympus', 'cyclops', "Zeus", "Poseidon", "Hades", "Apollo", "Athena", "Hera", "Ares", "Hermes", "Artemis", "Dionysus", "Persephone", "Ra", "Isis", 
    "Osiris", "Horus", "Set", "Anubis", "Bastet", "Amaterasu", "Susanoo", "Tsukuyomi", "Odin", "Loki", "Freya",  "Heimdall", "Baldur", "Huitzilopochtli", "Quetzalcoatl", "Tezcatlipoca", 
    "Ixchel", "Chac", "Inti", "Viracocha", "Pachamama", "Tlaloc",    "Troy", "Atlantis", "Nineveh", "Pompeii", "Ecbatana", "Tenochtitlan",     "Machu Picchu", "Palmyra", "Hattusa", "Knossos", "Teotihuacan",    "Angkor", "Petra", "Persepolis", "Mohenjo-Daro",   "Harappa", "Ur", "Susa", "Meroë",
    "Dead Sea Scrolls", "Shroud of Turin", "Terracotta Army", "Golden Fleece", "Ark of the Covenant", "Excalibur", "Holy Grail", 
    "Stonehenge", "Hanging Gardens", "Colossus of Rhodes",  "Library of Alexandria", "Antikythera Mechanism", "Sphinx", "Ziggurat",  "Pantheon", "Acropolis",
    "Paleolithic", "Neolithic", "Bronze Age", "Iron Age", "Antiquity", "Middle Ages", "Renaissance", "Reformation", "Enlightenment", "Industrial Revolution",
    "centurion", "samurai", "shogun",   "vizier", "janissary", "mamluk", "satrap", "praetorian guard", "knight-errant", "caliphate", "crusade",   "reich", "dynasty", "empire",
    "Gilgamesh", "Iliad", "Odyssey", "Aeneid", "Bhagavad Gita",     "Mahabharata", "Ramayana", "Book of the Dead", "Analects",     "Tao Te Ching",  "Summa Theologica",   "Peloponnesian War", 
    "pharaoh", "consul", "tsar", "kaiser", "emir", "viceroy", "archduke",  "shah", "imperator", "thane", " caliph", "czar",
    "Crusade", "Hundred Years. War", "War of the Roses", "Peloponnesian War",    "Pax Romana", "Constantinople", "Great Schism", "Renaissance", "Age of Exploration", "Westphalia", 
    "Meiji Restoration", "Partition of India", "American Revolution",     "French Revolution", "Battle of Hastings", "Black Death","mummy", "feudalism",  "oracle", 
    "Cold War", "Space Race", "Iron Curtain",  "Vietnam War",  "Decolonization",   "Great Depression", "New Deal",  "Roaring Twenties", "World War I", "World War II",  "Post-War Reconstruction", "Marshall Plan", 
    "Warsaw Pact", "Cultural Revolution", "Domino Theory",  "Red Scare", "Nuremberg Trials", "Partition of Palestine", 'holocaust',  "Partition of India", "Nehruvian Era",
    "fascism", "nazism", "apartheid",  "imperialism", "anti-colonialism", 
    "Roosevelt", "Winston Churchill", 'churchill', "Joseph Stalin",'stalin','ussr' "Adolf Hitler", 'hitler', "Mahatma Gandhi",   "Jawaharlal Nehru", "John F. Kennedy",  "Martin Luther King",
    "Che Guevara", "Fidel Castro", "Ho Chi Minh",   "Charles de Gaulle", "Mao Zedong",   "Chiang Kai.shek", "Hirohito", "George Patton",   "Dwight D. Eisenhower", "Nikita Khrushchev", 
    "Leonid Brezhnev", "Harry S. Truman", "Lyndon B. Johnson",  "Richard Nixon", "Golda Meir",
    "Battle of Midway", "Stalingrad","Blitzkrieg", "Operation Overlord", "Pearl Harbor",  "Manhattan Project", "Hiroshima", "Nagasaki",  "Korean War", "Gulf of Tonkin", "Tet Offensive",  "Treaty of Versailles", "Potsdam Conference", 
    "Yalta Conference", "Cuban Missile Crisis",   "Montgomery Bus Boycott",  "Black Panthers", "March on Washington",  "Satyagraha", "Salt March", "Kwame Nkrumah",   "Julius Nyerere", "Patrice Lumumba", "Nelson Mandela",
    "Apollo 11", "Sputnik", "Lunar Module",  "Neil Armstrong", "Buzz Aldrin", "Yuri Gagarin", "Vostok", "Gemini",   "Mercury Program", "Jodrell Bank",     "Berlin Wall", "Checkpoint Charlie", "Hiroshima Peace Memorial",  "Independence Hall", "Selma Bridge",  "Mount Rushmore", "Iwo Jima Memorial",
    "KGB", "Third Reich",  "Axis Powers", "Allied Powers",  "Vietnamese National Front",  "Watergate", "Bay of Pigs", "Rosenberg Trials",   "Hollywood Ten", "Munich Massacre", "My Lai Massacre",     "Chernobyl", "Cuban Revolution",
    "Iron Curtain", "Berlin Airlift", "Moon Landing"]

assign_category(history, case = False, category = '20', bar_text = 'History')

Category History now has 571646 videos, ellapsed time: 1473.85 s


In [41]:
assign_category(['edutainment'], case = False, category = '6', bar_text = 'History', post_b = False, pre_b = False)

Category History now has 960 videos, ellapsed time: 15.94 s


In [42]:
assign_category(['yoga', 'fitness','exercise', 'strength training'], case = False, category = '7', bar_text = 'sport', post_b = False, pre_b = False)

Category History now has 73423 videos, ellapsed time: 98.08 s


In [43]:
spirituality = ['spirituality', "Bible", "Quran", "Torah", "Tanakh", "Vedas", "Upanishads", "Bhagavad Gita", "Talmud", "Hadith", "Tripitaka", "Dhammapada", "Guru Granth Sahib", "Avesta", "Zend-Avesta", "Mormon",
    "Yahweh", "Allah", "Jesus", "Christ", "Buddha", "Krishna", "Shiva", "Vishnu", "Ganesha", "Lakshmi", "Hanuman", "Holy Spirit", "Jehovah", "Adonai", "Elohim", "Rama", "Parvati", "Durga", "Kali", "Ahura Mazda", 
    "Moses", "Muhammad","John the Baptist", "Saint Peter", "Saint Paul", "Saint Augustine", "Saint Francis", "Saint Teresa", "Joan of Arc", "Martin Luther", "Calvin", "Guru Nanak", "Mahavira", "Zoroaster", 
    "Archangels", "Djinn", "Asura", "Yaksha", "Naga", "Rakshasa", "Gandharva", "Deva", "Valkyrie", "Fairy", "Satyr", "Centaur", "Phoenix", 
    "Mecca", "Nazareth", "Mount Sinai", "Mount Zion", "Kailash", "Varanasi", "Lumbini", "Bodh Gaya",  "Amritsar", "Monastery", "Abbey", "Chapel", "Synagogue", "Pagoda", "Stupa", "Cathedral", "Temple", 'shrine'
    "Baptism", "Eucharist", "Confession", "Mass", "Hajj", "Salah", "Fasting", "Meditation", "Puja", "Arti", "Yagna", "Namaz", "Mantra",  "Prayer", "Rosary", "Ordination", "Sacrament",
    "Sin", "Karma", "Dharma", "Nirvana", "Samsara", "Moksha", "Resurrection", "Reincarnation", "Purgatory", "Afterlife", 'meditation', 
    "Crucifix", "Chalice", "Ciborium", "Menorah", "Altar", "Tabernacle", "Tefillin", "Rosary", "Reliquary", "Ramadan", "Eid", "Yom Kippur", "Hanukkah", "Holi", "Vesak", "Lent", "Ash Wednesday", "Good Friday", "Rosh Hashanah", "Navaratri", "Durga Puja", "Thaipusam",
    "Monk", "Nun", "Priest", "Imam", "Rabbi", "Swami", "Yogi", "Shaman", "Mystic", "Hermit", "Sage", "Oracle", "Prophetess", "Pope", "Cardinal", "Bishop", "Caliph", "Mysticism", "Asceticism"
]
assign_category(spirituality, case = False, category = '21', bar_text = 'Spirituality')

Category Spirituality now has 309311 videos, ellapsed time: 707.87 s


# Food and cooking

In [50]:
cooking = ['cooking',"mandoline", "microplane", "spatula", "skillet", "wok", "tagine","tandoor",'tandoori', "colander", "zester", "rolling pin", "pastry", "mezzaluna", "pepper mill", "spoon",
    "ladle", "bench scraper", "peeler", "grater","knife", "cleaver", "butcher block", "blender", "pestle",'mortar', "bain.marie", "Dutch oven","ramekin", "cookie", "cake", "pan",
    "ice cream","sous vide", "food processor","pressure cooker", "rice", "stockpot", "griddle", "piping bag", "salad", "citrus", "melon","pasta", "tamis", "potato", "spice",
    "braising", r"saut.ing", "julienne", "maceration", "tempering", "roux", "beurre", r"blanch\w+", r"deglaz\w+", r"emulsif\w+", "proofing", r"carameliz\w+", 
    r"ferment\w*", r"poach\w*", r"sear\w*", r"marinat\w+", "confit", "curdling", 'jam', r"roast\w*",
    "saffron", "truffle", "sumac", "tamarind", "miso", "kombu", "yuzu", "ponzu", "amaranth", "quinoa", "farro", "freekeh","barberries", "vermouth", "creme fraiche", "mascarpone", "fontina", "pecorino", "burrata", "paneer", "gorgonzola",
    "anchovy", "capers", "kimchi", "gochujang", "furikake", 'parmesan', 'parmiggiano',"nori", "wasabi", "adzuki", "tamari", "mirin","guanciale", "prosciutto", "pancetta", "kefir", "labneh",
    "ghee", "lard", "tallow", "duck fat","ratatouille", "bouillabaisse", "cassoulet", "coq au vin","chateaubriand", "paella", "tapas", "gazpacho", "tortilla",
    "gnocchi", "risotto", "carbonara", "osso buco", "tiramisu","strudel", "goulash", "pierogi", "borscht", "pelmeni", "blini", "baklava", "dolma", "shawarma","tabbouleh", "hummus", "shakshuka", "falafel",
    "kimchi", "bibimbap", "bulgogi", "ramen", "sushi", "sashimi", "tempura", "teriyaki", "udon", "katsu", "pho", "banh mi","som tam", "pad thai", "laksa", "rendang", "naan", "rogan josh", "vindaloo", "dosa", "idli", "sambar", "khichdi", "gulab jamun",
    "pavlova", "lamington", "churros", "arepas","empanadas", "tamales", "ceviche", "asado", "clam", "jambalaya", "gumbo","chow mein", "mapo","tofu", "duck",
    "Gordon Ramsay",'Ramsay', "Julia Child", "Anthony Bourdain", "Ina Garten", "Thomas Keller", "Alice Waters","Ferran Adrià", "Massimo Bottura", "Heston Blumenthal","Wolfgang Puck", "Marco Pierre White", "Alain Ducasse","Joël Robuchon", "Jamie Oliver", "Nigella Lawson",
    "Emeril Lagasse", "Rick Bayless", "David Chang","Ottolenghi", "José Andrés", "Rachel Ray","Paul Bocuse", "Rene Redzepi", "Grant Achatz","Pierre Hermé", "Dominique Ansel",
    "umami", "mise en place", "terroir", "degustation", "artisanal", "locavore", "farm.to.table", "gastronomy", "cuisine", "a la carte", "charcuterie", "patisserie", "fromagerie",
    "boucherie", "rôtisserie", "brasserie", "bistro", 'foodporn',  "stir-fry", "barbecue", r"grill\w*", "broiling",  r"pickl\w+",
    "sommelier", "mixology", "crockpot", 'wine', 'beer', "reduction", "garnish", "quenelle","souffle", "meringue", "ganache", "fondant", "praline"
]
assign_category(cooking, case = False, category = '64', bar_text = 'Cooking and food terms')

Category Cooking and food terms now has 166673 videos, ellapsed time: 1333.46 s


In [None]:
# check food

# Diy and home repair

In [51]:
diy_and_home_repair_terms = ['renovation', 'diy', "jigsaw", "bandsaw", "saw","impact driver", "drill",
    "rotary tool", "multitool", r"caulk\w*", "stud finder", "grinder", "sander", "tile cutter", "paint sprayer", "lathe",
    "chisel",  "brick trowel", "tape measure", "awl", "workbench", "vise", "planer", "utility knife", "wire stripper",
    r"solder\w*", "shop vac", "dust extractor", "power washer", "cable tester", "stud extractor", "wrench",
    "hacksaw",  "combination square", "masonry",  "countersink", "plywood", "MDF", "OSB", "drywall", "cement",
    "hardwood", "softwood", r"veneer\w*", "epoxy", "polyurethane", "wood filler", "construction adhesive", 
    "mortar", "silicone sealant", "weatherstripping", "beadboard","crown molding", "baseboard", "dowel rod", "rebar", "paver", "decking", "insulation", "shingle", "underlayment", "PEX pipe", "angle iron", 
    "chipboard", "wainscoting", "stucco", "toggle bolt", "lag screw", "wood screw", "deck screw", "finishing nail", "brad nail", "carriage bolt", "joist hanger",
    "hinge pin", "washer", "split ring", "cotter pin", "cable clamp",
    "upcycling", "decoupage", "distressing", "mortise and tenon", "dovetail joint", "miter joint", "rabbet joint", "lamination", 
    r"varnish\w*", r"weld\w*", r"plaster\w*", "mudding", r"grout\w*", "tuckpointing", "tiling", "etching",  
    r"lacquer\w*", "raised garden bed", "DIY greenhouse", 'diy '
    "Kreg Jig", "Makita", "DeWalt", "Ryobi", "Bosch", "Festool", r"Porter.Cable", "Craftsman", r"Black.Decker", "Fiskars",
    "Simpson Strong-Tie", "Loctite", "Titebond", "Gorilla Glue", "Liquid Nails", "Rust-Oleum", "Minwax", "3M Command", "ScotchBlue", "DAP", "Quickrete",
    "drop cloth",  "tarpaulin", "wood glue",  "masking tape","sandpaper", 
    "putty", "tile spacer", "grout \w+", "chalk line", "work gloves", "tool belt"]

assign_category(diy_and_home_repair_terms, case = False, category = '65', bar_text = 'DIY and home repair terms')

Category DIY and home repair terms now has 35147 videos, ellapsed time: 699.22 s


# Crypto

In [52]:
crypto = [
    # Popular Cryptocurrencies
    "Bitcoin", "Ethereum", "Tether", "Binance Coin", "Cardano", "Solana", "XRP", "Polkadot", "Litecoin", "Chainlink","Avalanche", "Dogecoin", "Shiba Inu", "Stellar", "Monero", "Zcash",  "Uniswap", "Aave", "Compound", "PancakeSwap", 
    "MakerDAO", "Synthetix","Algorand", "Filecoin", "Arbitrum","Elrond", "VeChain",  "Decentraland", "Sandbox","Axie Infinity", "Enjin", "Zilliqa", "Arweave",
    # DeFi (Decentralized Finance)
    "Yield Farming", "Liquidity Pool", "Impermanent Loss","Automated Market Maker", "Decentralized Exchange", "Stablecoin","Collateralized Debt Position", "Flash Loan","Synthetic Asset", "Decentralized Autonomous Organization","Liquidity Mining", "Tokenomics",
    # NFT (Non-Fungible Tokens) and Gaming
    "Non-Fungible Token", # only from 2017
    # Cryptography and Technology
    "Blockchain", "Hash Function", "SHA-256", "Keccak", "Elliptic Curve Cryptography", "Digital Signature", "Public-Private Key Pair", "Distributed Ledger", 
    # Popular Platforms and Tools
    "Coinbase", "Kraken", "Metamask","Trust Wallet", "OpenSea", "Rarible", "Etherscan", "Block Explorer", "CoinMarketCap", "CoinGecko",
]

assign_category(crypto, case = False, category = '25', bar_text = 'Cryptocurrencies')

Category Cryptocurrencies now has 23071 videos, ellapsed time: 361.03 s


In [53]:
len(edu[edu.category != 'unclass'])   
# print the categories
# edu.category.unique()
# edu[edu.category == ''].head(20)

1817955

In [None]:
spirituality = ['spirit', 'psalm', 'mass', 'requiem', 'jesus', 'mohammed', 'prophet', 'bible']

In [54]:
save_df = edu[['display_id', 'channel','name_cc', 'category_cc', 'category']]
save_df.to_csv(op.join(path_deriv, 'subcategories_1512_21h.csv'), index = False)