In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import sys
import os.path as op
import re

# Put the working directory first on the import path so that modules sitting
# next to this notebook can be imported.
sys.path.insert(0, '.')

# Raw inputs.
path_data = op.join('data', 'raw')
path_metadata = op.join(path_data, "yt_metadata_en.jsonl.gz")
path_channels = op.join(path_data, "df_channels_en.tsv.gz")

# Derived files are stored in data/raw/../derivatives.
path_deriv = op.join(path_data, '..', 'derivatives')

# File-name templates for the chunked Education videos; `{}` takes the chunk
# number. They are bare file names on purpose: every consumer in this notebook
# builds the full path with op.join(path_deriv, <template>.format(i)), so
# including path_deriv here as well would duplicate the directory prefix.
path_edu = "Education_videos_{}.csv"
path_edu_clean = "Education_videos_{}clean.csv"

## Channel data 

In [3]:
# Load the channel table (tab-separated, compression inferred from the file
# name) and convert join_date to datetime in the same expression.
df_channels = (
    pd.read_csv(path_channels, compression="infer", sep="\t")
    .assign(join_date=lambda channels: pd.to_datetime(channels["join_date"]))
)

In [4]:
# Keep only the channels whose category is 'Education', then preview 5 random rows.
is_education = df_channels['category_cc'].isin(['Education'])
df_edu = df_channels.loc[is_education]
df_edu.sample(5)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
93193,Education,2013-03-11,UCa9r4ivs4mB3Qt2M2uODpEA,JP Gloria,15100,86,491113.0,10.1095
132100,Education,2007-10-11,UCwDlrvXecueebBA4vAwrDQQ,SCAD - The Savannah ...,10700,438,884463.0,11.939
14488,Education,2011-07-07,UC5bQ6WD_2NLGbfeJYIwAIuA,INKtalks,427021,480,40267.0,3.3105
87586,Education,2016-06-03,UChz_jz50dnm5IlYn6GQHAqg,kristyglassknits,26500,815,443193.0,7.238
90606,Education,2009-07-11,UCrBzGHKmGDcwLFnQGHJ3XYg,giant_neural_network...,24400,41,469286.0,8.0505


## Extraction of YouTube metadata (legacy - see chunk_video_metadata)

In [None]:
import random
import json
import gzip

def filter_jsonl(input, category, batch_size, random_seed, all = False):
    """Collect the entries of one category from a gzipped JSON-lines file.

    The file is streamed line by line. Unless ``all`` is set, a uniform random
    sample of at most ``batch_size`` matching entries is kept with reservoir
    sampling (Algorithm R), so memory use is bounded by ``batch_size``.

    Parameters
    ----------
    input : str or path-like
        Path to the gzip-compressed file holding one JSON object per line.
    category : str
        Value the ``categories`` field of an entry must equal to be kept.
    batch_size : int
        Maximum number of entries returned when ``all`` is False.
    random_seed : int
        Seed passed to ``random.seed`` before sampling (this seeds the global
        ``random`` generator, as the function always did).
    all : bool, default False
        If True, every matching entry is kept and ``batch_size`` is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per retained entry.

    Side effects
    ------------
    Prints the total number of matching entries found in the file.
    """
    filtered_data = []
    random.seed(random_seed)
    counter = 0  # number of matching entries seen so far
    with gzip.open(input, 'rt', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # tolerate blank lines instead of failing in json.loads
                continue
            entry = json.loads(line)
            if entry.get('categories') != category:
                continue

            counter += 1
            if all or len(filtered_data) < batch_size:
                # reservoir not full yet (or no limit requested)
                filtered_data.append(entry)
                continue

            # Reservoir is full: the counter-th match replaces a random slot
            # with probability batch_size / counter. Drawing the index from
            # the number of matches seen (not from the reservoir size) is what
            # makes every match equally likely to end up in the sample.
            slot = random.randint(0, counter - 1)
            if slot < batch_size:
                filtered_data[slot] = entry

    print(f"There are {counter} videos in the {category} category!")
    return pd.DataFrame(filtered_data)

            

# Stream the full metadata dump, keeping a seeded (seed 0) random sample of
# Education videos with a sample size of 500000, then store it with the other
# derived files.
df = filter_jsonl(
    input=path_metadata,
    category='Education',
    batch_size=500000,
    random_seed=0,
    all=False,
)
df.to_csv(op.join(path_deriv, 'df_edu_all.csv'))

There are 3795564 videos in the Education category!


## Preprocessing of video descriptions

In [None]:
# Chunk number to process -- iterate over the 9 files by changing this value.
i = 0
chunk_file = op.join(path_deriv, path_edu.format(i))
df = pd.read_csv(chunk_file, index_col=0)

# The description is cleaned on its own because it carries far more
# unnecessary text than the other fields; additionally, some titles contain
# keywords that are only removed from descriptions.
df['desc_clean'] = df['description'].fillna('')

# Title and tags are merged into a single text column (missing values -> '').
title_part = df['title'].fillna('')
tags_part = df['tags'].fillna('')
df['text'] = title_part + " " + tags_part

#### pipeline

In [6]:
# Lower-case both text columns so that the same word in different casing is
# treated the same; non-string values are passed through untouched.
for column in ('text', 'desc_clean'):
    df[column] = df[column].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [7]:
# Patterns used to strip links and the promotional text attached to them.

# Explicit URLs: anything starting with http://, https:// or www.
url_pattern = re.compile(r'(https?://\S+|www\.\S+)')
# A run of non-space characters ending in ".com".
com_pattern = re.compile(r'([^\s]+\.com)')
# "facebook page" / "facebook group" together with the rest of that line.
fb_pattern = re.compile(r'(facebook\s+page|facebook\s+group)[^\w\s]*.*?(\n|$)')
# Lines that start with a brand or website word: the preceding newline(s), the
# word, optional punctuation, whitespace and the next non-space token.
link_pattern = re.compile(r'(\n+)(facebook|twitter|pinterest|tumblr|instagram|website|amazon)[^\w\s]*\s+\S+')
# Same brand words at the start of a line, matched on their own.
link_pattern2 = re.compile(r'(\n+)(facebook|twitter|pinterest|tumblr|instagram|website|amazon)')

#TODO lines that contain these websites in the middle of text - hard to implement since not sure if discard

# Alphanumeric runs of 21+ characters: most English words are shorter than
# that, so these are assumed to be identifiers such as crypto wallet ids.
long_words = re.compile(r'\b[a-zA-Z0-9]{21,}\b')

def remove_urls(text, desc = False):
    """Strip URLs, ".com" tokens and over-long alphanumeric tokens from text.

    Parameters
    ----------
    text : str
        Text to clean.
    desc : bool, default False
        When True the text is treated as a video description and the
        facebook page/group phrases and brand-led lines are removed as well.
        Titles and tags are cleaned with desc=False so that brand names,
        which might be the focus of the video, are kept.

    Returns
    -------
    str
        The text with every match replaced by the empty string.
    """
    for pattern in (url_pattern, com_pattern, long_words):
        text = pattern.sub('', text)
    if not desc:
        return text
    for pattern in (fb_pattern, link_pattern, link_pattern2):
        text = pattern.sub('', text)
    return text

# Descriptions are cleaned with desc=True; the title+tags text uses the default.
df['desc_clean'] = df['desc_clean'].apply(lambda description: remove_urls(description, desc=True))
df['text'] = df['text'].apply(lambda title_and_tags: remove_urls(title_and_tags))
#test = 'https://www.youtube.com/everydaytacticalvids\n my twitter account - https://twitter.com/everydaytactic1\n my facebook group tha tha \n ow'
#remove_urls(test, True)

In [8]:
def clean_non_word(text):
    """Replace punctuation and underscores with spaces.

    Every character that is neither a word character nor whitespace becomes
    one space. Underscores count as word characters in regular expressions,
    so runs of them are matched explicitly; each run becomes one space.
    """
    punctuation_or_underscores = r'[^\w\s]|_+'
    return re.sub(punctuation_or_underscores, ' ', text)

def clean_non_ascii(text):
    """Replace each run of non-ASCII characters with a single space.

    Meant for characters that survive clean_non_word, such as text written in
    non-Latin scripts.
    """
    non_ascii_run = r'[^\x00-\x7F]+'
    return re.sub(non_ascii_run, ' ', text)

# Strip punctuation first, then any remaining non-ASCII characters, in both columns.
for column in ('desc_clean', 'text'):
    df[column] = df[column].apply(clean_non_word).apply(clean_non_ascii)

In [9]:
def clean_numeric(text):
    """Remove digit runs that are not directly followed by a letter.

    Numbers attached to a following letter ("1080p", "3d") are kept whole.
    Stand-alone numbers, and digits at the end of a token ("mp3" -> "mp"),
    are removed.

    The lookahead also rejects a following digit. Without that, the regex
    engine backtracks inside a number and deletes only its leading digits,
    e.g. "1080p" became "0p".

    # TODO maybe remove all numbers
    """
    return re.sub(r'\d+(?![a-zA-Z\d])', '', text)
# Apply the numeric clean-up to both text columns.
for column in ('desc_clean', 'text'):
    df[column] = df[column].apply(clean_numeric)

In [10]:
def clean_space_newline(text):
    """Turn newlines into spaces, collapse whitespace runs and trim the ends.

    Only runs of two or more whitespace characters are collapsed to one
    space; a single whitespace character other than a newline (e.g. one tab)
    is left as it is.
    """
    without_newlines = text.replace('\n', ' ')
    collapsed = re.sub(r'\s{2,}', ' ', without_newlines)
    return collapsed.strip()

# Normalise whitespace in both text columns.
for column in ('desc_clean', 'text'):
    df[column] = df[column].apply(clean_space_newline)

In [None]:
# Recombine the cleaned title+tags text with the cleaned description, drop the
# two intermediate columns and write the cleaned chunk to disk.
merged_text = df['text'] + " " + df['desc_clean']
df['text_clean'] = merged_text
df = df.drop(columns=['text', 'desc_clean'])
clean_chunk_file = op.join(path_deriv, path_edu_clean.format(i))
df.to_csv(clean_chunk_file)

In [None]:
# Optional: drop the raw description column to make the dataframe smaller.
df = df.drop(columns=['description'])

##### debug

In [None]:
# Debug aid (disabled): list the titles that mention 'twitter'.
#df[df['title'].str.contains('twitter')]['title']

# Debug aid, disabled by wrapping it in a string literal: print one lower-cased
# description and run remove_urls on it. None of the code inside the string is
# executed; the cell only evaluates the string itself.
"""string = df['description'].iloc[5046].lower()
print(string)
remove_urls(string)#.replace('website', 'website blab bla truc muche promotion')"""

In [None]:
"""pd.set_option('display.max_colwidth', 80)
df.sample(5)[['desc_clean', 'text']]"""
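# Some examples of problematic data (the numbers presumably identify rows of df):
# 32379 crypto video with lots of links
# 35962 marseillaise
# 22678 41830 non-alphabetic characters and Indian scripts
# 34547 numbers attached to letters
# 17337 very long strings in crypto videos
# 3235 hindu (presumably Hindi text)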

# Categorize subtopics - BERT (legacy - see Fred's code)

In [None]:
# no need to perform lemmatization, stemming or stopword removal since BERT handles it
#df= pd.read_csv(op.join(path_deriv, 'df_edu_500k_clean.csv'))
# index_col=0: the cleaned chunk was written with DataFrame.to_csv, which also
# stores the index as the first column. Reading it back as the index (as is
# done for the raw chunk) avoids a stray "Unnamed: 0" column.
df = pd.read_csv(op.join(path_deriv, path_edu_clean.format(i)), index_col=0)

In [None]:
pd.set_option('display.max_colwidth', 80)
# Work on a random subset of at most 50000 videos.
# - random_state makes the subset repeatable, so the embeddings and cluster
#   files saved further down always correspond to the same rows.
# - the size is capped at len(df) because sample() raises when asked for more
#   rows than the frame holds.
# sample() already returns a new frame, so no explicit copy is needed.
trial = df.sample(n=min(50000, len(df)), random_state=0).reset_index(drop=True)
trial.head(5)

In [None]:
# Try to extract sub topics with sentence transformers like BERT
from sentence_transformers import SentenceTransformer # https://sbert.net/
from sklearn.cluster import KMeans

print('1. Encoding model...')
model = SentenceTransformer('all-MiniLM-L6-v2')
# Pass a plain list of strings to encode():
# - trial['text_clean'] raises a clear KeyError if the column is missing,
#   whereas trial.get(...) would silently hand None to the model;
# - missing values are replaced by '' so every item is a string.
sentences = trial['text_clean'].fillna('').astype(str).tolist()
embeddings = model.encode(sentences)
# Author's note: the encoding step took about 41 minutes.
np.save(op.join(path_deriv, 'embeddings.npy'), embeddings)

1. Encoding model...


In [20]:
print('2. Starting kmeans...')
# Number of sub-topic clusters to look for in the embeddings.
num_clusters = 50
# random_state is fixed so that repeated runs start from the same initialisation.
kmeans = KMeans(n_clusters=num_clusters, random_state=0)

2. Starting kmeans...


In [None]:
# Author's note: fitting took about 10 s.
print('3. Fitting k means...')
# fit() returns the estimator itself, so `clusters` is the fitted `kmeans`.
clusters = kmeans.fit(embeddings)
cluster_assignment = clusters.labels_
print('4. Predicting labels...')
# labels_ already holds the cluster of every fitted row, so it is reused
# instead of running predict() over the same embeddings a second time. This
# also keeps trial['cluster'] identical to kmeans.labels_, which the cluster
# count at the end of the notebook reads.
trial['cluster'] = cluster_assignment

3. Fitting k means...


  File "c:\Users\gbrag\miniconda3\envs\ada\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [None]:
# Show up to 5 random videos of every cluster for a manual sanity check.
for cluster_num in range(num_clusters):
    print(f"Cluster {cluster_num}")
    cluster_data = trial[trial['cluster'] == cluster_num]
    # sample() raises when asked for more rows than are available, so the
    # preview size is capped by the size of the cluster.
    n_preview = min(5, len(cluster_data))
    display(cluster_data.sample(n_preview)[['title', 'text_clean', 'tags']])

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Maps cluster number -> name made of the cluster's 3 top terms joined by '.'.
cluster_dict = {}

for cluster in range(kmeans.n_clusters):
    # Select the cluster's texts with a boolean mask; iterating over every row
    # of `trial` with iterrows() once per cluster was needlessly slow.
    texts = trial.loc[trial['cluster'] == cluster, 'text_clean'].tolist()

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(texts)

    # only one category per cluster
    lda = LatentDirichletAllocation(n_components=1, random_state=1)
    lda.fit(tfidf_matrix)

    feature_names = tfidf.get_feature_names_out()
    # indices of the 3 highest-weighted terms of the single topic, best first
    top_idx = lda.components_[0].argsort()[-3:][::-1]

    terms = [feature_names[idx] for idx in top_idx]
    cluster_name = '.'.join(terms)
    cluster_dict[cluster] = cluster_name
    print(f"Cluster {cluster}, {len(texts)} videos: {cluster_name}")

Cluster 0, 121 videos: asl.sign.language
Cluster 1, 1601 videos: ca.gk.learning
Cluster 2, 1833 videos: trp.tv.live
Cluster 3, 1137 videos: video.youtube.use
Cluster 4, 451 videos: sunday.adelaja.pastor
Cluster 5, 867 videos: tedx.ted.organized
Cluster 6, 1231 videos: money.marketing.online
Cluster 7, 1199 videos: maths.examsolutions.math
Cluster 8, 659 videos: food.bajias.cooking
Cluster 9, 1161 videos: garden.house.gardening
Cluster 10, 1388 videos: kids.cbeebies.learn
Cluster 11, 956 videos: rhymes.songs.nursery
Cluster 12, 2153 videos: excel.data.video
Cluster 13, 1403 videos: english.learn.japanese
Cluster 14, 1263 videos: dr.health.medical
Cluster 15, 975 videos: life.health.mental
Cluster 16, 1331 videos: news.video.use
Cluster 17, 984 videos: business.management.iese
Cluster 18, 1145 videos: chess.history.course
Cluster 19, 1620 videos: tips.remedies.awesome
Cluster 20, 287 videos: san.diego.air
Cluster 21, 1004 videos: survival.asp.knife
Cluster 22, 841 videos: trading.day.mar

In [None]:
# Attach the cluster name to every video and save the labelled sample.
cluster_names = trial['cluster'].map(cluster_dict)
trial['cluster_name'] = cluster_names
trial.to_csv(op.join(path_deriv, 'trial_clustered_50k.csv'))

In [None]:
# Count how many videos were assigned to each cluster: `unique` holds the
# sorted distinct labels and `counts` the number of occurrences of each.
unique, counts = np.unique(kmeans.labels_, return_counts=True)
counts

array([ 121, 1601, 1833, 1137,  451,  867, 1231, 1199,  659, 1161, 1388,
        956, 2153, 1403, 1263,  975, 1331,  984, 1145, 1620,  287, 1004,
        841, 1019, 1265,  981, 1286, 1125,  993,  626,  412,  843, 1465,
        109,  872, 1116,  420,  467, 1737, 1382, 1186, 1257,  545,  756,
        599, 1215,  628,  324,  890,  872], dtype=int64)