In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import subprocess
from pathlib import Path

# ---------------------------------------------------------------------------
# Project layout. Everything lives under ~/zims/<PROJECT_NAME>/.
# ---------------------------------------------------------------------------
PROJECT_NAME = "0223-crashcourse_1GB"
PREFIX = Path.home() / "zims"
CACHE_DIR = PREFIX / PROJECT_NAME / "youtube" / "cache"
TARGET_SIZE = 1000000000  # 1GB
BASE_DIR = PREFIX / PROJECT_NAME
WORKING_DIR = PREFIX / PROJECT_NAME / "working"
PROJECT_DIR = PREFIX / PROJECT_NAME / "tree"
OUTPUT_DIR = PREFIX / PROJECT_NAME / "output_tree"
SOURCE_DIR = PREFIX / PROJECT_NAME / "zim-src"
NEW_ZIM_DIR = PREFIX / PROJECT_NAME / "new-zim"
PROOF_DIR = PREFIX / PROJECT_NAME / "proof"

# Create the whole directory layout up front so later cells can assume it exists.
directories = [OUTPUT_DIR, PROJECT_DIR, CACHE_DIR / "video_json", SOURCE_DIR, NEW_ZIM_DIR, PROOF_DIR, WORKING_DIR]
for directory in directories:
    directory.mkdir(parents=True, exist_ok=True)

# Either a local path below /library/zims/ (symlinked into SOURCE_DIR) or a URL
# that is fetched with wget.
source_url = "/library/zims/content/crashcourse_en_all_2023-02.zim"

# Reuse whatever is already in SOURCE_DIR; sorting makes the pick deterministic
# when more than one entry is present.
existing_sources = sorted(SOURCE_DIR.glob("*"))
if existing_sources:
    ZIM_PATH = existing_sources[0].resolve()
    print(ZIM_PATH)
else:
    # ZIM_PATH is assigned on BOTH branches so the existence check below
    # never hits an undefined name.
    ZIM_PATH = SOURCE_DIR / source_url.split("/")[-1]
    if source_url.startswith("/library/zims/"):
        ZIM_PATH.symlink_to(source_url)
    else:
        # Argument list (no shell) so paths/URLs with spaces or shell
        # metacharacters are passed through verbatim.
        subprocess.run(["wget", "-P", str(SOURCE_DIR), source_url])

# Path.exists() follows symlinks, so a dangling link is reported as missing.
if not ZIM_PATH.exists():
    print(f"ZIM file at path {ZIM_PATH} not found. Exiting.")
    # SystemExit stops both a plain script and a notebook cell.
    raise SystemExit(1)


In [None]:
# Show the two base paths, comma separated, as a quick sanity check.
print(PREFIX, PROJECT_DIR, sep=',')

In [None]:
# Fetch a current copy of the de-namespace.sh helper from the local
# iiab-factory checkout and place it in PREFIX.
FACTORY_REPO = '/opt/iiab/iiab-factory/'
script_src = Path(FACTORY_REPO) / 'content/kiwix/zim-filter/de-namespace.sh'
# An argument list (instead of a shell string) keeps paths containing spaces intact.
cmd = ['/bin/cp', str(script_src), str(PREFIX)]
copy_result = subprocess.run(cmd)
if copy_result.returncode != 0:
    # Surface the failure here rather than as a confusing error in the next cell.
    print('Could not copy %s to %s (cp exit status %s)'
          % (script_src, PREFIX, copy_result.returncode))

In [None]:
# Run the de-namespace.sh helper with the source ZIM and BASE_DIR as arguments.
# According to the original author's notes, the helper uses zimdump to expand
# the ZIM into the "tree" directory and returns without doing anything when
# that directory is not empty (not verifiable from this notebook).
print('Using zimdump to expand the zim file to %s' % BASE_DIR)
progname = f"{PREFIX}/de-namespace.sh"
cmd = " ".join([progname, str(ZIM_PATH), str(BASE_DIR)])
print('command:%s' % cmd)
subprocess.run(cmd, shell=True)

In [None]:
# WARNING: wait until this cell has finished before running the next one;
# later cells read the JSON files written here.
# (An orange hourglass icon in the browser tab means the cell is still running.)

# yt_dlp is used instead of youtube_dl so the run is not "stopped" by age restrictions.
import os
import json
from yt_dlp import YoutubeDL

ydl_opts = {
    # Keep going when a video cannot be fetched; extract_info() is then
    # expected to hand back None, which is handled explicitly below.
    'ignoreerrors': True
}
ydl = YoutubeDL(ydl_opts)
print('Downloading metadata from Youtube')
downloaded = 0
skipped = 0
failed = 0
json_dir = CACHE_DIR / 'video_json'
# Every entry below <tree>/videos/ is named after a YouTube id.
yt_id_list = os.listdir(str(PROJECT_DIR) + '/videos/')

# Enter the downloader context once for the whole run instead of once per
# video, so it is not closed and reused between lookups.
with ydl:
    for yt_id in yt_id_list:
        json_path = json_dir / (yt_id + '.json')
        if json_path.exists():
            # already cached by an earlier run
            skipped += 1
            continue
        result = ydl.extract_info(
            'http://www.youtube.com/watch?v=%s' % yt_id,
            download=False  # metadata only
        )
        if result is None:
            failed += 1
        else:
            downloaded += 1
            # yt-dlp's documented way to make the info dict JSON-serialisable.
            result = ydl.sanitize_info(result)
        # Serialise BEFORE opening the file so a serialisation error cannot
        # leave an empty file behind that would look like a cache hit.
        # A failed lookup is still cached (as JSON null), as before.
        payload = json.dumps(result)
        with open(json_path, 'w') as fp:
            fp.write(payload)

print('%s skipped and %s downloaded'%(skipped,downloaded))
if failed:
    print('%s lookups failed and were cached as empty results' % failed)

In [None]:
# Parse the category -> videos mapping that ships with the extracted ZIM.
def get_assets_data():
    """Read ``<PROJECT_DIR>/assets/data.js`` and return its sections as a dict.

    The file is treated as a sequence of sections. A line starting with
    ``var`` opens a section; the lines that follow are accumulated behind a
    leading ``[`` until the next ``var`` line (or end of file), at which point
    the last two characters are clipped and the text is parsed as JSON.

    Returns:
        dict mapping section name -> parsed JSON value. The name is the
        header line with its first 9 and last 4 characters removed, taken
        verbatim (any whitespace left inside that slice is kept).

    A section that cannot be parsed is reported with a ``Parse error`` line
    and skipped; parsing continues with the next section.
    """
    data = {}
    cat = None      # name of the section being accumulated; None before the first header
    outstr = '['
    with open(PROJECT_DIR / 'assets/data.js', 'r') as fp:
        line = fp.readline()
        while True:
            # readline() returns '' at end of file, which also closes the last section
            if line.startswith('var') or not line:
                if len(outstr) > 3:
                    # clip off the trailing semicolon and newline
                    outstr = outstr[:-2]
                    try:
                        if cat is None:
                            # content appeared before any 'var' header
                            raise ValueError('section without a header')
                        data[cat] = json.loads(outstr)
                    except ValueError:
                        # json.JSONDecodeError is a ValueError
                        print('Parse error: %s'%outstr[:80])
                cat = line[9:-4]
                outstr = '['
                if not line:
                    break
            else:
                outstr += line
            line = fp.readline()
    return data

# Parsed once at module level; get_zim_data() below looks videos up in this mapping.
zim_category_js = get_assets_data()
# print(json.dumps(zim_category_js,indent=2))
def get_zim_data(yt_id):
    """Return the first entry in ``zim_category_js`` whose ``'id'`` equals ``yt_id``.

    Categories are searched in dictionary order and entries in list order.
    An empty dict is returned when no entry matches.
    """
    for entries in zim_category_js.values():
        for entry in entries:
            if entry['id'] == yt_id:
                return entry
    return {}
# ans = get_zim_data('usdJgEwMinM')
# print(json.dumps(ans,indent=2))

In [None]:
# The pymediainfo package must be installed inside the venv (pip3 install pymediainfo).
# pymediainfo will not work correctly unless the libmediainfo0v5 library is also installed (apt install libmediainfo0v5).
from pprint import pprint
from pymediainfo import MediaInfo

def mediainfo_dict(path):
    """Return the ``MediaInfo.parse(path).to_data()`` dictionary for ``path``.

    Any failure while parsing is reported on stdout together with the
    underlying error, and an empty dict is returned instead.
    """
    try:
        minfo = MediaInfo.parse(path)
    except Exception as err:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C / kernel
        # interrupts still propagate; the real cause is shown because the
        # failure is not necessarily a missing file.
        print('mediainfo_dict. could not read media info for %s: %s' % (path, err))
        return {}
    return minfo.to_data()
def select_info(path):
    """Pick the fields of interest out of ``mediainfo_dict(path)``.

    Returns:
        dict with (when the matching track is present)
          * General track: ``file_size``, ``bit_rate``, ``time``
          * Audio track:   ``a_stream``, ``a_rate``, ``a_channels``
          * Video track:   ``v_stream``, ``v_format``, ``v_rate``,
                           ``v_frame_rate``, ``v_width``, ``v_height``
        Numeric fields default to 0 and text fields to '' when the track
        lacks them. An empty dict is returned when no media information is
        available, which is the case callers test for with ``len(...) == 0``.
    """
    # Kept local: the previous ``global data`` overwrote the notebook-level
    # ``data`` variable that other cells use for the YouTube metadata.
    info = mediainfo_dict(path)
    rtn = {}
    for track in info.get('tracks', []):
        kind = track.get('kind_of_stream')
        if kind == 'General':
            rtn['file_size'] = track.get('file_size',0)
            rtn['bit_rate'] = track.get('overall_bit_rate',0)
            # 'other_duration' is a list of renderings; take the first, tolerate absence
            rtn['time'] = (track.get('other_duration') or [''])[0]
        if kind == 'Audio':
            rtn['a_stream'] = track.get('stream_size',0)
            rtn['a_rate'] = track.get('maximum_bit_rate',0)
            rtn['a_channels'] = track.get('channel_s',0)
        if kind == 'Video':
            rtn['v_stream'] = track.get('stream_size',0)
            rtn['v_format'] = (track.get('other_format') or [''])[0]
            rtn['v_rate'] = track.get('bit_rate',0)
            rtn['v_frame_rate'] = track.get('frame_rate',0)
            rtn['v_width'] = track.get('width',0)
            rtn['v_height'] = track.get('height',0)
    return rtn

In [None]:
# the database created here can be found at ../working directory
import sqlite3
class Sqlite():
    """Small convenience wrapper around one sqlite3 connection and cursor.

    Attributes:
        conn: the ``sqlite3.Connection``; rows are returned as ``sqlite3.Row``
              (addressable by column name) and text as ``str``.
        c:    a cursor on ``conn`` used for all statements.
    """

    def __init__(self, filename):
        self.conn = sqlite3.connect(filename)
        self.conn.row_factory = sqlite3.Row
        self.conn.text_factory = str
        self.c = self.conn.cursor()

    def close(self):
        """Commit pending changes and release the cursor and connection.

        Safe to call more than once, and on an instance whose ``__init__``
        did not complete.
        """
        conn = getattr(self, 'conn', None)
        if conn is None:
            return
        conn.commit()
        cursor = getattr(self, 'c', None)
        if cursor is not None:
            cursor.close()
        conn.close()
        self.conn = None
        self.c = None

    def __del__(self):
        # Finalisers must not raise: errors here cannot be handled by anyone
        # and would only surface as "Exception ignored in ..." noise.
        try:
            self.close()
        except sqlite3.Error:
            pass

def get_video_json(path):
    """Load the cached YouTube metadata stored as JSON at ``path``.

    Returns:
        The decoded JSON object as a dict. An empty dict is returned (after
        printing the reason) when the file cannot be read, does not contain
        valid JSON, or contains something other than a JSON object -- e.g.
        the ``null`` that is cached for a video whose lookup failed.
    """
    try:
        with open(path,'r') as fp:
            jsonstr = fp.read()
    except (OSError, UnicodeDecodeError) as e:
        print(e)
        return {}
    try:
        modules = json.loads(jsonstr.strip())
    except ValueError as e:
        # json.JSONDecodeError is a ValueError
        print(e)
        print(jsonstr[:80])
        return {}
    if not isinstance(modules, dict):
        # Callers use dict methods (.get) on the result.
        return {}
    return modules

def video_size(yt_id):
    """Return the size in bytes of ``<PROJECT_DIR>/videos/<yt_id>/video.webm``.

    Raises:
        OSError: if the file does not exist (from ``os.path.getsize``).
    """
    # PROJECT_DIR is a pathlib.Path, so it cannot be concatenated to a str
    # with ``+``; build the path with os.path.join instead.
    return os.path.getsize(os.path.join(str(PROJECT_DIR), 'videos', yt_id, 'video.webm'))

def make_directory(path):
    """Create ``path`` (including missing parents) unless something already exists there."""
    if os.path.exists(path):
        return
    os.makedirs(path)

def download_file(url,todir):
    """Download ``url`` into directory ``todir``.

    The local file name is the part of ``url`` after its last ``/``.

    Raises:
        urllib.error.URLError: if the URL cannot be fetched (HTTP error
            statuses raise its subclass ``HTTPError``).
        OSError: if the target file cannot be written.
    """
    # Standard-library imports are kept local so the function is self-contained.
    # The previous body called ``requests``, which this notebook never imports.
    import os
    import shutil
    import urllib.request

    local_filename = url.split('/')[-1]
    target = os.path.join(str(todir), local_filename)
    # The URL is opened first, so a failed request leaves no empty file behind;
    # both handles are closed even if the copy is interrupted.
    with urllib.request.urlopen(url) as response, open(target, 'wb') as f:
        shutil.copyfileobj(response, f, 512 * 1024)
    
from datetime import datetime
def age_in_years(upload_date):
    """Return the age of an upload in years, as a float.

    ``upload_date`` is a ``YYYYMMDD`` string. The result is the number of
    whole days elapsed until now divided by 365, plus one.
    """
    uploaded = datetime.strptime(upload_date, "%Y%m%d")
    elapsed = datetime.now() - uploaded
    return elapsed.days / 365 + 1

In [None]:
# erase the sql database (if already present inside ../working directory) if you want this script to reflect 
# the latest data
def initialize_db():
    """Create the ``video_info`` table through the module-level ``db`` if it is missing."""
    create_stmt = (
        'CREATE TABLE IF NOT EXISTS video_info ('
        'yt_id TEXT UNIQUE, zim_size INTEGER, view_count INTEGER, age INTEGER, '
        'views_per_year INTEGER, upload_date TEXT, duration TEXT, '
        'height INTEGER, width INTEGER,'
        'bit_rate TEXT, format TEXT, '
        'average_rating REAL,slug TEXT,title TEXT)'
    )
    db.c.execute(create_stmt)
    
def _video_record(yt_id):
    """Build one ``video_info`` row for ``yt_id``, as a list in table column order.

    Every field starts from a default, so a video with missing metadata or a
    missing media file still yields a complete row instead of reusing values
    left over from the previously processed video.
    """
    # --- data from assets/data.js -------------------------------------
    zim_data = get_zim_data(yt_id) or {}
    if len(zim_data) == 0:
        print('get_zim_data returned no data for %s'%yt_id)
    slug = zim_data.get('slug', '')

    # --- cached YouTube metadata --------------------------------------
    json_path = str(CACHE_DIR) + "/video_json/" + yt_id + '.json'
    data = get_video_json(json_path) if os.path.isfile(json_path) else {}
    if not isinstance(data, dict):
        # e.g. None, from the JSON null cached for a failed lookup
        data = {}
    if len(data) == 0:
        print('get_video_json returned no data for %s'%yt_id)
    # ``or`` also replaces keys that are present but hold None
    vsize = data.get('filesize') or 0
    view_count = data.get('view_count') or 0
    upload_date = data.get('upload_date') or ''
    average_rating = data.get('average_rating') or 0
    title = data.get('title') or 'unknown title'

    # --- views per year since upload ----------------------------------
    age = 0
    views_per_year = 1
    if upload_date:
        try:
            # never below 1, so the division cannot hit zero
            age = max(1, round(age_in_years(upload_date)))
            views_per_year = int(view_count / age)
        except (ValueError, TypeError):
            print('unusable upload_date %r for %s' % (upload_date, yt_id))

    # --- interrogate the video file itself ----------------------------
    duration = "not found"
    bit_rate = ""
    v_format = ""
    v_height = 0
    v_width = 0
    filename = str(PROJECT_DIR) + '/videos/' + yt_id + '/video.webm'
    if os.path.isfile(filename):
        vsize = os.path.getsize(filename)
        try:
            selected_data = select_info(filename)
        except (KeyError, IndexError) as err:
            print('select_info could not read %s: %r' % (filename, err))
            selected_data = {}
        if len(selected_data) > 0:
            duration = selected_data.get('time', duration)
            bit_rate = selected_data.get('bit_rate', bit_rate)
            v_format = selected_data.get('v_format', v_format)
            v_height = selected_data.get('v_height', v_height)
            v_width = selected_data.get('v_width', v_width)

    # column order: yt_id, zim_size, view_count, age, views_per_year, upload_date,
    #               duration, height, width, bit_rate, format, average_rating, slug, title
    return [yt_id, vsize, view_count, age, views_per_year, upload_date,
            duration, v_height, v_width, bit_rate, v_format, average_rating, slug, title]


print('Creating/Updating a Sqlite database with information about the Videos in this ZIM.')
db = Sqlite(str(WORKING_DIR) + '/zim_video_info.sqlite')
initialize_db()
sql = 'select count(*) as num from video_info'
db.c.execute(sql)
row = db.c.fetchone()
if row[0] == len(yt_id_list):
    print('skipping update of sqlite database. Number of records equals number of videos')
else:
    sql = 'INSERT OR REPLACE INTO video_info VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
    for yt_id in yt_id_list:
        db.c.execute(sql, _video_record(yt_id))
    db.conn.commit()
    print('%s video records written to the database' % len(yt_id_list))

In [None]:
# this will output a table of all the videos sorted by views per year
import pandas as pd
from IPython.display import display 
# NOTE(review): at module level a `global` statement has no effect; tot_sum is
# an ordinary module variable that is assigned further down in this cell.
global tot_sum

def human_readable(num):
    """Format ``num`` with roughly three significant digits and a unit suffix.

    The value is divided by 1024 for each step up through the suffixes
    '', 'K', 'M', 'G'. Values too large for the 'G' range are still reported
    in 'G' (rounded to a whole number) instead of falling off the end of the
    loop and returning None. ``None`` is treated as 0.

    Returns:
        str such as ``'5.00'``, ``'50.0'``, ``'500'``, ``'1.46K'`` or ``'954M'``.
    """
    num = float(num or 0)
    units = [ '','K','M','G']
    last = len(units) - 1
    for i, unit in enumerate(units):
        if num < 10.0:
            return "%.2f%s"%(num,unit)
        if num < 100.0:
            return "%.1f%s"%(num,unit)
        # the largest unit absorbs everything that is left
        if num < 1000.0 or i == last:
            return "%.0f%s"%(num,unit)
        num /= 1024.0

# List every video ordered by views per year (most viewed first) with a running
# total of the sizes, and remember the views_per_year value of the row at which
# the running total first exceeds TARGET_SIZE.
# The table is queried and walked once; it used to be fetched and iterated twice.
sql = 'select slug,zim_size,views_per_year,view_count,duration,upload_date,'\
       'format,width,height,bit_rate from video_info order by views_per_year desc'
db.c.execute(sql)
rows = db.c.fetchall()

row_fmt = '%60s %6s %6s %6s %6s %8s %8s'
print(row_fmt%('Name','Size','Sum','Views','Views','Date  ','Duration'))
print(row_fmt%('','','','','/ yr','',''))

tot_sum = 0
row_list = []   # same cells as printed plus the bit rate (for an optional DataFrame view)
boundary_views_per_year = 0
for row in rows:
    # NULL columns are treated as 0 / '' so one incomplete record cannot abort the report
    size = row['zim_size'] or 0
    tot_sum += size
    cells = [(row['slug'] or '')[:60],human_readable(size),
             human_readable(tot_sum),human_readable(row['view_count'] or 0),
             human_readable(row['views_per_year'] or 0),
             row['upload_date'],row['duration']]
    print(row_fmt%tuple(cells))
    row_list.append(cells + [row['bit_rate']])
    # 0 doubles as "not found yet"
    if tot_sum > TARGET_SIZE and boundary_views_per_year == 0:
        boundary_views_per_year = row['views_per_year']
#df = pd.read_sql(sql,db.conn)
#df = pd.DataFrame(row_list,columns=['Name','Size','Sum','Views','Views','Date','Duration','Bit Rate'])
#display(df)

In [None]:
import csv

# Select the videos whose views_per_year is strictly above the boundary found
# by the report cell, remember their ids and record them in wanted_list.csv.
print('We will include videos with views_per_year greater than %s'%boundary_views_per_year)
wanted_ids = []
sql = 'SELECT yt_id, title from video_info where views_per_year > ?'
db.c.execute(sql,[boundary_views_per_year,])
rows = db.c.fetchall()

# csv.writer quotes titles that contain commas, quotes or newlines, so every
# line stays a valid two-column record. newline='' is what the csv module
# expects from the file object; '\n' keeps the previous line ending.
with open(str(NEW_ZIM_DIR) + '/wanted_list.csv','w',newline='',encoding='utf-8') as fp:
    writer = csv.writer(fp, lineterminator='\n')
    for row in rows:
        writer.writerow([row['yt_id'],row['title']])
        wanted_ids.append(row['yt_id'])
        print(row['yt_id'])
  #  with open(HOME + '/zimtest/' + PROJECT_NAME + '/wanted_list.csv','w') as fp:
#    for row in rows:
#        fp.write('%s,%s\n'%(row['yt_id'],row['title'],))

In [None]:
import shutil
# Refresh the shared top-level folders in OUTPUT_DIR from PROJECT_DIR.
# (Per the original note, these were in the zim's "-" directory in the old zim format.)
print('Copying wanted folders and Videos to %s'%OUTPUT_DIR)
cpy_dirs = ['assets','cache','channels']
for d in cpy_dirs:
    src = os.path.join(PROJECT_DIR,d)
    dest = os.path.join(OUTPUT_DIR,d)
    # start from an empty destination folder each time
    shutil.rmtree(dest,ignore_errors=True)
    os.makedirs(dest)
    shutil.copytree(src,dest,dirs_exist_ok=True, symlinks=True)
    print(dest)

In [None]:
# Copy the videos selected by the wanted_ids list to output file
import shutil
for f in wanted_ids:
    dest = os.path.join(OUTPUT_DIR,'videos',f)
    # a video folder that already exists in the output tree is left untouched
    if os.path.isdir(dest):
        continue
    os.makedirs(dest)
    src = os.path.join(PROJECT_DIR,'videos',f)
    shutil.copytree(src,dest,dirs_exist_ok=True)

In [None]:
#  Copy the files in the root directory
import shutil
for yt_id in wanted_ids:
    map_index_to_slug = get_zim_data(yt_id)
    # ids without an entry in the category mapping have no page to copy
    if len(map_index_to_slug) == 0:
        continue
    title = map_index_to_slug['slug']
    src = os.path.join(PROJECT_DIR,title + '.html')
    dest = str(OUTPUT_DIR) + '/' + title + '.html'
    if not os.path.isfile(src) or os.path.isfile(dest):
        # reported both when the source page is missing and when the
        # destination already exists
        print('src:%s'%src)
    else:
        shutil.copyfile(src,dest)

In [None]:
import shutil

# Essential top-level files that must be present in the new zim.
needed = ['/favicon.jpg','/home.html','/profile.jpg','/banner.jpg','/metadata.json']
for f in needed:
    src = str(PROJECT_DIR) + f
    if os.path.isfile(src):
        # shutil.copy replaces the shell-built "/bin/cp ..." string, so paths
        # containing spaces are handled correctly.
        shutil.copy(src, str(OUTPUT_DIR))
    else:
        # Still best effort (the loop continues), but the gap is now explicit.
        print('Needed file is missing from the source tree: %s' % src)

In [None]:
# Rewrite assets/data.js in the output tree so that every category lists only
# the videos whose id is in wanted_ids.
print('Creating a new mapping from Categories to videos within each category.')
sections = []
for cat, videos in zim_category_js.items():
    kept = [json.dumps(entry,indent=1) for entry in videos
            if entry.get('id','') in wanted_ids]
    opening = 'var json_%s = [\n'%cat
    if kept:
        sections.append(opening + ','.join(kept) + '];\n')
    else:
        # with nothing kept, the newline after '[' is dropped: 'var json_X = [];'
        sections.append(opening[:-1] + '];\n')
outstr = ''.join(sections)
with open(str(OUTPUT_DIR) + '/assets/data.js','w') as fp:
    fp.write(outstr)
    


In [None]:
# install zimscraperlib (pip3 install zimscraperlib) and libzim library before running this
import os
from pathlib import Path
from zimscraperlib.zim import make_zim_file
from glob import glob
from datetime import datetime

print('Creating a new ZIM and Indexing it')

fname = "0223-crashcourse_en_top1g_2023-02.zim"
print('fname:%s'%fname)
zim_fpath = NEW_ZIM_DIR / fname

# NOTE: this changes the working directory of the whole kernel.
os.chdir(OUTPUT_DIR)
if zim_fpath.is_file():
    # Say so explicitly; silently doing nothing looked like a successful build.
    print('%s already exists; delete it to build a new one' % zim_fpath)
else:
    make_zim_file(
        build_dir=OUTPUT_DIR,
        fpath=zim_fpath,
        name=fname,
        main_page="home.html",
        favicon="favicon.jpg",
        title="CrashCourse",
        description="CrashCourse top videos (1GB)",
        # NOTE(review): ZIM metadata conventionally uses ISO-639-3 codes ("eng");
        # left unchanged here -- confirm what the installed zimscraperlib expects.
        language="en",
        creator="CrashCourse",
        publisher="Internet in a Box",
        # tags is a sequence of tag strings; a bare "test" string would be
        # iterated character by character.
        tags=["test"],
        scraper="zimscraperlib",
    )