In [1]:
# -*- coding: utf-8 -*-
import sys; print('Python %s on %s' % (sys.version, sys.platform))
import os
import time
import json
import pandas as pd
import numpy as np
from glob import glob, iglob
from tqdm import tqdm
from datetime import datetime
import matplotlib.pyplot as plt

Python 3.8.10 (default, Mar 13 2023, 10:26:41) 
[GCC 9.4.0] on linux


## Prepare the library and category settings

In [2]:
# Category label -> (markdown folder, URL-lookup CSV).
# Both names of a pair are built from the same export stem, so only the
# (label, stem) pairs are listed; insertion order is preserved.
dict_category = {
    label: (f'{stem}_md', f'{stem}_url.csv')
    for label, stem in [
        ('general', 'ede'),
        ('cop', 'ICE DE CoP'),
        ('development', 'ICE DE Development'),
        ('support', 'ICE DE Support Model'),
        ('incident', 'Incidents and Lessons Learned'),
    ]
}

In [3]:
import utils
import chunk
from chunk import MAX_TOKEN

# Dataset snapshot to process: used as the folder name under ./dataset/ and
# written into every output record as its "library" value.
LIBRARY = 'ede-20230605'
# CATEGORY = 'ede_md'

In [4]:
# Build one metadata record per text chunk, across every category.
#
# For each category this cell:
#   1. collects the markdown files below the category folder,
#   2. loads the title -> URL lookup from the category's CSV,
#   3. splits each file into chunks and appends one dict per chunk to `results`.
results = []

# Loop-invariant merge threshold: 80% of the project-wide MAX_TOKEN.
max_chunk_tokens = int(MAX_TOKEN * 0.8)

for category, values in dict_category.items():
    DATASET_FOLDER = f'./dataset/{LIBRARY}/{values[0]}'
    # Keep an explicit '/' between the folder and the recursive wildcard.
    # Written as '<folder>**/...', the '**' is not a whole path component and
    # glob treats it like '*', so sibling folders that merely share the prefix
    # (e.g. '<folder>_backup') would be scanned too.
    data_source = sorted(iglob(f'{DATASET_FOLDER}/**/*.md', recursive=True))

    df_link = pd.read_csv(f'./dataset/{LIBRARY}/{values[1]}', index_col='title')
    # title (CSV index) -> lower-cased URL
    dict_link = {title: url.lower() for title, url in df_link['url'].items()}
    print(category, values, len(data_source), len(dict_link))

    for file_path in tqdm(data_source):
        try:
            plain_text = utils.preprocess_markdown_file(file_path)
            file_name = os.path.basename(file_path)
            # A file without an entry in the URL CSV raises KeyError here; the
            # handler below prints the offending path before re-raising.
            external_link = dict_link[file_name]

            # Project helpers (behaviour assumed from their names -- confirm in
            # chunk.py): split on headings, then merge short pieces up to the
            # token budget.
            contents = list(chunk.recursive_split_by_heading(plain_text))
            contents = chunk.merge_short_paragraphs(contents, max_token=max_chunk_tokens, direction="bottom-up")
            # Drop chunks that are empty or whitespace-only.
            contents = [x for x in contents if len(x.strip()) > 0]

            for idx, content in enumerate(contents):
                results.append({
                    "library": LIBRARY,
                    "category": category,
                    "checksum": utils.generate_checksum(file_path + content),
                    "file_name": file_name,
                    "file_path": file_path,
                    "external_link": external_link,
                    "chapter": idx+1,  # 1-based position of the chunk within its file
                    "text_length": len(content),
                    "token_count": utils.num_tokens_from_string(content),
                    "content": content
                })
        except Exception:
            # Report which file failed, then let the error propagate so that
            # the run stops instead of silently producing partial data.
            print(file_path)
            raise

general ('ede_md', 'ede_url.csv') 152 152


  return BeautifulSoup(text, 'html.parser').get_text()
100%|██████████| 152/152 [00:03<00:00, 39.83it/s]


cop ('ICE DE CoP_md', 'ICE DE CoP_url.csv') 13 13


100%|██████████| 13/13 [00:00<00:00, 64.67it/s]


development ('ICE DE Development_md', 'ICE DE Development_url.csv') 59 59


100%|██████████| 59/59 [00:01<00:00, 47.72it/s]


support ('ICE DE Support Model_md', 'ICE DE Support Model_url.csv') 6 6


100%|██████████| 6/6 [00:00<00:00, 27.77it/s]


incident ('Incidents and Lessons Learned_md', 'Incidents and Lessons Learned_url.csv') 28 28


100%|██████████| 28/28 [00:00<00:00, 44.28it/s]


In [5]:
# Turn the collected records into a table indexed by chunk checksum,
# with the index axis labelled 'idx'.
df_meta = (
    pd.DataFrame(results)
    .set_index('checksum')
    .rename_axis('idx')
)
# Copy of the table ordered by file path, Z -> A.
df_sorted = df_meta.sort_values(by='file_path', ascending=False)

In [6]:
# Preview the first five rows (head() defaults to n=5).
df_meta.head()

Unnamed: 0_level_0,library,category,file_name,file_path,external_link,chapter,text_length,token_count,content
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5321df80aa9ea2d59e5f8f24b7d47018c0e0ea05a84b28287ed13096577ffc1f,ede-20230605,general,about.md,./dataset/ede-20230605/ede_md/about.md,https://docs.live.eureka.tools.sap/sre/about,1,607,123,# SITE RELIABILITY ENGINEERING(SRE)\n \n!!! wa...
fd4123b6c6da924cb5dfa0d4a8450b29d31c8e7f53c1cb9416408efe7af1f385,ede-20230605,general,access-acr.md,./dataset/ede-20230605/ede_md/access-acr.md,https://docs.live.eureka.tools.sap/how-to/acce...,1,1555,427,# Accessing ACR\n\nWe are now using ACR(Azure ...
3b067677bca0eab603ee31ff3a9b482e9c2614fb8bea8ecc2d967eb715135032,ede-20230605,general,add-a-secret.md,./dataset/ede-20230605/ede_md/add-a-secret.md,https://docs.live.eureka.tools.sap/how-to/sett...,1,1658,427,# Adding a Secret to the CI Pipeline\n\nIf you...
7e01977015a7c6be55b97ac69db22247fe724bd2eaa4e2fe3340e9865d756092,ede-20230605,general,admission-webhook.md,./dataset/ede-20230605/ede_md/admission-webhoo...,https://docs.live.eureka.tools.sap/how-to/admi...,1,2378,505,# Admission Webhook Best Practices and Warning...
efcacb67ddb42bd11166dfcad68b92ec7261e2d7e107a6e04bf5b16e3e532a5f,ede-20230605,general,admission-webhook.md,./dataset/ede-20230605/ede_md/admission-webhoo...,https://docs.live.eureka.tools.sap/how-to/admi...,2,2519,506,## Availability\n\nIt is recommended that Admi...


In [7]:
# Summary statistics for the numeric columns of the chunk table.
df_meta.describe()

Unnamed: 0,chapter,text_length,token_count
count,778.0,778.0,778.0
mean,3.330334,1480.877892,366.167095
std,3.309847,881.059088,195.656537
min,1.0,13.0,4.0
25%,1.0,963.0,241.0
50%,2.0,1395.0,354.5
75%,4.0,1808.0,455.75
max,27.0,8136.0,1071.0


In [8]:
# Total number of tokens across all chunks.
df_meta.token_count.sum()

284878

In [9]:
# Total number of characters across all chunks.
df_meta.text_length.sum()

1152123

In [10]:
# Count how often each index value (the chunk checksum) occurs;
# any count above 1 would mean two chunks share a checksum.
df_meta.index.value_counts()

idx
5321df80aa9ea2d59e5f8f24b7d47018c0e0ea05a84b28287ed13096577ffc1f    1
354cf2e85c818225608bd25ac77605d3f682355c2a3e8e672b4f93abdf786024    1
a2d18eff7babe27f2e231416db57d5f130f625f1020aa5d4b7403b32b1380db8    1
9f10076479ff20b6025e1c8c74fd3aa2dcf10aa78790eb945548b6b706683d64    1
b6ede053e10ebd7a94cbde11307021275f92d6659d83b993b657fccb191a2771    1
                                                                   ..
4b3a3181e7d8c352b1b2fdb287086463beeb8822e7b96beee106d48993f34886    1
b8687f9c2aecb9b65ab4f3d594d105ea0fb8d291a21461cd99a570dd0650db54    1
7d71f8b0d45198ee406413b2e8984ed940f8a85b3671d2049bff934875be7843    1
ac143e4e8c4b8dbd37c8fde7edf1de8f2ef0f76a8597f3387422d62e7375fb55    1
713f6cdf257f88c6b3c3a669bb3968583e6497b5cc4e58f1043ed5586b853e55    1
Name: count, Length: 778, dtype: int64

In [11]:
# Chunks above 1000 tokens: select once, then show the shape and the rows.
long_chunks = df_meta.loc[df_meta['token_count'] > 1000]
print(long_chunks.shape)
long_chunks

(15, 9)


Unnamed: 0_level_0,library,category,file_name,file_path,external_link,chapter,text_length,token_count,content
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
94e9753c0e6f0c2573fc4f0b9b149a309bee6c64030851c3a2aee77d0f84d8a4,ede-20230605,general,register-kubernetes-clusters-to-bom.md,./dataset/ede-20230605/ede_md/register-kuberne...,https://docs.live.eureka.tools.sap/getting-sta...,2,4444,1016,## Procedure Registering Your Clusters to BoM...
3aa7ffce57df6e83efea0a7836c6293a13bf0dd4f5b2e140d2f659c4aeff4c48,ede-20230605,general,whats-new.md,./dataset/ede-20230605/ede_md/whats-new.md,https://docs.live.eureka.tools.sap/whats-new,1,5286,1023,# What's New This topic lists the new featur...
3e05c4932ce2476f961f2a5488c7d2c4f2d10248dbb25b6606b0620123d8a5fe,ede-20230605,general,whats-new.md,./dataset/ede-20230605/ede_md/whats-new.md,https://docs.live.eureka.tools.sap/whats-new,2,5218,1011,| 2022-12-13 | Developer |...
fc0bdeb9f46c25717ecaffcc7e5cff7cb75028d1e6f317db4eef2f834f12c85e,ede-20230605,general,whats-new.md,./dataset/ede-20230605/ede_md/whats-new.md,https://docs.live.eureka.tools.sap/whats-new,3,7783,1025,Developers can consume non-built-in datastores...
630ba6bf1bc0e69cbe1ff2eaa669e2c0ce57519e890816e99b437c95c22754b4,ede-20230605,general,whats-new.md,./dataset/ede-20230605/ede_md/whats-new.md,https://docs.live.eureka.tools.sap/whats-new,4,8136,1024,point.com/:v:/r/teams/S4HANALabs-Eureka/Shared...
6cccf0d3826bfb16bddf54288a453213557a7fd146a447aadba24b8cb8a7f16f,ede-20230605,development,ICE DE Development.md,./dataset/ede-20230605/ICE DE Development_md/I...,https://wiki.one.int.sap/wiki/display/eureka/i...,1,3338,1014,* [ICE DE Service Directory](/wiki/display/Eur...
2f11cac9d864ad583cbb4202a895d1d04d69c91a27b9731aff9d53805e08b5d9,ede-20230605,development,ICE DE Service Directory.md,./dataset/ede-20230605/ICE DE Development_md/I...,https://wiki.one.int.sap/wiki/display/eureka/i...,1,3148,1032,Category| Summary| Purpose/Component| Team| Ma...
0cc08483703a8fc4c0da0a36a59a5d73e620cf47c2a876b163ab59c550739d95,ede-20230605,development,ICE DE Service Directory.md,./dataset/ede-20230605/ICE DE Development_md/I...,https://wiki.one.int.sap/wiki/display/eureka/i...,2,2969,1003,| Meta| [ailsa.wu@sap.com](mailto:ailsa.wu@sap...
270f159cd9e99fa7867970a6eeb5d3da9c0868adadc2b1e98f718532ae12c1bd,ede-20230605,development,"Tools, Monitoring, Runbook.md",./dataset/ede-20230605/ICE DE Development_md/T...,https://wiki.one.int.sap/wiki/display/eureka/t...,2,3864,1008,#### Consumer Lag dashboard Browse → daas →[ ...
9b3e43b54a27f5f95cd87b9108183182f5c9a5ae08f7b9e294b6ba45b028e3ec,ede-20230605,development,"Tools, Monitoring, Runbook.md",./dataset/ede-20230605/ICE DE Development_md/T...,https://wiki.one.int.sap/wiki/display/eureka/t...,4,2641,1012,### Loki Queries The following queries will d...


In [12]:
# Chunks below 30 tokens: select once, then show the shape and the rows.
short_chunks = df_meta.loc[df_meta['token_count'] < 30]
print(short_chunks.shape)
short_chunks

(9, 9)


Unnamed: 0_level_0,library,category,file_name,file_path,external_link,chapter,text_length,token_count,content
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
48de358c309644289629136837f61df32b92757695d1e3533235ac02576287fa,ede-20230605,general,bom-runbook.md,./dataset/ede-20230605/ede_md/bom-runbook.md,https://docs.live.eureka.tools.sap/runbook/bom...,1,103,24,# BoM Runbook\n\nThe BoM is a tool for managin...
0a9adfbabc0a07641a9a635eee2de1473537d7f7494bd3c48e108b0f09718de0,ede-20230605,general,daas-runbook.md,./dataset/ede-20230605/ede_md/daas-runbook.md,https://docs.live.eureka.tools.sap/runbook/daa...,5,75,24,## More Information\n- [Working with DaaS 1.0]...
f7a4c306a8d9ce1c5fe287bd33351c5653d9e8a9639c3de1a174676e1d0204c3,ede-20230605,general,day-one-2-backend-developers.md,./dataset/ede-20230605/ede_md/day-one-2-backen...,https://docs.live.eureka.tools.sap/getting-sta...,6,123,28,## More Information\n\nFor more information ab...
c7fcfa937d6d7800c00c0c5821110361d5ab2c1b7bb2212da355056ecfc9a787,ede-20230605,general,offboarding.md,./dataset/ede-20230605/ede_md/offboarding.md,https://docs.live.eureka.tools.sap/getting-sta...,1,14,4,# Offboarding\n
ed5257770fd50e72bcf322fae46250403a379f8fd246f074ba5d7b0375d37200,ede-20230605,general,work-with-pact.md,./dataset/ede-20230605/ede_md/work-with-pact.md,https://docs.live.eureka.tools.sap/how-to/work...,1,20,5,# Working with Pact\n
342a366ae926ca214b6e8202eae0589b866efaf60f7ca71dfe15a6be8aac46a6,ede-20230605,development,ICE DE Service Directory.md,./dataset/ede-20230605/ICE DE Development_md/I...,https://wiki.one.int.sap/wiki/display/eureka/i...,3,76,22,Note: DI services can be found [here](/wiki/di...
b555ea515a49ad89f61b613c838cd156c918b665ae6d01bf002520c5ee6c9132,ede-20230605,development,Knative Overview.md,./dataset/ede-20230605/ICE DE Development_md/K...,https://wiki.one.int.sap/wiki/display/eureka/k...,1,107,22,\n\nKnative is an Open-Source Enterprise-lev...
16b6f55865955c86c3bbf0606693b5f1bc5bb03ed8e819d82fd7fce19e3be913,ede-20230605,development,Monitoring.md,./dataset/ede-20230605/ICE DE Development_md/M...,https://wiki.one.int.sap/wiki/display/eureka/m...,2,23,5,## Helpful Dashboards\n\n
e615f8177417ac4a2990f4b0f31209514ee6f6e7765438c5f8a2ee4a6f0e67c7,ede-20230605,development,Sealedsecret (3rd-party).md,./dataset/ede-20230605/ICE DE Development_md/S...,https://wiki.one.int.sap/wiki/pages/viewpage.a...,3,13,8,## 4.3. How\n\n


In [13]:
# Keep only chunks with more than 20 tokens (drops those with 20 or fewer).
df_meta = df_meta.loc[df_meta['token_count'].gt(20)]

In [14]:
# Occurrences of each index value in the current df_meta; the reported
# length is the number of distinct index values.
df_meta.index.value_counts()

idx
5321df80aa9ea2d59e5f8f24b7d47018c0e0ea05a84b28287ed13096577ffc1f    1
3a2ed2f3dcc6c1f50293872e2f870a934a07ebe6472a4bc44b5b37466ebedacb    1
a2d18eff7babe27f2e231416db57d5f130f625f1020aa5d4b7403b32b1380db8    1
9f10076479ff20b6025e1c8c74fd3aa2dcf10aa78790eb945548b6b706683d64    1
b6ede053e10ebd7a94cbde11307021275f92d6659d83b993b657fccb191a2771    1
                                                                   ..
c02c7df13d6a7978bf70aaf513a3a40677998d064c53fb6f6556b35339ab2ac4    1
4b3a3181e7d8c352b1b2fdb287086463beeb8822e7b96beee106d48993f34886    1
b8687f9c2aecb9b65ab4f3d594d105ea0fb8d291a21461cd99a570dd0650db54    1
7d71f8b0d45198ee406413b2e8984ed940f8a85b3671d2049bff934875be7843    1
713f6cdf257f88c6b3c3a669bb3968583e6497b5cc4e58f1043ed5586b853e55    1
Name: count, Length: 774, dtype: int64

In [15]:
# Distribution of token counts in the current df_meta.
df_meta['token_count'].describe()

count     774.000000
mean      368.031008
std       194.429837
min        22.000000
25%       242.250000
50%       355.500000
75%       456.000000
max      1071.000000
Name: token_count, dtype: float64

In [16]:
# Persist the chunk metadata table.
# - Create the output folder first so a fresh checkout does not fail on write.
# - `index` is documented as a bool (or None); the previous string value only
#   worked because it was truthy. True states the intent: keep the 'idx'
#   (checksum) index in the file.
os.makedirs('./output', exist_ok=True)
df_meta.to_parquet('./output/meta.parquet', index=True)