In [4]:
import json
import re

import pandas as pd
import pymupdf4llm

In [None]:
# Convert the UG rules PDF to markdown text; the cells below split `text` into lines and parse it.
text = pymupdf4llm.to_markdown("ug_rules.pdf")

In [None]:
# Tidy the UG rules text: discard blank lines and the repeated page header line.
header = '**Undergraduate Programme Rules** Courses of Study 2024-2025'
lines = [
    row for row in text.split('\n')
    if row.strip() not in ('', header)
]
lines = lines[58:] # skip the contents and intro part

# Locate numbered headings (lines that begin with "**" followed by a dotted number)
# and record (line index, section number) for each one.
header_regex = r"^\*\*(\.?\d+)+"
heading_hits = ((pos, re.match(header_regex, row)) for pos, row in enumerate(lines))
# group(0) still carries the leading "**", so drop those two characters.
sections = [(pos, hit.group(0)[2:]) for pos, hit in heading_hits if hit]
sections

[(0, '1'),
 (1, '1.1'),
 (2, '1.1.1'),
 (19, '1.1.2'),
 (29, '1.1.3'),
 (36, '1.2'),
 (37, '1.2.1'),
 (51, '1.2.2'),
 (62, '1.2.3'),
 (110, '1.2.4'),
 (123, '1.3'),
 (206, '1.4'),
 (230, '1.5'),
 (262, '1.6'),
 (312, '1.7'),
 (399, '1.8'),
 (430, '1.9'),
 (473, '1.10'),
 (494, '1.11'),
 (505, '1.12'),
 (510, '1.13'),
 (514, '1.14'),
 (1964, '3.1'),
 (2008, '3.2'),
 (2026, '3.3'),
 (2033, '3.4'),
 (2166, '3.5'),
 (2220, '3.6'),
 (2253, '3.6.1'),
 (2276, '3.6.2'),
 (2277, '3.6.2.1'),
 (2286, '3.6.2.2'),
 (2301, '3.6.2.3'),
 (2313, '3.6.2.4'),
 (2326, '3.6.2.5'),
 (2346, '3.6.2.6'),
 (2376, '3.6.3'),
 (2407, '3.6.3.1'),
 (2500, '3.7')]

In [None]:
# Build a flat dict keyed by section number. Each section's content is every line
# between its heading and the next heading (or the end of the document for the last one).
section_starts = [start for start, _ in sections]
section_ends = section_starts[1:] + [len(lines)]
parsed = {
    sec: {
        'section': sec,
        'title': lines[idx],
        'content': '\n'.join(lines[idx + 1:end]),
    }
    for (idx, sec), end in zip(sections, section_ends)
}

39

In [25]:
# create a reusable modular function from all above code
def parse_pdf_rules_to_dict(text, header_text, start_line=0):
    """Split the markdown text of a rules document into numbered sections.

    Args:
        text: Markdown text of the document.
        header_text: Repeated page-header line. Any line equal to it (after
            stripping surrounding whitespace) is dropped.
        start_line: Number of cleaned lines to skip at the top, e.g. the
            contents and intro part. Counted after blank and header lines
            have been removed.

    Returns:
        A dict keyed by section number (e.g. ``'1.2.3'``) in document order.
        Each value is a dict with:
            'title': the heading line as it appears in the document,
            'content': the lines up to the next heading, joined with newlines,
            'subsections': section numbers of the children filed under it.
        A section is filed under its nearest ancestor that has already been
        parsed (``'1.1.1'`` goes under ``'1.1'``; if ``'1.1'`` has no heading
        of its own it falls back to ``'1'``).
    """
    # clean up doc: strip whitespace, drop blank lines and the page header
    stripped_lines = (raw.strip() for raw in text.split('\n'))
    lines = [line for line in stripped_lines if line != '' and line != header_text]
    lines = lines[start_line:]  # skip the contents and intro part

    # A heading is optional '#' marks, an optional space, then '**' followed
    # by a dotted number; group 1 is that number.
    header_regex = re.compile(r"\#* ?\*\*((\.?\d+)+).*")
    sections = []
    for i, line in enumerate(lines):
        match = header_regex.match(line)
        if match:
            sections.append((i, match.group(1)))

    parsed = {}
    for i, (idx, sec) in enumerate(sections):
        # content runs up to the next heading, or to the end for the last section
        next_idx = sections[i + 1][0] if i + 1 < len(sections) else len(lines)
        data = {
            'title': lines[idx],
            'content': '\n'.join(lines[(idx + 1):next_idx]),
            'subsections': [],
        }

        # Link to the parent: try the longest prefix first so the section is
        # attached to its direct parent, not to the top-level ancestor.
        parts = sec.split('.')
        for j in range(len(parts) - 1, 0, -1):
            parent_sec = '.'.join(parts[:j])
            if parent_sec in parsed:
                parsed[parent_sec]['subsections'].append(sec)
                break

        parsed[sec] = data
    return parsed

In [23]:
# Dump each rules PDF to markdown/<name>.md, keeping only stripped, non-blank lines.
sources = ['cos_24_rules.pdf', 'pg_rules.pdf', 'ug_rules.pdf']
for source in sources:
    print('Processing', source)
    markdown_path = 'markdown/' + source.replace('.pdf', '.md')
    text = pymupdf4llm.to_markdown(source)
    lines = [row for row in map(str.strip, text.split('\n')) if row != '']
    with open(markdown_path, 'w+') as f:
        f.write('\n'.join(lines))

Processing cos_24_rules.pdf
Processing pg_rules.pdf
Processing ug_rules.pdf


In [26]:
# Parse each markdown dump into a section dict and store it as processed/<name>.json.
# The three lists (`sources`, `starter_lines`, `headers`) are consumed in parallel.
#
# NOTE(review): `sources` is ordered cos / pg / ug, but `headers` is ordered
# common / Undergraduate / Postgraduate, so zip() hands pg_rules.pdf the UG header and
# ug_rules.pdf the PG header. That looks swapped; confirm. The start offsets may have
# been tuned with this pairing (an unmatched header line stays in the text and counts
# toward the offset), so re-check 47 and 59 against the markdown when fixing the order.
starter_lines = [66, 47, 59]
headers = [
    'Courses of Study 2024-2025 **common rules**',
    '**Undergraduate Programme Rules** Courses of Study 2024-2025',
    '**Postgraduate Programme Rules** Courses of Study 2024-2025',
]
for source, start, header in zip(sources, starter_lines, headers):
    print('Processing', source)
    markdown_path = 'markdown/' + source.replace('.pdf', '.md')
    json_path = 'processed/' + source.replace('.pdf', '.json')

    with open(markdown_path, 'r') as f:
        text = f.read()

    processed = parse_pdf_rules_to_dict(text, header, start)
    with open(json_path, 'w+') as f:
        json.dump(processed, f, indent=4)

Processing cos_24_rules.pdf
Processing pg_rules.pdf
Processing ug_rules.pdf


In [29]:
# Converts the processed rule JSON files to JSONL: one record per section, written to
# jsonl/<name>.jsonl with the cleaned title, the content, the source PDF URL and the
# JSON file name.
#
# `files` and `urls` are parallel lists: urls[i] must be the PDF that files[i] was
# parsed from. Keep the two in the same order (general, UG, PG).
files = [
    'processed/cos_24_rules.json',
    'processed/ug_rules.json',
    'processed/pg_rules.json'
]
urls = [
    'https://home.iitd.ac.in/uploads/General%20Information/CoS%202024__General%20Rules%20(1).pdf',
    'https://home.iitd.ac.in/uploads/UG%20Programme%20Rules/CoS%202024__UG%20Programme%20Rules__changed.pdf',
    'https://home.iitd.ac.in/uploads/PG%20Programme%20Rules/CoS%202024__PG%20Programme%20Rules%20Changed.pdf'
]
for fname, url in zip(files, urls):
    base_name = fname.split('/')[-1]

    # Read the input first so a missing/invalid JSON file does not leave an
    # empty .jsonl behind.
    with open(fname, 'r') as in_f:
        data = json.load(in_f)

    with open(f'jsonl/{base_name.replace(".json", ".jsonl")}', 'w') as out_f:
        for v in data.values():
            record = {
                # titles are markdown headings; drop the '*' and '#' markup
                'section': v['title'].replace('*', '').replace('#', ''),
                'content': v['content'],
                'url': url,
                'file': base_name
            }
            out_f.write(json.dumps(record) + '\n')

In [None]:
# Convert processed/courses.json to JSONL: one record per top-level key, with the
# value serialized back to a JSON string in 'content'.
with open('jsonl/courses.jsonl', 'w+') as out_f, open('processed/courses.json', 'r') as in_f:
    data = json.load(in_f)
    out_f.writelines(
        json.dumps({
            'section': k,
            'content': json.dumps(v),
            'url': 'https://home.iitd.ac.in/uploads/Course%20Descriptions/CoS%202024__Course%20Descriptions%20Changed.pdf',
            'file': 'courses_of_study_2024.json'
        }) + '\n'
        for k, v in data.items()
    )

In [9]:
# Parse courses_offered.csv and write it out as JSONL (one JSON object per row).
# pandas is imported in the notebook's import cell at the top.
column_names = [
    'year', 'semester', 'course_code', 'credits',
    'hours_lecture', 'hours_tutorial', 'hours_practical',
    'slot', 'name', 'instructor',
]
df = pd.read_csv('courses_offered.csv')
# Assigning `columns` (rather than passing names= to read_csv) fails loudly with a
# ValueError if the CSV does not have exactly these ten columns.
df.columns = column_names
df.to_json('jsonl/courses_offered.jsonl', orient='records', lines=True)