In [1]:
import re

import docx
import openpyxl

In [2]:
# Make the notebook echo the value of every top-level expression in a cell,
# not only the last one (IPython's default is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Open the Word document whose tables are parsed by the cells below.
# The path is relative to the notebook's working directory.
doc = docx.Document("./Urnik zimski semester 2019-20 Ljubljana.docx")

In [4]:
def get_day(table):
    """Return the day label of a timetable table.

    The label is read from the first cell of the table's second row and
    returned with every newline character removed.
    """
    day_cell = table.rows[1].cells[0]
    return "".join(day_cell.text.split("\n"))

In [5]:
def get_time_ranges(table):
    """Return the header texts of the table's time-slot columns.

    The first two cells of the first row are skipped; the text of every
    remaining cell in that row is returned in column order.
    """
    header_cells = table.rows[0].cells
    labels = []
    for header_cell in header_cells[2:]:
        labels.append(header_cell.text)
    return labels

In [6]:
def get_data_from_cell(cell):
    """Parse the text of one timetable cell into a dict describing a class.

    The first line of the cell is taken as the subject. The remaining lines
    are joined with spaces and parsed as follows:

    * everything before the first "(" is the teacher list: names are
      separated by ","; if the list contains exactly one "/", the names after
      it are prefixed with "as. ";
    * the parenthesised values give the classroom and, optionally, the time:
      with exactly one value it is the classroom; with exactly two values the
      first is the time and the second is the classroom; with any other
      non-zero count the second value is the classroom and no time is set.

    Args:
        cell: object with a ``text`` attribute (e.g. a python-docx table cell).

    Returns:
        ``None`` if the text is empty, has only one line, or has more than
        four lines. Otherwise a dict with keys ``"subject"`` (str),
        ``"teachers"`` (list of str, blank names dropped), ``"classroom"``
        (str, empty when the cell has no parenthesised value) and, only when
        a non-empty time was found, ``"time"`` (str).
    """
    # Captures the text between each "(" and the next ")".
    RE_PARAN = re.compile(r'(?<=\().*?(?=\))')

    text = cell.text
    if not text:
        return None

    text_array = text.splitlines()
    if len(text_array) == 1 or len(text_array) > 4:
        return None

    rest_of_array = " ".join(text_array[1:])

    # Teacher names: strip each name *before* adding the "as. " prefix so
    # that "A / B" yields "as. B" rather than "as.  B", and skip blank names
    # (trailing comma, or no teacher given at all).
    teacher_groups = rest_of_array.split("(")[0].split("/")
    teachers = [name.strip() for name in teacher_groups[0].split(",") if name.strip()]
    if len(teacher_groups) == 2:
        teachers += [
            "as. " + name.strip()
            for name in teacher_groups[1].split(",")
            if name.strip()
        ]

    paranthesis_values = re.findall(RE_PARAN, rest_of_array)
    if not paranthesis_values:
        # No "(...)" in the cell: keep the entry but with an empty classroom
        # instead of failing with IndexError.
        classroom = ""
    elif len(paranthesis_values) == 1:
        classroom = paranthesis_values[0]
    else:
        classroom = paranthesis_values[1]
    time = paranthesis_values[0] if len(paranthesis_values) == 2 else None

    data_from_cell = {
        "subject": text_array[0],
        "teachers": teachers,
        "classroom": classroom,
    }

    if time:
        data_from_cell["time"] = time

    return data_from_cell
        

In [7]:
def process_table(table):
    """Flatten one timetable table into row lists.

    The day comes from ``get_day(table)`` and the default time for each
    column from ``get_time_ranges(table)``. Every row after the first is
    scanned: the cell at index 0 is skipped, the cell at index 1 supplies the
    study course for the cells that follow it, and each further cell is
    parsed with ``get_data_from_cell``. Cells for which that returns a falsy
    value are ignored.

    Args:
        table: a table object exposing ``rows`` whose items expose ``cells``
            (e.g. a python-docx table).

    Returns:
        dict with two lists of
        ``[study_course, subject, day, time, classroom, teacher(s)]`` rows:

        * ``"processed"``: one row per parsed cell, with all teachers joined
          by ``", "`` in the last column;
        * ``"teachers"``: one row per (parsed cell, teacher) pair, with a
          single teacher name in the last column.
    """
    day = get_day(table)
    time_range = get_time_ranges(table)

    study_course = ''
    processed_table = []
    teachers = []

    for row in table.rows[1:]:
        for index, cell in enumerate(row.cells):
            if index == 0:
                continue
            if index == 1:
                study_course = cell.text
                continue

            data = get_data_from_cell(cell)
            if not data:
                continue

            # A time given inside the cell overrides the column header's
            # time range (columns 0 and 1 are not time slots, hence -2).
            time = data.get("time", time_range[index - 2])
            subject = data["subject"]
            classroom = data["classroom"]

            processed_table.append(
                [study_course, subject, day, time, classroom, ", ".join(data["teachers"])]
            )
            for teacher in data["teachers"]:
                teachers.append([study_course, subject, day, time, classroom, teacher])

    return {
        "processed": processed_table,
        "teachers": teachers
    }

In [19]:
# Collect the rows of every table in the document into two flat lists:
# `data` holds one row per class, `teachers` one row per (class, teacher).
data, teachers = [], []

for table in doc.tables:
    table_result = process_table(table)
    data.extend(table_result["processed"])
    teachers.extend(table_result["teachers"])

In [20]:
# Workbook with one sheet per teacher plus a summary sheet of all rows.
wb_teachers = openpyxl.Workbook()
# Workbook() always starts with one empty sheet; keep a handle on it so it
# can be dropped before saving instead of ending up in the output file.
default_sheet = wb_teachers.active

# Group the rows by teacher name (last column) in a single pass.
rows_by_teacher = {}
for row in teachers:
    rows_by_teacher.setdefault(row[-1], []).append(row)

teachers_names = set(rows_by_teacher)

# Sorted so the sheet order is the same on every run (set order is not).
for teacher in sorted(teachers_names):
    teachers_sheet = wb_teachers.create_sheet(title=teacher)
    teachers_sheet.append([teacher])
    teachers_sheet.append([])
    for row in rows_by_teacher[teacher]:
        teachers_sheet.append(row)

teachers_sheet = wb_teachers.create_sheet(title="Učitelji_vsi_predmeti")
teachers_sheet.append(["Smer", "Predmet", "Dan", "Ura", "Predavalnica", "Profesor"])
for row in teachers:
    teachers_sheet.append(row)

# The summary sheet always exists, so removing the default sheet is safe.
wb_teachers.remove(default_sheet)

wb_teachers.save(filename="URNIK–ZIMSKI SEMESTER 2019-2020 – LJUBLJANA (predavatelji).xlsx")

In [23]:
# Workbook with one sheet per study course (first column of `data`).
wb_groups = openpyxl.Workbook()
# Workbook() always starts with one empty sheet; keep a handle on it so it
# can be dropped before saving.
default_sheet = wb_groups.active

# Group the rows by study course in a single pass.
rows_by_group = {}
for row in data:
    rows_by_group.setdefault(row[0], []).append(row)

study_groups = set(rows_by_group)

# Sorted so the sheet order is the same on every run (set order is not).
for group in sorted(study_groups):
    study_groups_sheet = wb_groups.create_sheet(title=group)
    study_groups_sheet.append([group])
    study_groups_sheet.append([])
    for row in rows_by_group[group]:
        # Write the actual row; appending [] here only produced blank lines.
        study_groups_sheet.append(row)

# A workbook needs at least one sheet to be saved, so only drop the empty
# default sheet when real sheets were created.
if len(wb_groups.worksheets) > 1:
    wb_groups.remove(default_sheet)

wb_groups.save(filename="URNIK–ZIMSKI SEMESTER 2019-2020 – LJUBLJANA (smeri in letniki).xlsx")

In [29]:
# Workbook with one sheet per classroom (fifth column of `data`).
wb_rooms = openpyxl.Workbook()
# Workbook() always starts with one empty sheet; keep a handle on it so it
# can be dropped before saving.
default_sheet = wb_rooms.active

# Group the rows by classroom in a single pass.
rows_by_room = {}
for row in data:
    rows_by_room.setdefault(row[4], []).append(row)

classrooms = set(rows_by_room)

# Sorted so the sheet order is the same on every run (set order is not).
for room in sorted(classrooms):
    # Sheet titles may not contain \ / * ? : [ ] (openpyxl raises ValueError),
    # so replace each of them with "-" before shortening to five characters.
    name = "Predavalnica " + re.sub(r'[\\/*?:\[\]]', "-", room)[:5]
    classrooms_sheet = wb_rooms.create_sheet(title=name)
    classrooms_sheet.append([room])
    classrooms_sheet.append([])
    for row in rows_by_room[room]:
        classrooms_sheet.append(row)

# A workbook needs at least one sheet to be saved, so only drop the empty
# default sheet when real sheets were created.
if len(wb_rooms.worksheets) > 1:
    wb_rooms.remove(default_sheet)

wb_rooms.save(filename="URNIK–ZIMSKI SEMESTER 2019-2020 – LJUBLJANA (predavalnice).xlsx")

In [26]:
# Workbook with one sheet per day (third column of `data`).
wb_days = openpyxl.Workbook()
# Workbook() always starts with one empty sheet; keep a handle on it so it
# can be dropped before saving.
default_sheet = wb_days.active

# Group the rows by day in a single pass.
rows_by_day = {}
for row in data:
    rows_by_day.setdefault(row[2], []).append(row)

days = set(rows_by_day)

# Sorted so the sheet order is the same on every run (set order is not).
for day in sorted(days):
    days_sheet = wb_days.create_sheet(title=day)
    days_sheet.append([day])
    days_sheet.append([])
    for row in rows_by_day[day]:
        days_sheet.append(row)

# A workbook needs at least one sheet to be saved, so only drop the empty
# default sheet when real sheets were created.
if len(wb_days.worksheets) > 1:
    wb_days.remove(default_sheet)

wb_days.save(filename="URNIK–ZIMSKI SEMESTER 2019-2020 – LJUBLJANA (dnevi).xlsx")