## Parsiranje rasporeda

### Ulazni fajlovi:
- ```Ponedeljak.xls```
- ```Utorak.xls```
- ```Sreda.xls```
- ```Cetvrtak.xls```
- ```Petak.xls```
- ```Subota.xls```

### Dostupni sheet-ovi ulaznih fajlova

![Raspored sheet-ovi](../docs/raspored_sheetovi.png)

### Primer sheet-a ulaznog fajla

![Primer rasporeda](../docs/raspored_primer.png)


### Klasa ```RasporedTermin```

Sadrži informacije o terminima koji se održavaju po najnovijem rasporedu nastave:
- trajanje termina
- naziv predmeta
- ime, prezime i titula svih predavača
- oznaka semestra
- oznake studentskih grupa
- tip nastave (predavanje, auditorne, računarske, laboratorijske vežbe)
- oznaka katedre
- ukupan broj studenata
- šifra struke

In [10]:
from model_parser import RasporedTermin, ReadWrite

In [11]:
def create_termin_list(
        parsed_row: list
) -> list[RasporedTermin]:
    """Create the list of ``RasporedTermin`` objects for one parsed row.

    Args:
        parsed_row: One entry per sheet, in the same order as ``attr_names``
            below. Each entry is that sheet's list of ``(value, count)``
            tuples, so ``parsed_row[j][i]`` describes termin ``i`` as seen
            in sheet ``j``.

    Returns:
        One ``RasporedTermin`` per tuple in the first sheet's list; an empty
        list when ``parsed_row`` itself is empty.
    """
    attr_names = ['sifraStruke', 'semestar', 'predmet', 'tipNastave', 'studGrupa', 'predavac', 'oznakaKatedre', 'ukupnoStud']
    if not parsed_row:
        return []

    # the first sheet decides how many termini the row contains
    termin_num = len(parsed_row[0])
    termini = []

    for i in range(termin_num):
        termin = RasporedTermin()
        # sheet j supplies the value of attribute j for termin i
        for j, attr in enumerate(attr_names):
            setattr(termin, attr, parsed_row[j][i][0])
        # duration is the cell count of *this* termin (index i); indexing the
        # first termin here would give every termin in the row its duration
        termin.trajanje = parsed_row[0][i][1]
        termini.append(termin)
    return termini

## Pretprocesiranje tekstualnog sadržaja

In [12]:
from parser_utils import remove_new_line, remove_extra_whitespace, title_to_lowercase

In [13]:
def preprocess(text):
    """Run ``text`` through the three cleanup helpers, in order, and return the result."""
    pipeline = (
        remove_new_line,          # 1) remove all new line and tab characters
        remove_extra_whitespace,  # 2) remove additional whitespace
        title_to_lowercase,       # 3) title to lower
    )
    for step in pipeline:
        text = step(text)
    return text

### Parser

In [14]:
import numpy as np
import pandas as pd

In [15]:
def read_sheets(
        file_name: str,
        dir_path: str = '../data/',
        prostorija_column: str = 'Učionica',
        sheet_list: list[str] = ['Sifra struke', 'Semest', 'Naziv pred.', 'Vrsta nastave', 'Br grupe', 'Nastavnik', 'Katedra', 'BrStud']
)-> list[pd.DataFrame]:
    """Read the requested sheets of one Excel workbook as string DataFrames.

    Args:
        file_name: Workbook name without extension; ``'.xls'`` is appended.
        dir_path: Prefix put in front of ``file_name`` (plain string
            concatenation, so it must already end with a path separator).
        prostorija_column: Every column whose header contains this text is
            skipped.
        sheet_list: Names of the sheets to read. The default list is only
            read, never modified.

    Returns:
        One DataFrame per name in ``sheet_list``, in the same order, with all
        values read as ``str``.
    """
    file_path = dir_path + file_name + '.xls'

    # Open the workbook once and reuse the handle for every sheet; passing the
    # path to read_excel inside the loop would re-open the file per sheet.
    # The context manager closes the handle even if a sheet is missing.
    with pd.ExcelFile(file_path) as workbook:
        return [
            pd.read_excel(
                workbook,
                sheet_name=sheet_name,
                usecols=lambda column: prostorija_column not in column,
                dtype=str,
            )
            for sheet_name in sheet_list
        ]

In [16]:
def sheet_row_parser(
        row: np.ndarray
):
    """Generator that groups runs of equal string cells in one sheet row.

    Before handling each cell the generator yields ``termini`` (always the
    same list object, holding the ``(value, count)`` tuples collected so far)
    and waits for a signal passed in through ``send()``:

    * ``1``  - close the current run as ``(current, count)`` and start a new
      one at the cell about to be handled; the cell is then processed normally.
    * ``-1`` - close the current run as ``(current, count - 1)``, start a new
      run with ``count = 1`` and yield again *without* advancing, so the same
      cell is handled on the next resume.
    * anything else - no forced break; the cell is processed normally.

    Cells that are not ``str`` end the current run (presumably NaN for empty
    cells - TODO confirm against the input files).

    NOTE(review): nothing is appended after the loop, so a run that is still
    open when the row ends is never added to ``termini`` - confirm that rows
    always end with a non-string cell.
    """
    termini = []
    count = 0
    current = np.nan

    i = 0
    while i < len(row):
        x = row[i]
        # hand the current result to the caller and wait for its signal
        reset = yield termini

        # forced breaks requested by the caller
        # soft break: close the run and restart counting at this cell
        # (the checks below then count the cell if it is a string)
        if reset == 1:
            termini.append((current, count))
            current = x
            count = 0
        # hard break: close the run one cell short and credit that cell to
        # the new run; `continue` skips `i += 1`, so this cell is read again
        if reset == -1:
            termini.append((current, count-1))
            current = x
            count = 1
            continue

        # regular change detection
        # non-string cell
        if not isinstance(x, str):
            # a string run was open: close it
            if isinstance(current, str):
                termini.append((current, count))
            # in either case go back to the "no run open" state and move on
            count = 0
            current = np.nan
            i += 1
            continue
        # value changed
        if x != current:
            # a string run was open: close it
            if isinstance(current, str):
                termini.append((current, count))
            # start a new run at this cell
            count = 0
            current = x
        # the cell belongs to the current run: count it
        if current == x:
            count += 1

        # move to the next cell
        i += 1

In [17]:
def sheet_controller(
        sheets_row: list[np.ndarray]
) -> list[list[tuple]]:
    """Drive one ``sheet_row_parser`` per sheet and keep their breaks in sync.

    When one sheet's parser closes a termin, the other parsers are told to
    close theirs too, by sending ``1`` or ``-1`` into them.

    Args:
        sheets_row: The same row taken from every sheet, one array per sheet.

    Returns:
        One entry per sheet: the object that sheet's generator yielded last
        (its list of ``(value, count)`` tuples). An entry stays the initial
        ``{}`` placeholder if its generator never answered a ``send()``.

    NOTE(review): the priming ``next(gen)`` calls are outside the ``try``, so
    a ``StopIteration`` raised there (a zero-length row) would propagate.
    """
    # create sheet_row_parser for each sheet row
    generators = [sheet_row_parser(row) for row in sheets_row]
    sheet_num = len(sheets_row)
    parsed_rows = [{} for i in range(sheet_num)]

    # prime each generator so it is suspended at its first yield
    for gen in generators:
        next(gen)

    # largest number of termini reported by any generator so far
    count = 0
    # signal sent to the generators: 0 = none, 1 = soft break, -1 = hard break
    reset = 0
    try:
        while True:
            # the previous forward pass produced a new termin somewhere
            if reset == 1:
                # find maximum number of termini found in all sheets
                max_length = max(len(x) for x in parsed_rows)
                reset = -1
                # for each sheet row parser generator
                for idx, gen in enumerate(generators):
                    # send a hard reset to every generator that is behind
                    # e.g. [P1, P1, P1], [G1, G1, G2]
                    # Predmet 1 has 2 15-min slots for Group 1, then a 15-min slot for Group 2
                    # The Group generator breaks the termin; the Predmet generator
                    # ran before it in the same pass and has already consumed that
                    # cell, so it must break backward (hard reset)
                    if max_length > len(parsed_rows[idx]):
                        parsed_rows[idx] = gen.send(reset)
            reset = 0
            for idx, gen in enumerate(generators):
                # parse forward
                parsed_rows[idx] = gen.send(reset)
                # check if new termin found
                if len(parsed_rows[idx]) > count:
                    # update termin count
                    count = len(parsed_rows[idx])
                    # set reset to break termin in following generators
                    # (break termin in gen 3 of 8 -> send reset to 4,5,6,7,8)
                    reset = 1
    except StopIteration:
        # the first exhausted generator ends parsing of this row
        pass

    return parsed_rows

In [18]:
def file_controller(
        file_name: str,
        dir_path: str = '../data/',
        prostorija_column: str = 'Učionica',
        sheet_list: list[str] = ['Sifra struke', 'Semest', 'Naziv pred.', 'Vrsta nastave', 'Br grupe', 'Nastavnik', 'Katedra', 'BrStud']
) -> list[RasporedTermin]:
    """Parse one workbook (Ponedeljak, Utorak, ...) into ``RasporedTermin`` objects.

    The arguments are forwarded unchanged to ``read_sheets``. Row ``k`` of
    every sheet is collected, parsed together by ``sheet_controller`` and
    turned into termini by ``create_termin_list``.

    Returns:
        The termini of all rows, concatenated in row order.
    """
    sheets = read_sheets(file_name, dir_path, prostorija_column, sheet_list)

    prostorije_raspored = []
    # the first sheet determines how many rows are walked
    for row_idx in range(len(sheets[0].index)):
        # the same row taken from every sheet
        sheets_row = [sheet.iloc[row_idx].values for sheet in sheets]
        parsed_row = sheet_controller(sheets_row)
        prostorije_raspored.extend(create_termin_list(parsed_row))

    return prostorije_raspored

### Parsiranje svih rasporeda

In [19]:
def parse_data(
        file_names: list[str] = ['Ponedeljak', 'Utorak', 'Sreda', 'Cetvrtak', 'Petak', 'Subota']
):
    """Parse every listed workbook with ``file_controller`` and return all termini as one flat list."""
    return [
        termin
        for file_name in file_names
        for termin in file_controller(file_name)
    ]

### Sređivanje podataka (pretprocesiranje)

In [20]:
def data_preprocess(
        data: list[RasporedTermin]
) -> list[RasporedTermin]:
    """Clean the text attributes of every termin in place and return the same list.

    ``preprocess`` is applied to ``predmet``, ``predavac`` and ``studGrupa``,
    in that order.
    """
    text_fields = ('predmet', 'predavac', 'studGrupa')
    for termin in data:
        for field in text_fields:
            setattr(termin, field, preprocess(getattr(termin, field)))
    return data

### Izvršavanje

In [21]:
# parse the six default day workbooks into RasporedTermin objects
data = parse_data()
# clean predmet, predavac and studGrupa of every termin (modified in place)
data = data_preprocess(data)
# hand the result to ReadWrite under the name '3_raspored'
# NOTE(review): output location and format are decided by model_parser.ReadWrite - not visible here
ReadWrite.write_to_file(data, '3_raspored')