In [1]:
import polars as pl
import os
import re
from datetime import datetime
from LLMs import LLM
from striprtf.striprtf import rtf_to_text

Lets generate sample of Prometno poročilo so we can look at it. 

In [69]:
df = pl.read_csv("../data/RTVSlo/PrometnoPorocilo2022.csv", encoding="ANSI")

def save_sample(sample_size=100):
    sample = df.sample(sample_size, with_replacement=False)
    sample.write_csv("../data/RTVSlo/PrometnoPorocilo2022_sample.csv")
save_sample(100)

Lets gather all the RTF files in a joined csv file.

In [81]:
class RTF_Data_Processor():
    def __init__(self, df):
        self.df = df

    def process_rtf_files(self):
        """
        Process all RTF files in the Podatki - rtvslo.si folder, extract date, time and content,
        and save the information to a joined CSV file.
        """
        all_data = []
        base_path = "../data/RTVSlo/Podatki - rtvslo.si"
        for root, dirs, files in os.walk(base_path):
            for file in files:
                if file.endswith('.rtf'):
                    file_path = os.path.join(root, file)
                    folder_name = os.path.basename(os.path.dirname(file_path))
                    try:
                        print(f"{file_path}...")
                        with open(file_path, 'r', encoding='utf-8') as f:
                            content = f.read()
                            content = rtf_to_text(content)
                            
                            date_time_match = re.search(r'(\d{1,2}\.\s+\d{1,2}\.\s+\d{4})\s+(\d{1,2}\.\d{2})', content)
                            if date_time_match:
                                date_str = date_time_match.group(1)
                                time_str = date_time_match.group(2)
                                
                                date_obj = datetime.strptime(f"{date_str} {time_str}", '%d. %m. %Y %H.%M')
                                formatted_date = date_obj.strftime('%m/%d/%Y %H:%M')
                                
                                content_sections = []
                                content_start = re.search(r'Podatki o prometu\.', content)
                                if content_start:
                                    remaining_content = content[content_start.end():]
                                    sections = [s.strip() for s in remaining_content.split('\n\n') if s.strip()]
                                    content_sections = sections
                                
                                row_data = {
                                    'Datum': formatted_date,
                                    'TMP_file_name': file,
                                    'TMP_folder_name': folder_name
                                }
                                
                                for i, section in enumerate(content_sections, 1):
                                    row_data[f'content_{i:02d}'] = section
                                
                                all_data.append(row_data)
                                
                    except Exception as e:
                        print(f"Error processing file {file_path}: {str(e)}")
        
        df = pl.DataFrame(all_data)
        output_path = "../data/RTVSlo/Joined_rtf_files.csv"
        df.write_csv(output_path)
        print(f"Processed {len(all_data)} RTF files and saved to {output_path}")
        return df
    
RTF_Data_Processor = RTF_Data_Processor(df)
RTF_Data_Processor.process_rtf_files()

../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-1.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-10.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-100.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-101.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-102.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-103.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-104.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-105.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-106.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-107.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-108.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-109.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 2022\TMP-11.rtf...
../data/RTVSlo/Podatki - rtvslo.si\Promet 2022\April 20

Datum,TMP_file_name,TMP_folder_name,content_01,content_02,content_03,content_04,content_05
str,str,str,str,str,str,str,str
"""04/30/2022 18:30""","""TMP-1.rtf""","""April 2022""","""Zaradi prometne nesreče je zap…","""Na mejnem prehodu Obrežje vozn…",,,
"""04/30/2022 13:00""","""TMP-10.rtf""","""April 2022""","""Zaradi pokvarjenega vozila je …","""Na mejnih prehodih Sečovlje, P…","""�""",,
"""04/27/2022 06:30""","""TMP-100.rtf""","""April 2022""","""Zaradi prometne nesreče je na …","""Na mariborski vzhodni obvoznic…",,,
"""04/27/2022 06:00""","""TMP-101.rtf""","""April 2022""","""Zaradi prometne nesreče je na …","""Od 8-ih do 22-ih bo veljala om…",,,
"""04/26/2022 20:00""","""TMP-102.rtf""","""April 2022""","""Na južni ljubljanski obvoznici…","""Zaradi del je na severni ljubl…","""Na cesti Rogatec - Dobovec pot…","""Na Obrežju je povečana promet …","""�"""
…,…,…,…,…,…,…,…
"""09/26/2024 18:30""","""TMP9-2024-96.rtf""","""September 2024""","""Zaradi del na štajerski avtoce…","""Ponoči bo na gorenjski avtoces…","""Cesta Celje-Krško bo zaradi de…",,
"""09/26/2024 18:00""","""TMP9-2024-97.rtf""","""September 2024""","""Na štajerski avtocesti je zara…","""Zaradi del poteka promet na do…",,,
"""09/26/2024 17:30""","""TMP9-2024-98.rtf""","""September 2024""","""Na štajerski avtocesti je zara…",,,,
"""09/26/2024 17:00""","""TMP9-2024-99.rtf""","""September 2024""","""Na južni ljubljanski obvoznici…","""Na štajerski avtocesti je zara…",,,


In [95]:
sample_input_df = pl.read_csv("../data/RTVSlo/PrometnoPorocilo2022_sample.csv", encoding="ANSI")
input_df = pl.read_csv("../data/RTVSlo/PrometnoPorocilo2022.csv", encoding="cp1250")
output_df = pl.read_csv("../data/RTVSlo/Joined_rtf_files.csv", encoding="utf-8")

In [96]:
def find_match(date, input_df, output_df):
    """
    Find and match data from PrometnoPorocilo_2022 to the appropriate RTF file based on date.
    
    Args:
        date (str): Date in format 'YYYY-MM-DD' to match
        
    Returns:
        tuple: (rtf_content, matching_data) where rtf_content is the content of the RTF file
                and matching_data is the corresponding row from the DataFrame
    """
    input_df = input_df.with_columns(pl.col("Datum").str.strptime(pl.Datetime, "%m/%d/%Y %H:%M"))
    output_df = output_df.with_columns(pl.col("Datum").str.strptime(pl.Datetime, "%m/%d/%Y %H:%M"))

    print(f"Searching for date: {date}")
    date = datetime.strptime(date, "%d.%m.%Y").date()
    matching_input_rows = input_df.filter(pl.col("Datum").dt.date() == date)
    matching_output_rows = output_df.filter(pl.col("Datum").dt.date() == date)
    matching_input_rows = matching_input_rows.sort("Datum")
    matching_output_rows = matching_output_rows.sort("Datum")

    """
    if matching_input_rows.is_empty():
        print(f"No matching data found in input for date: {date}")
    if matching_output_rows.is_empty():
        print(f"No matching data found in output for date: {date}")
        
    print(f"Found {matching_input_rows.shape[0]} matching rows in input and {matching_output_rows.shape[0]} in output for date: {date}")
    print(f"Input DataFrame:\n{matching_input_rows}")
    print(f"Output DataFrame:\n{matching_output_rows}")"""

    for row in matching_input_rows.to_dicts():
        print("-----------------------")
        print("At:", row["Datum"].time())
        print("POMEMBNO:", row["ContentPomembnoSLO"])
        print("NESREČE", row["ContentNesreceSLO"])
        print("ZASTOJI", row["ContentZastojiSLO"])
        print("VREME", row["ContentVremeSLO"])
        print("OVIRE", row["ContentOvireSLO"])
        print("DELO NA CESTI", row["ContentDeloNaCestiSLO"])
        print("OPOZORILA", row["ContentOpozorilaSLO"])
        print("MEDNARODNE INFORMACIJE", row["ContentMednarodneInformacijeSLO"])
        print("SPLOŠNE", row["ContentSplosnoSLO"])
        print("-----------------------")

    print("On Date : ", date)
    for row in matching_output_rows.to_dicts():
        print("-----------------------")
        print("At:", row["Datum"].time())
        for col in ["content_01", "content_02", "content_03", "content_04", "content_05"]:
            if row[col] is not None:
                print(f"{col}: {row[col]}")
        print("-----------------------")

#"%Y-%m-%d"
find_match("30.1.2022", input_df, output_df)

Searching for date: 30.1.2022
-----------------------
At: 00:35:00
POMEMBNO: None
NESREČE None
ZASTOJI None
VREME None
OVIRE None
DELO NA CESTI <p>Regionalna cesta Javornik - Gorje bo zaradi rekonstrukcije ceste skozi naselje Gorje zaprta do 30. aprila. Obvoz je urejen na relaciji Bled - Lesce - Žirovnica - Javornik - Lipce in obratno.</p>
OPOZORILA None
MEDNARODNE INFORMACIJE None
SPLOŠNE None
-----------------------
-----------------------
At: 00:45:00
POMEMBNO: None
NESREČE None
ZASTOJI None
VREME None
OVIRE None
DELO NA CESTI <p>Regionalna cesta Javornik - Gorje bo zaradi rekonstrukcije ceste skozi naselje Gorje zaprta do 30. aprila. Obvoz je urejen na relaciji Bled - Lesce - Žirovnica - Javornik - Lipce in obratno.</p>
OPOZORILA None
MEDNARODNE INFORMACIJE None
SPLOŠNE None
-----------------------
-----------------------
At: 00:45:00
POMEMBNO: None
NESREČE None
ZASTOJI None
VREME None
OVIRE None
DELO NA CESTI <p>Regionalna cesta Javornik - Gorje bo zaradi rekonstrukcije ceste skoz

Lets try to display it side by side.

In [104]:
from IPython.display import display, HTML

def find_match(date, input_df, output_df):
    input_df = input_df.with_columns(pl.col("Datum").str.strptime(pl.Datetime, "%m/%d/%Y %H:%M"))
    output_df = output_df.with_columns(pl.col("Datum").str.strptime(pl.Datetime, "%m/%d/%Y %H:%M"))

    print(f"Searching for date: {date}")
    date = datetime.strptime(date, "%d.%m.%Y").date()
    matching_input_rows = input_df.filter(pl.col("Datum").dt.date() == date).sort("Datum")
    matching_output_rows = output_df.filter(pl.col("Datum").dt.date() == date).sort("Datum")
    
    # Generate HTML blocks
    left_html = "<h3>Input Data</h3>"
    for row in matching_input_rows.to_dicts():
        left_html += f"""
        <div style="margin-bottom: 10px; border-bottom: 1px solid #ccc;">
            <strong>At:</strong> {row["Datum"].time()}<br>
            <strong>A1:</strong> {row["A1"]}<br>
            <strong>B1:</strong> {row["B1"]}<br>
            <strong>C1:</strong> {row["C1"]}<br>
            <strong>A1:</strong> {row["A2"]}<br>
            <strong>B1:</strong> {row["B2"]}<br>
            <strong>C1:</strong> {row["C2"]}<br>
            <strong>POMEMBNO:</strong> {row["ContentPomembnoSLO"]}<br>
            <strong>NESREČE:</strong> {row["ContentNesreceSLO"]}<br>
            <strong>ZASTOJI:</strong> {row["ContentZastojiSLO"]}<br>
            <strong>VREME:</strong> {row["ContentVremeSLO"]}<br>
            <strong>OVIRE:</strong> {row["ContentOvireSLO"]}<br>
            <strong>DELO NA CESTI:</strong> {row["ContentDeloNaCestiSLO"]}<br>
            <strong>OPOZORILA:</strong> {row["ContentOpozorilaSLO"]}<br>
            <strong>MEDNARODNE INFORMACIJE:</strong> {row["ContentMednarodneInformacijeSLO"]}<br>
            <strong>SPLOŠNE:</strong> {row["ContentSplosnoSLO"]}
        </div>
        """

    right_html = "<h3>Output Data</h3>"
    for row in matching_output_rows.to_dicts():
        row_html = f"<div style='margin-bottom: 10px; border-bottom: 1px solid #ccc;'><strong>At:</strong> {row["Datum"].time()}<br>"
        for col in ["content_01", "content_02", "content_03", "content_04", "content_05"]:
            if row[col] is not None:
                row_html += f"<strong>{col}:</strong> {row[col]}<br>"
        row_html += "</div>"
        right_html += row_html

    # Display side-by-side
    html_output = f"""
    <div style="display: flex; gap: 40px;">
        <div style="width: 50%; height: 400px; overflow-y: scroll;  padding: 10px;">{left_html}</div>
        <div style="width: 50%; height: 400px; overflow-y: scroll;  padding: 10px;">{right_html}</div>
    </div>  
    """

    display(HTML(html_output))

# Example usage
find_match("25.6.2022", input_df, output_df)

Searching for date: 25.6.2022


In [98]:
def print_df_for_date(date, input_df):
    date = datetime.strptime(date, "%d.%m.%Y").date()
    input_df = input_df.with_columns(
        pl.col("Datum").str.strptime(pl.Datetime, "%m/%d/%Y %H:%M")
    )
    filtered_df = input_df.filter(pl.col("Datum").dt.date() == date).sort("Datum")
    print(f"Filtered DataFrame for date {date}:")
    filtered_df.write_csv("PrometnoPorocilo_custom_date.csv")

print_df_for_date("9.2.2022", input_df)

Filtered DataFrame for date 2022-02-09:


NOTE:

Poročila izhajajo od 5:30 do 20:00 vsake pol ure.

Poročila so razdeljena po pomembnosti. 

        A1
        B1
        C1
        A2 (eng)
        B2 (eng)
        C2 (eng)


When insepcting further the content 01-05 seems a bit mixed up and confusing. The input data is also missing information on traffic jams, I suspect the operators read of the map at promet.si. 

Since there are many possible html title tags included in the A1,B1,C1.. we can try to analize them. How often do they occur in the specified bracket. This can help us determine the importance of the news.

In [None]:
bracket_columns = ['A1', 'B1', 'C1', 'A2', 'B2', 'C2']

def extract_html_tags(text):
    if text is None:
        return []
    tags = re.findall(r'<[^>]+>', text)
    return tags

print("Analysis of HTML tags in bracket columns:")
for col in bracket_columns:
    if col in df.columns:

        non_null_values = df.filter(pl.col(col).is_not_null()).select(col)
        all_tags = []
        for text in non_null_values.to_series():
            tags = extract_html_tags(text)
            all_tags.extend(tags)
        
        tag_counts = {}
        for tag in all_tags:
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
        
        print(f"\nColumn {col}:")
        print(f"Total number of entries: {len(non_null_values)}")
        print(f"Number of entries with HTML tags: {len(all_tags)}")
        print("Most common HTML tags:")
        for tag, count in sorted(tag_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
            print(f"  {tag}: {count} occurrences")
        
        if len(non_null_values) > 0:
            percentage = (len(all_tags) / len(non_null_values)) * 100
            print(f"Percentage of entries with HTML tags: {percentage:.2f}%")
        
        print("\nSample entries with HTML tags:")
        sample_entries = df.sample(fraction=1 , seed=42) 
        sample_entries = sample_entries.filter(pl.col(col).str.contains(r'<[^>]+>')).select(col).head(50)
        for entry in sample_entries.to_series():
            print(f"{entry}") 

Analysis of HTML tags in bracket columns:

Column A1:
Total number of entries: 55001
Number of entries with HTML tags: 52344
Most common HTML tags:
  <strong>: 13523 occurrences
  </strong>: 13523 occurrences
  <p>: 12071 occurrences
  </p>: 12071 occurrences
  <br>: 1016 occurrences
Percentage of entries with HTML tags: 95.17%

Sample entries with HTML tags:
<p><strong>Pozor!</strong></p><p><strong>Zaradi gore?ega vozila je zaprta </strong><strong>gorenjska</strong><strong> avtocesta med Brezjem in predorom Ljubno proti Ljubljani. Obvoz po vzporedni regionalni cesti med priklju?koma </strong><strong>Brezje</strong><strong> in </strong><strong>Pod</strong><strong>tabor. </strong></p>
<p><strong>Pozor!</strong></p><p><strong>Zaradi gore?ega vozila je zaprta </strong><strong>gorenjska</strong><strong> avtocesta med Brezjem in predorom Ljubno proti Ljubljani. Obvoz po vzporedni regionalni cesti med priklju?koma </strong><strong>Brezje</strong><strong> in </strong><strong>Pod</strong><stro

In [None]:
results = []
def extract_tag_text(text):
    if text is None:
        return []
    matches = re.findall(r'<strong>(.*?)</strong>', text)
    return matches

for col in bracket_columns:
    if col in df.columns:
        non_null_values = df.filter(pl.col(col).is_not_null()).select(col)
        tag_counts = {}
        for text in non_null_values.to_series():
            tag_texts = extract_tag_text(text)
            for tag_text in tag_texts:
                clean_text = ' '.join(tag_text.split()).lower()
                tag_counts[clean_text] = tag_counts.get(clean_text, 0) + 1
        
        for tag_text, count in tag_counts.items():
            results.append({
                'bracket': col,
                'tag_value': tag_text,
                'count': count
            })

results_df = pl.DataFrame(results)

for bracket in bracket_columns:
    results_df = pl.DataFrame(results)
    results_df = results_df.sort(['bracket', 'count'], descending=[False, True])
    results_df = results_df.filter(pl.col('bracket') == bracket)
    output_path = "../data/RTVSlo/tag_analysis_results.csv"
    results_df.write_csv(output_path)
    print(f"Results saved to {output_path}")
    print("\nSample of the results:")
    print(results_df.head(10))

Results saved to ../data/RTVSlo/tag_analysis_results.csv

Sample of the results:
shape: (10, 3)
┌─────────┬─────────────────────────────────┬───────┐
│ bracket ┆ tag_value                       ┆ count │
│ ---     ┆ ---                             ┆ ---   │
│ str     ┆ str                             ┆ i64   │
╞═════════╪═════════════════════════════════╪═══════╡
│ A1      ┆ pozor!                          ┆ 5516  │
│ A1      ┆                                 ┆ 342   │
│ A1      ┆ .                               ┆ 275   │
│ A1      ┆ zaprta                          ┆ 232   │
│ A1      ┆ zaradi tehni?nih težav je prom… ┆ 208   │
│ A1      ┆ zaradi tehni?nih težav je prom… ┆ 200   │
│ A1      ┆ gorenjski avtocesti med lescam… ┆ 169   │
│ A1      ┆ ljubljani                       ┆ 124   │
│ A1      ┆ zaprt                           ┆ 117   │
│ A1      ┆ na gorenjski avtocesti je       ┆ 115   │
└─────────┴─────────────────────────────────┴───────┘
Results saved to ../data/RTVSlo/tag_anal

First, collect and preprocess your RTF files:
Analyze existing reports to understand patterns:
Prepare training data and train the model:
Generate new reports

In [None]:
from traffic_parser import TrafficReportParser

parser = TrafficReportParser()

# Example traffic report
report = """Na štajerski avtocesti med Domžalami in Krtino proti Mariboru je zaradi 
prometne nesreče zaprt vozni pas. Nastaja zastoj dolg 2,5 km. 
Obvoz je možen po regionalni cesti."""

# Parse the report
event = parser.parse_report(report, "ACC-2024-001")

# The event object will contain structured data following our schema
if event:
    print(event.json(indent=2))

In [None]:
event = TrafficEvent(
    id="ACC-2024-001",
    timestamp=datetime.now(),
    event_type=EventType.ACCIDENT_WITH_JAM,
    priority=3,
    road_section=RoadSection(
        road_type=RoadType.MOTORWAY,
        road_name="ŠTAJERSKA AVTOCESTA",
        direction=Direction(
            from_location="LJUBLJANA",
            to_location="MARIBOR",
            section="med priključkoma Domžale in Krtina"
        )
    ),
    reason="prometna nesreča",
    consequence="zaprt vozni pas",
    lanes_affected=1,
    detour_available=True,
    detour_description="Obvoz je po vzporedni regionalni cesti"
)