In [1]:
import polars as pl
import os
import re
from datetime import datetime
from AI_models import YogobellaMLLMix
YogobellaMLLMix = YogobellaMLLMix()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the full 2022 traffic-report export.
# Forward slashes replace the original backslashes: "\d", "\R" and "\P" are not
# valid string escapes (SyntaxWarning on recent Python) and backslash-separated
# paths only resolve on Windows.
# NOTE(review): "ANSI" appears to be a Windows-only codec alias for the system
# code page — confirm before running this notebook on another platform.
df = pl.read_csv("report/data/RTVSlo/PrometnoPorocilo2022.csv", encoding="ANSI")


def save_sample(sample_size=100, seed=None):
    """Write a random sample of the module-level ``df`` to the sample CSV.

    Args:
        sample_size: Number of rows to draw, without replacement.
        seed: Optional seed forwarded to ``DataFrame.sample``. Leave as
            ``None`` for a different sample on every call (the original
            behaviour); pass an integer for a reproducible sample.
    """
    sample = df.sample(sample_size, with_replacement=False, seed=seed)
    sample.write_csv("report/data/RTVSlo/PrometnoPorocilo2022_sample.csv")


save_sample(100)

In [None]:
def inspect(df, sample=100):
    """Print the frame's head, then dump its leading rows one cell per line.

    Args:
        df: Frame exposing ``head()``, ``columns`` and ``iter_rows()``.
        sample: Upper bound on the number of rows that are dumped.
    """
    print(df.head())
    column_names = df.columns
    for row_number, values in enumerate(df.iter_rows()):
        # Stop as soon as the requested number of rows has been printed.
        if not row_number < sample:
            break
        for position, name in enumerate(column_names):
            print(f"Row {row_number}, Column {name}: {values[position]}")

In [None]:
def print_important_news(df, columns=("TitlePomembnoSLO", "ContentPomembnoSLO")):
    """Print every distinct news item as a title/content pair.

    Args:
        df: Frame to read from; must provide ``select``, and the selected
            frame must provide ``unique`` and ``iter_rows``.
        columns: Column names to select. The first is printed as the title,
            the second as the content. The second column is also the
            de-duplication key (the original always de-duplicated on
            "ContentPomembnoSLO", which broke for any other ``columns``).

    Rows whose content is ``None`` are skipped. The first occurrence of each
    content value is kept and the input order is preserved, so repeated runs
    print the same sequence.
    """
    # Copy into a list: accepts any iterable and avoids a shared mutable default.
    columns = list(columns)
    content_column = columns[1]

    important_news_df = df.select(columns).unique(
        subset=content_column, keep="first", maintain_order=True
    )
    for row in important_news_df.iter_rows():
        if row[1] is None:
            continue
        print(f"Title: {row[0]}")
        print(f"Content: {row[1]}")
        print("-" * 80)
# Print the distinct important-news entries of the frame held in `df`.
print_important_news(
    df,
    columns=["TitlePomembnoSLO", "ContentPomembnoSLO"],
)

In [None]:
def _rtf_codec(content):
    r"""Return the Python codec for the document's ``\ansicpgN`` header, or None.

    None is returned when the header is absent or names a code page that
    Python has no codec for.
    """
    declared = re.search(r'\\ansicpg(\d+)', content)
    if declared is None:
        return None
    codec = f"cp{declared.group(1)}"
    try:
        b"".decode(codec)  # probes that the codec exists
    except LookupError:
        return None
    return codec


def _decode_rtf_hex_escapes(text, codec=None):
    r"""Replace every RTF ``\'hh`` escape in ``text`` with its character.

    The byte is decoded with ``codec`` when given. Without a codec, or for a
    byte the codec does not define, the byte value is used as the code point
    (the original behaviour).
    """
    def _to_char(match):
        byte_value = int(match.group(1), 16)
        if codec is not None:
            try:
                return bytes([byte_value]).decode(codec)
            except UnicodeDecodeError:
                pass  # byte undefined in this code page; fall back below
        return chr(byte_value)

    return re.sub(r"\\'([0-9a-fA-F]{2})", _to_char, text)


def _extract_rtf_sections(content):
    r"""Return the cleaned, non-empty text sections after "Podatki o prometu."."""
    content_start = re.search(r'Podatki o prometu\.', content)
    if not content_start:
        return []

    codec = _rtf_codec(content)
    content_sections = []
    # Sections are assumed to be separated by two consecutive \par commands.
    for section in re.split(r'\\par\s*\\par', content[content_start.end():]):
        # Remove RTF control words; \'hh escapes do not match and survive.
        clean_section = re.sub(r'\\[a-zA-Z0-9]+(?![0-9])', '', section)
        # Remove (non-nested, single-line) RTF groups.
        clean_section = re.sub(r'\{.*?\}', '', clean_section)
        clean_section = _decode_rtf_hex_escapes(clean_section, codec).strip()
        if clean_section:
            content_sections.append(clean_section)
    return content_sections


def _parse_rtf_report(content):
    """Extract (formatted date, time string, sections) from one RTF document.

    Returns None when the document contains no "D. M. YYYY H.MM" stamp.
    """
    date_time_match = re.search(
        r'(\d{1,2}\.\s+\d{1,2}\.\s+\d{4})\s+(\d{1,2}\.\d{2})', content
    )
    if not date_time_match:
        return None
    # Round-trip through datetime to normalise to zero-padded "DD. MM. YYYY".
    date_obj = datetime.strptime(date_time_match.group(1), '%d. %m. %Y')
    return (
        date_obj.strftime('%d. %m. %Y'),
        date_time_match.group(2),
        _extract_rtf_sections(content),
    )


def process_rtf_files(self=None,
                      base_path="report/data/RTVSlo/Podatki - rtvslo.si",
                      output_path="report/data/RTVSlo/Joined_rtf_files.csv"):
    r"""
    Process all RTF files below ``base_path``, extract date, time and content,
    and save the information to a joined CSV file.

    Args:
        self: Unused. Kept (now optional) so existing calls that pass a first
            positional argument keep working.
        base_path: Directory that is walked recursively for ``*.rtf`` files.
        output_path: Destination of the joined CSV.

    Returns:
        The polars DataFrame that was written. Columns are ``Datum``
        ("DD. MM. YYYY"), ``ura``, ``TMP_file_name``, ``TMP_folder_name`` and
        ``content_01`` .. ``content_NN``, where NN is the largest number of
        sections found in any file. Shorter files hold nulls in the
        remaining columns.

    Files without a recognisable date/time stamp are skipped silently; files
    that raise while being read or parsed are reported and skipped.

    Changes from the original implementation:
      * ``\'hh`` escapes are decoded with the code page declared by the
        document's ``\ansicpg`` header instead of always being treated as
        Latin-1 code points.
      * Every row carries every ``content_NN`` key and the schema is inferred
        from all rows, so sections that first appear in a late file are not
        lost.
      * Directories and files are visited in sorted order for a stable result.

    NOTE(review): ``\uN`` Unicode escapes are still stripped by the
    control-word regex — confirm the source files do not rely on them.
    """
    all_data = []
    max_sections = 0

    for root, dirs, files in os.walk(base_path):
        dirs.sort()  # in-place sort makes os.walk descend in a stable order
        for file in sorted(files):
            if not file.endswith('.rtf'):
                continue
            file_path = os.path.join(root, file)
            folder_name = os.path.basename(os.path.dirname(file_path))

            try:
                # NOTE(review): "ANSI" appears to be a Windows-only codec
                # alias; elsewhere the open() fails and is reported below.
                with open(file_path, 'r', encoding='ANSI') as f:
                    content = f.read()
                parsed = _parse_rtf_report(content)
            except Exception as e:
                # Best effort: one bad file must not abort the whole run.
                print(f"Error processing file {file_path}: {str(e)}")
                continue

            if parsed is None:
                continue
            formatted_date, time_str, content_sections = parsed

            row_data = {
                'Datum': formatted_date,
                'ura': time_str,
                'TMP_file_name': file,
                'TMP_folder_name': folder_name,
            }
            for i, section in enumerate(content_sections, 1):
                row_data[f'content_{i:02d}'] = section

            max_sections = max(max_sections, len(content_sections))
            all_data.append(row_data)

    # Give every row the same keys in the same order.
    content_columns = [f'content_{i:02d}' for i in range(1, max_sections + 1)]
    for row_data in all_data:
        for column in content_columns:
            row_data.setdefault(column, None)

    # Scan all rows for the schema so late-appearing values are typed correctly.
    df = pl.DataFrame(all_data, infer_schema_length=None)

    df.write_csv(output_path)
    print(f"Processed {len(all_data)} RTF files and saved to {output_path}")
    return df

In [15]:
# Show the kernel's working directory; relative paths are resolved against it.
import os
print(f"CWD: {os.getcwd()}")


CWD: c:\Users\turkf\Pictures\mag\ONJ\ONJyfans\ul-fri-nlp-course-project-2024-2025-onjyfans\report\code


In [45]:
# Full 2022 export. For quicker experiments, swap in the sampled file instead:
#input_df = pl.read_csv("../data/RTVSlo/PrometnoPorocilo2022_sample.csv", encoding="ANSI")
input_df = pl.read_csv("../data/RTVSlo/PrometnoPorocilo2022.csv", encoding="ANSI")
# Joined_rtf_files.csv is produced by DataFrame.write_csv in process_rtf_files,
# which writes UTF-8, so decoding it as "ANSI" garbles every non-ASCII character.
# NOTE(review): assumes the file on disk really was generated by
# process_rtf_files and not re-encoded afterwards — confirm.
output_df = pl.read_csv("../data/RTVSlo/Joined_rtf_files.csv", encoding="utf8")

In [None]:
# Show the first five entries of the "Datum" column of `df`.
print(
    df.select("Datum").head(5)
)


In [48]:


def find_match(date, input_df, output_df, verbose=False,
               content_columns=("content_01", "content_02", "content_03",
                                "content_04", "content_05")):
    """
    Print the PrometnoPorocilo_2022 rows and the RTF rows recorded on one day.

    Args:
        date (str): Day to look up, in ``%d.%m.%Y`` format (e.g. "21.1.2022").
        input_df: Frame whose ``Datum`` column holds strings in
            ``%m/%d/%Y %H:%M`` format, plus the ``Content*SLO`` columns
            printed below.
        output_df: Frame whose ``Datum`` column holds strings in
            ``%d. %m. %Y`` format, plus an ``ura`` column and ``content_NN``
            columns.
        verbose (bool): When true, also print how many rows matched and the
            matching frames themselves.
        content_columns: Names of the ``output_df`` columns to print per row.
            Columns missing from ``output_df`` are skipped rather than
            raising ``KeyError``.

    Returns:
        None. Everything is written to stdout.

    Raises:
        ValueError: If ``date`` does not match ``%d.%m.%Y``.
    """
    # (label, column) pairs, printed in this order for every input row.
    # The labels are runtime output and are kept exactly as they were.
    input_sections = (
        ("POMEMBNO:", "ContentPomembnoSLO"),
        ("NESREČE", "ContentNesreceSLO"),
        ("ZASTOJI", "ContentZastojiSLO"),
        ("VREME", "ContentVremeSLO"),
        ("OVIRE", "ContentOvireSLO"),
        ("DELO NA CESTI", "ContentDeloNaCestiSLO"),
        ("OPOMBA", "ContentOpozorilaSLO"),
        ("MEDNARODNE INFORMACIJE", "ContentMednarodneInformacijeSLO"),
        ("SPLOŠNE", "ContentSplosnoSLO"),
    )
    separator = "-----------------------"

    # Parse the textual date columns so they can be compared with a date.
    input_df = input_df.with_columns(pl.col("Datum").str.strptime(pl.Datetime, "%m/%d/%Y %H:%M"))
    output_df = output_df.with_columns(pl.col("Datum").str.strptime(pl.Date, "%d. %m. %Y"))

    print(f"Searching for date: {date}")
    date = datetime.strptime(date, "%d.%m.%Y").date()

    # Keep the rows of the requested day, ordered by time.
    matching_input_rows = input_df.filter(pl.col("Datum").dt.date() == date).sort("Datum")
    matching_output_rows = output_df.filter(pl.col("Datum") == date).sort("ura")

    if verbose:
        if matching_input_rows.is_empty():
            print(f"No matching data found in input for date: {date}")
        if matching_output_rows.is_empty():
            print(f"No matching data found in output for date: {date}")
        print(f"Found {matching_input_rows.shape[0]} matching rows in input and {matching_output_rows.shape[0]} in output for date: {date}")
        print(f"Input DataFrame:\n{matching_input_rows}")
        print(f"Output DataFrame:\n{matching_output_rows}")

    for row in matching_input_rows.to_dicts():
        print(separator)
        print("At:", row["Datum"].time())
        for label, column in input_sections:
            print(label, row[column])
        print(separator)

    print("On Date : ", date)
    for row in matching_output_rows.to_dicts():
        print(separator)
        print("At : ", row["ura"])
        for col in content_columns:
            # .get(): the joined CSV may hold fewer section columns than requested.
            if row.get(col) is not None:
                print(f"{col}: {row[col]}")
        print(separator)


# The date is given as day.month.year.
find_match("21.1.2022", input_df, output_df)

Searching for date: 21.1.2022
-----------------------
At: 02:07:00
POMEMBNO: None
NESREČE None
ZASTOJI None
VREME <p>Megla v pasovih ponekod zmanjšuje vidljivost.</p>
OVIRE None
DELO NA CESTI <p>Cesta Ilirska Bistrica - Podgrad je zaprta pri odcepu za Podbeže do 24. januarja. Obvoz za vozila do 7,5 t in avtobuse je po cesti Podgrad - Obrov - Pregarje - Harije. Za vozila nad 7,5 t pa po primorski avtocesti. </p>
OPOMBA None
MEDNARODNE INFORMACIJE None
SPLOŠNE None
-----------------------
-----------------------
At: 03:04:00
POMEMBNO: None
NESREČE None
ZASTOJI None
VREME <p>Megla v pasovih ponekod zmanjšuje vidljivost.</p>
OVIRE None
DELO NA CESTI <p>Cesta Ilirska Bistrica - Podgrad je zaprta pri odcepu za Podbeže do 24. januarja. Obvoz za vozila do 7,5 t in avtobuse je po cesti Podgrad - Obrov - Pregarje - Harije. Za vozila nad 7,5 t pa po primorski avtocesti. </p>
OPOMBA None
MEDNARODNE INFORMACIJE None
SPLOŠNE None
-----------------------
-----------------------
At: 04:05:00
POMEMBNO: