In [1]:
!pip install python-docx
!pip install striprtf

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m253.0/253.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0
Collecting striprtf
  Downloading striprtf-0.0.29-py3-none-any.whl.metadata (2.3 kB)
Downloading striprtf-0.0.29-py3-none-any.whl (7.9 kB)
Installing collected packages: striprtf
Successfully installed striprtf-0.0.29


In [15]:
import re
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass
import docx
from google.colab import files
import io
from striprtf.striprtf import rtf_to_text

@dataclass
class Vacancy:
    """Structured fields extracted from a job vacancy document.

    Scalar fields default to ``None``. List fields are declared with a ``None``
    default and swapped for a fresh empty list in ``__post_init__`` so that no
    two instances share one mutable default.
    """
    title: Optional[str] = None
    city: Optional[str] = None
    employment_type: Optional[str] = None
    work_schedule: Optional[str] = None
    responsibilities: List[str] = None
    requirements: List[str] = None
    advantages: List[str] = None  # items listed as "would be an advantage"
    education_level: Optional[str] = None
    experience_required: Optional[str] = None

    def __post_init__(self):
        """Give every list field that was left as ``None`` its own empty list."""
        for list_field in ("responsibilities", "requirements", "advantages"):
            if getattr(self, list_field) is None:
                setattr(self, list_field, [])

class AdvancedVacancyParser:
    """–£–ª—É—á—à–µ–Ω–Ω—ã–π –ø–∞—Ä—Å–µ—Ä –≤–∞–∫–∞–Ω—Å–∏–π —Å –æ–±—Ä–∞–±–æ—Ç–∫–æ–π —Å–ª–æ–∂–Ω—ã—Ö —Å—Ç—Ä—É–∫—Ç—É—Ä"""

    def parse_file(self, file_content: bytes, filename: str) -> Vacancy:
        """Parse an uploaded vacancy file, choosing the parser by file extension.

        Args:
            file_content: Raw bytes of the uploaded file.
            filename: Original file name; only its extension is inspected.

        Returns:
            The parsed ``Vacancy``. Any failure (including an unsupported
            extension) is printed and an empty ``Vacancy`` is returned instead.
        """
        try:
            lowered_name = filename.lower()
            if lowered_name.endswith('.docx'):
                parse = self._parse_docx_vacancy
            elif lowered_name.endswith('.rtf'):
                parse = self._parse_rtf_vacancy
            else:
                raise ValueError(f"–ù–µ–ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ–º—ã–π —Ñ–æ—Ä–º–∞—Ç: {filename}")
            return parse(file_content)

        except Exception as error:
            # Best effort: report the problem and hand back an empty record.
            print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ —á—Ç–µ–Ω–∏–∏ –≤–∞–∫–∞–Ω—Å–∏–∏: {error}")
            return Vacancy()

    def _parse_docx_vacancy(self, file_content: bytes) -> Vacancy:
        """Parse a DOCX vacancy from its raw bytes.

        Two-column table rows are collected as ``label -> value`` pairs, and
        the non-blank paragraphs are joined into one text used as a fallback
        source for sections that the tables do not provide.
        """
        document = docx.Document(io.BytesIO(file_content))

        # First cell is the field label, second cell is its value.
        table_fields = {}
        for table in document.tables:
            for row in table.rows:
                cells = row.cells
                if len(cells) < 2:
                    continue
                label = cells[0].text.strip()
                value = cells[1].text.strip()
                if label and value:
                    table_fields[label] = value

        # Paragraph text (tables excluded) for free-text section lookup.
        paragraph_texts = [para.text for para in document.paragraphs if para.text.strip()]

        return self._parse_vacancy_data(table_fields, "\n".join(paragraph_texts))

    def _parse_rtf_vacancy(self, file_content: bytes) -> Vacancy:
        """Parse an RTF vacancy from its raw bytes.

        The bytes are decoded as UTF-8 (undecodable bytes dropped) and
        converted to plain text with ``rtf_to_text``. Each line is then read
        either as a ``|``-separated table row or as a single ``key: value``
        pair.
        """
        plain_text = rtf_to_text(file_content.decode('utf-8', errors='ignore'))
        fields = {}

        for raw_line in plain_text.split('\n'):
            entry = raw_line.strip()
            if '|' in entry:
                # Table-like row: first column is the key, the rest is the value.
                key, *rest = [chunk.strip() for chunk in entry.split('|')]
                if rest:
                    fields[key] = '|'.join(rest)
            elif entry.count(':') == 1:
                # Exactly one colon: treat the line as "key: value".
                key, _, value = entry.partition(':')
                fields[key.strip()] = value.strip()

        return self._parse_vacancy_data(fields, plain_text)

    def _parse_vacancy_data(self, vacancy_data: Dict[str, str], full_text: str) -> Vacancy:
        """Build a ``Vacancy`` from extracted key/value fields and the full text.

        Args:
            vacancy_data: Field labels mapped to their raw text values.
            full_text: Whole document text, searched for a section when the
                matching field is missing or empty in ``vacancy_data``.

        Returns:
            A ``Vacancy`` with scalar fields, requirements, advantages and
            responsibilities filled in where they could be found.
        """
        def lookup(label):
            return self._get_field_value(vacancy_data, label)

        vacancy = Vacancy()

        # Scalar fields: attribute name -> label used in the source document.
        scalar_fields = (
            ('title', '–ù–∞–∑–≤–∞–Ω–∏–µ'),
            ('city', '–ì–æ—Ä–æ–¥'),
            ('employment_type', '–¢–∏–ø –∑–∞–Ω—è—Ç–æ—Å—Ç–∏'),
            ('work_schedule', '–¢–µ–∫—Å—Ç –≥—Ä–∞—Ñ–∏–∫ —Ä–∞–±–æ—Ç—ã'),
            ('education_level', '–£—Ä–æ–≤–µ–Ω—å –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è'),
            ('experience_required', '–¢—Ä–µ–±—É–µ–º—ã–π –æ–ø—ã—Ç —Ä–∞–±–æ—Ç—ã'),
        )
        for attribute, label in scalar_fields:
            setattr(vacancy, attribute, lookup(label))

        # Requirements: prefer the dedicated field, otherwise search the text.
        requirements_source = (
            lookup('–¢—Ä–µ–±–æ–≤–∞–Ω–∏—è (–¥–ª—è –ø—É–±–ª–∏–∫–∞—Ü–∏–∏)')
            or self._find_section(full_text, ['–¢—Ä–µ–±–æ–≤–∞–Ω–∏—è'])
        )
        if requirements_source:
            required, optional = self._parse_requirements_with_advantages(requirements_source)
            vacancy.requirements = required
            vacancy.advantages = optional

        # Responsibilities: same field-first, text-second strategy.
        responsibilities_source = (
            lookup('–û–±—è–∑–∞–Ω–Ω–æ—Å—Ç–∏ (–¥–ª—è –ø—É–±–ª–∏–∫–∞—Ü–∏–∏)')
            or self._find_section(full_text, ['–û–±—è–∑–∞–Ω–Ω–æ—Å—Ç–∏'])
        )
        if responsibilities_source:
            vacancy.responsibilities = self._parse_advanced_list(responsibilities_source)

        return vacancy

    def _parse_requirements_with_advantages(self, text: str) -> Tuple[List[str], List[str]]:
        """–†–∞–∑–¥–µ–ª—è–µ—Ç —Ç—Ä–µ–±–æ–≤–∞–Ω–∏—è –Ω–∞ –æ—Å–Ω–æ–≤–Ω—ã–µ –∏ –ø—Ä–µ–∏–º—É—â–µ—Å—Ç–≤–∞"""
        requirements = []
        advantages = []

        # –†–∞–∑–¥–µ–ª—è–µ–º —Ç–µ–∫—Å—Ç –Ω–∞ —á–∞—Å—Ç–∏ –¥–æ –∏ –ø–æ—Å–ª–µ "–ë—É–¥–µ—Ç –ø—Ä–µ–∏–º—É—â–µ—Å—Ç–≤–æ–º"
        advantage_keywords = ['–±—É–¥–µ—Ç –ø—Ä–µ–∏–º—É—â–µ—Å—Ç–≤–æ–º', '–ø—Ä–µ–∏–º—É—â–µ—Å—Ç–≤–æ–º –±—É–¥–µ—Ç', 'considered an advantage', '–±—É–¥–µ—Ç –ø–ª—é—Å–æ–º', '–ø–ª—é—Å–æ–º –±—É–¥–µ—Ç']

        main_text = text
        advantage_text = ""

        for keyword in advantage_keywords:
            if keyword in text.lower():
                parts = re.split(keyword, text, flags=re.IGNORECASE)
                if len(parts) >= 2:
                    main_text = parts[0]
                    advantage_text = parts[1]
                    break

        # –ü–∞—Ä—Å–∏–º –æ—Å–Ω–æ–≤–Ω—ã–µ —Ç—Ä–µ–±–æ–≤–∞–Ω–∏—è
        requirements = self._parse_advanced_list(main_text)

        # –ü–∞—Ä—Å–∏–º –ø—Ä–µ–∏–º—É—â–µ—Å—Ç–≤–∞
        if advantage_text:
            advantages = self._parse_advanced_list(advantage_text)

        return requirements, advantages

    def _parse_advanced_list(self, text: str) -> List[str]:
        """–£–ª—É—á—à–µ–Ω–Ω—ã–π –ø–∞—Ä—Å–∏–Ω–≥ —Å–ø–∏—Å–∫–æ–≤ —Å —Ä–∞–∑–ª–∏—á–Ω—ã–º–∏ —Ñ–æ—Ä–º–∞—Ç–∞–º–∏"""
        if not text:
            return []

        items = []
        current_item = ""

        # –†–∞–∑–¥–µ–ª—è–µ–º —Ç–µ–∫—Å—Ç –Ω–∞ —Å—Ç—Ä–æ–∫–∏
        lines = text.split('\n')

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # –û–ø—Ä–µ–¥–µ–ª—è–µ–º –Ω–∞—á–∞–ª–æ –Ω–æ–≤–æ–≥–æ –ø—É–Ω–∫—Ç–∞
            is_new_item = any(line.startswith(prefix) for prefix in ['-', '‚Ä¢', '‚Äî', '*', '‚úì', '‚Üí']) or \
                         re.match(r'^\d+[\.\)]', line) or \
                         re.match(r'^[a-z][\)\.]', line, re.IGNORECASE)

            if is_new_item and current_item:
                # –°–æ—Ö—Ä–∞–Ω—è–µ–º –ø—Ä–µ–¥—ã–¥—É—â–∏–π –ø—É–Ω–∫—Ç
                cleaned_item = self._clean_list_item(current_item)
                if cleaned_item:
                    items.append(cleaned_item)
                current_item = line
            else:
                # –ü—Ä–æ–¥–æ–ª–∂–∞–µ–º —Ç–µ–∫—É—â–∏–π –ø—É–Ω–∫—Ç
                if current_item:
                    current_item += " " + line
                else:
                    current_item = line

        # –î–æ–±–∞–≤–ª—è–µ–º –ø–æ—Å–ª–µ–¥–Ω–∏–π –ø—É–Ω–∫—Ç
        if current_item:
            cleaned_item = self._clean_list_item(current_item)
            if cleaned_item:
                items.append(cleaned_item)

        # –î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞: —Ä–∞–∑–¥–µ–ª—è–µ–º –ø—É–Ω–∫—Ç—ã, –∫–æ—Ç–æ—Ä—ã–µ –º–æ–≥–ª–∏ —Å–ª–∏—Ç—å—Å—è
        final_items = []
        for item in items:
            # –ü—Ä–æ–≤–µ—Ä—è–µ–º, –Ω–µ —Å–æ–¥–µ—Ä–∂–∏—Ç –ª–∏ –ø—É–Ω–∫—Ç –Ω–µ—Å–∫–æ–ª—å–∫–æ –ø–æ–¥–ø—É–Ω–∫—Ç–æ–≤
            if ';' in item and len(item) > 50:  # –î–ª–∏–Ω–Ω—ã–π –ø—É–Ω–∫—Ç —Å —Ç–æ—á–∫–∞–º–∏ —Å –∑–∞–ø—è—Ç–æ–π
                sub_items = item.split(';')
                for sub_item in sub_items:
                    cleaned_sub = sub_item.strip()
                    if cleaned_sub and len(cleaned_sub) > 3:
                        final_items.append(cleaned_sub)
            else:
                final_items.append(item)

        return [item for item in final_items if item and len(item) > 3]

    def _clean_list_item(self, item: str) -> str:
        """–û—á–∏—Å—Ç–∫–∞ –ø—É–Ω–∫—Ç–∞ —Å–ø–∏—Å–∫–∞ –æ—Ç –º–∞—Ä–∫–µ—Ä–æ–≤"""
        # –£–±–∏—Ä–∞–µ–º —Ä–∞–∑–ª–∏—á–Ω—ã–µ –º–∞—Ä–∫–µ—Ä—ã —Å–ø–∏—Å–∫–∞
        patterns = [
            r'^[‚Ä¢\-‚Äî*\s]+',
            r'^\d+[\.\)]\s*',
            r'^[a-z][\)\.]\s*',
            r'^[‚úì‚Üí‚ñ∂]\s*'
        ]

        for pattern in patterns:
            item = re.sub(pattern, '', item, flags=re.IGNORECASE)

        item = item.strip()

        # –£–±–∏—Ä–∞–µ–º –ª–∏—à–Ω–∏–µ –¥–µ—Ñ–∏—Å—ã –≤ –Ω–∞—á–∞–ª–µ –ø–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏
        item = re.sub(r'^-\s*', '', item)

        return item

    def _get_field_value(self, data: Dict[str, str], field_name: str) -> Optional[str]:
        """–ü–æ–ª—É—á–µ–Ω–∏–µ –∑–Ω–∞—á–µ–Ω–∏—è –ø–æ–ª—è"""
        if field_name in data:
            return data[field_name]

        # –ü–æ–∏—Å–∫ –ø–æ —á–∞—Å—Ç–∏—á–Ω–æ–º—É —Å–æ–≤–ø–∞–¥–µ–Ω–∏—é
        for key in data.keys():
            if field_name.lower() in key.lower():
                return data[key]

        return None

    def _find_section(self, text: str, section_names: List[str]) -> Optional[str]:
        """–ü–æ–∏—Å–∫ —Å–µ–∫—Ü–∏–∏ –ø–æ –Ω–∞–∑–≤–∞–Ω–∏—é"""
        for name in section_names:
            pattern = rf'{name}.*?(?=\n\s*[–ê-–ØA-Z]|\n\n|$)'
            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
            if match:
                # –£–±–∏—Ä–∞–µ–º –Ω–∞–∑–≤–∞–Ω–∏–µ —Å–µ–∫—Ü–∏–∏ –∏–∑ —Ç–µ–∫—Å—Ç–∞
                section_text = match.group(0)
                return re.sub(rf'^{name}[\s:\-]*', '', section_text, flags=re.IGNORECASE).strip()
        return None

# Updated DocumentProcessor
class DocumentProcessor:
    """Holds files uploaded through Colab and parses them as vacancies."""

    def __init__(self):
        self.vacancy_parser = AdvancedVacancyParser()
        self.uploaded_files = {}

    def upload_files(self):
        """Open the Colab upload dialog, store the result and return it."""
        print("–ó–∞–≥—Ä—É–∑–∏—Ç–µ —Ñ–∞–π–ª—ã –≤–∞–∫–∞–Ω—Å–∏–π (DOCX –∏–ª–∏ RTF):")
        self.uploaded_files = files.upload()
        return self.uploaded_files

    def process_vacancy(self, file_name: str) -> Vacancy:
        """Parse a previously uploaded file as a vacancy.

        Raises:
            ValueError: If ``file_name`` is not among the uploaded files.
        """
        if file_name not in self.uploaded_files:
            raise ValueError(f"–§–∞–π–ª {file_name} –Ω–µ –Ω–∞–π–¥–µ–Ω")
        return self.vacancy_parser.parse_file(self.uploaded_files[file_name], file_name)

# Create the processor instance used by the cells below
processor = DocumentProcessor()

In [8]:
pip install docx2txt

Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Installing collected packages: docx2txt
Successfully installed docx2txt-0.9
Note: you may need to restart the kernel to use updated packages.


In [23]:
import re
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
import docx
from google.colab import files
import io
import os
from docx2txt import process as rtf_process

@dataclass
class Candidate:
    """Structured fields extracted from a resume.

    List fields are declared with a ``None`` default and swapped for a fresh
    empty list in ``__post_init__`` so that no two instances share one
    mutable default.
    """
    experience_total: Optional[str] = None
    work_experience: List[Dict[str, Any]] = None
    education: List[Dict[str, Any]] = None
    skills: List[str] = None
    languages: List[Dict[str, Any]] = None
    positions: List[str] = None

    def __post_init__(self):
        """Give every list field that was left as ``None`` its own empty list."""
        list_fields = ("work_experience", "education", "skills", "languages", "positions")
        for list_field in list_fields:
            if getattr(self, list_field) is None:
                setattr(self, list_field, [])

class ResumeParser:
    """–ü–∞—Ä—Å–µ—Ä –¥–ª—è —Ä–µ–∑—é–º–µ"""

    def parse_file(self, file_content: bytes, filename: str) -> Candidate:
        """Parse an uploaded resume file, choosing the text extractor by extension.

        Args:
            file_content: Raw bytes of the uploaded file.
            filename: Original file name; only its extension is inspected.

        Returns:
            The parsed ``Candidate``. Any failure (including an unsupported
            extension) is printed and an empty ``Candidate`` is returned.
        """
        try:
            lowered_name = filename.lower()
            if lowered_name.endswith('.docx'):
                extract = self._extract_text_from_docx
            elif lowered_name.endswith('.rtf'):
                extract = self._extract_text_from_rtf
            else:
                raise ValueError(f"–ù–µ–ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ–º—ã–π —Ñ–æ—Ä–º–∞—Ç: {filename}")

            return self.parse_text(extract(file_content))

        except Exception as error:
            # Best effort: report the problem and hand back an empty record.
            print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ —á—Ç–µ–Ω–∏–∏ —Ä–µ–∑—é–º–µ: {error}")
            return Candidate()

    def _extract_text_from_docx(self, file_content: bytes) -> str:
        """Extract plain text from DOCX bytes.

        Non-blank paragraphs come first (one per line), followed by one line
        per table row in which the non-empty cell texts are joined by " | ".
        """
        document = docx.Document(io.BytesIO(file_content))

        chunks = [para.text for para in document.paragraphs if para.text.strip()]

        for table in document.tables:
            for row in table.rows:
                cell_texts = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                if cell_texts:
                    chunks.append(" | ".join(cell_texts))

        return "\n".join(chunks)

    def _extract_text_from_rtf(self, file_content: bytes) -> str:
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ —Ç–µ–∫—Å—Ç–∞ –∏–∑ RTF —Å —É–ª—É—á—à–µ–Ω–Ω–æ–π –æ–±—Ä–∞–±–æ—Ç–∫–æ–π"""
        try:
            # –î–µ–∫–æ–¥–∏—Ä—É–µ–º –±–∞–π—Ç—ã –≤ —Å—Ç—Ä–æ–∫—É
            rtf_text = file_content.decode('utf-8', errors='ignore')

            # –£–ø—Ä–æ—â–µ–Ω–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞ RTF
            text = self._simple_rtf_to_text(rtf_text)
            return text

        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–µ RTF: {e}")
            # –ê–ª—å—Ç–µ—Ä–Ω–∞—Ç–∏–≤–Ω—ã–π –º–µ—Ç–æ–¥: –ø–æ–ø—Ä–æ–±—É–µ–º –ø—Ä–æ—Å—Ç–æ —É–¥–∞–ª–∏—Ç—å RTF —Ç–µ–≥–∏
            return self._fallback_rtf_processing(file_content)

    def _simple_rtf_to_text(self, rtf_text: str) -> str:
        """–£–ø—Ä–æ—â–µ–Ω–Ω—ã–π –∫–æ–Ω–≤–µ—Ä—Ç–µ—Ä RTF –≤ —Ç–µ–∫—Å—Ç"""
        # –£–¥–∞–ª—è–µ–º RTF –∑–∞–≥–æ–ª–æ–≤–æ–∫
        text = re.sub(r'\\[a-zA-Z]+\d*', ' ', rtf_text)
        text = re.sub(r'\{.*?\}', ' ', text)
        text = re.sub(r'\\[{}]', '', text)

        # –ó–∞–º–µ–Ω—è–µ–º —Å–ø–µ—Ü–∏–∞–ª—å–Ω—ã–µ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏
        text = text.replace(r'\par', '\n')
        text = text.replace(r'\line', '\n')
        text = text.replace(r'\tab', '\t')
        text = text.replace(r'\emdash', '‚Äî')
        text = text.replace(r'\endash', '‚Äì')

        # –£–¥–∞–ª—è–µ–º –ª–∏—à–Ω–∏–µ –ø—Ä–æ–±–µ–ª—ã –∏ –ø–µ—Ä–µ–Ω–æ—Å—ã
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'\n\s+', '\n', text)

        return text.strip()

    def _fallback_rtf_processing(self, file_content: bytes) -> str:
        """–ê–≤–∞—Ä–∏–π–Ω—ã–π –º–µ—Ç–æ–¥ –æ–±—Ä–∞–±–æ—Ç–∫–∏ RTF"""
        try:
            # –ü—Ä–æ—Å—Ç–æ –ø—ã—Ç–∞–µ–º—Å—è –¥–µ–∫–æ–¥–∏—Ä–æ–≤–∞—Ç—å –∫–∞–∫ —Ç–µ–∫—Å—Ç, –∏–≥–Ω–æ—Ä–∏—Ä—É—è RTF —Ä–∞–∑–º–µ—Ç–∫—É
            text = file_content.decode('utf-8', errors='ignore')
            # –£–¥–∞–ª—è–µ–º —è–≤–Ω—ã–µ RTF —Ç–µ–≥–∏
            text = re.sub(r'\\[a-zA-Z]+\d*', ' ', text)
            text = re.sub(r'[{}]', ' ', text)
            text = re.sub(r'\s+', ' ', text)
            return text
        except:
            return "–ù–µ —É–¥–∞–ª–æ—Å—å –∏–∑–≤–ª–µ—á—å —Ç–µ–∫—Å—Ç –∏–∑ RTF —Ñ–∞–π–ª–∞"

    def parse_text(self, text: str) -> Candidate:
        """Build a ``Candidate`` from resume text.

        The text is normalised first; total experience, skills, languages
        and positions are then extracted from the normalised text.
        """
        normalized = self._normalize_text(text)

        return Candidate(
            experience_total=self._extract_experience(normalized),
            skills=self._extract_skills(normalized),
            languages=self._extract_languages(normalized),
            positions=self._extract_positions(normalized),
        )

    def _normalize_text(self, text: str) -> str:
        """–ù–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è —Ç–µ–∫—Å—Ç–∞"""
        text = re.sub(r'[‚Äì‚Äî‚àí‚Äê]', '-', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def _extract_experience(self, text: str) -> Optional[str]:
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –æ–ø—ã—Ç–∞ —Ä–∞–±–æ—Ç—ã"""
        patterns = [
            r'–û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã[\s:\-]*([^\n]+?)(?=\n|$)',
            r'–°—Ç–∞–∂[\s:\-]*([^\n]+?)(?=\n|$)',
            r'Experience[\s:\-]*([^\n]+?)(?=\n|$)',
        ]

        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                experience = match.group(1).strip()
                experience = re.sub(r'[|\-]\s*', '', experience)
                return experience
        return None

    def _extract_skills(self, text: str) -> List[str]:
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –Ω–∞–≤—ã–∫–æ–≤"""
        skills_section = self._find_section(text, ['–ù–∞–≤—ã–∫–∏', 'Skills'])
        if not skills_section:
            return []

        skills_section = re.sub(r'^(–ù–∞–≤—ã–∫–∏|Skills)[\s:\-]*', '', skills_section, flags=re.IGNORECASE)
        skills = re.findall(r'\b[A-Z–ê-–Ø][A-Za-z–ê-–Ø–∞-—è0-9\s\/\+\.\-]+\b', skills_section)

        filtered_skills = []
        for skill in skills:
            skill = skill.strip()
            if (2 < len(skill) < 50 and
                not any(x in skill.lower() for x in ['–Ω–∞–≤—ã–∫–∏', 'skills', '—è–∑—ã–∫–∏', 'languages']) and
                not re.match(r'^[0-9\s\-]+$', skill)):
                filtered_skills.append(skill)

        return list(set(filtered_skills))

    def _extract_languages(self, text: str) -> List[Dict[str, Any]]:
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ —è–∑—ã–∫–æ–≤"""
        languages_section = self._find_section(text, ['–Ø–∑—ã–∫–∏', 'Languages'])
        if not languages_section:
            return []

        languages = []
        patterns = [
            r'([–ê-–Ø–∞-—èA-Za-z]+)[\s\-:]+([–ê-–Ø–∞-—èA-Za-z0-9\s\-]+)',
            r'([–ê-–Ø–∞-—èA-Za-z]+)[\s\-]+—É—Ä–æ–≤–µ–Ω—å[\s\-]+([–ê-–Ø–∞-—èA-Za-z0-9\s\-]+)',
        ]

        for pattern in patterns:
            matches = re.findall(pattern, languages_section, re.IGNORECASE)
            for lang, level in matches:
                lang = lang.strip()
                level = level.strip()
                if lang and level and len(lang) > 2:
                    languages.append({'language': lang, 'level': level})

        return languages

    def _extract_positions(self, text: str) -> List[str]:
        """Return the distinct job titles mentioned in the text, sorted.

        Two case-insensitive patterns are used: a fixed list of titles, and a
        seniority word (Senior/Junior/Lead/Principal) followed by one more
        word. The full matched text is collected, so the second pattern
        yields "Senior Developer" rather than just "Senior".

        Returns:
            A sorted list of unique matches (empty when nothing matches).
        """
        found = set()
        position_patterns = [
            r'(–í–µ–¥—É—â–∏–π —Å–ø–µ—Ü–∏–∞–ª–∏—Å—Ç|–°—Ç–∞—Ä—à–∏–π —Å–ø–µ—Ü–∏–∞–ª–∏—Å—Ç|–°–ø–µ—Ü–∏–∞–ª–∏—Å—Ç|–ò–Ω–∂–µ–Ω–µ—Ä|–†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫|–ê–Ω–∞–ª–∏—Ç–∏–∫|–ú–µ–Ω–µ–¥–∂–µ—Ä|–ê–¥–º–∏–Ω–∏—Å—Ç—Ä–∞—Ç–æ—Ä)',
            r'(Senior|Junior|Lead|Principal)\s+[A-Za-z–ê-–Ø–∞-—è]+',
        ]

        for pattern in position_patterns:
            # finditer + group(0) takes the whole match; re.findall would
            # return only the capture group and drop the word after the
            # seniority prefix.
            for hit in re.finditer(pattern, text, re.IGNORECASE):
                found.add(hit.group(0).strip())

        # Sorted so the result does not depend on set iteration order.
        return sorted(found)

    def _find_section(self, text: str, section_names: List[str]) -> Optional[str]:
        """–ü–æ–∏—Å–∫ —Å–µ–∫—Ü–∏–∏ –ø–æ –Ω–∞–∑–≤–∞–Ω–∏—é"""
        for name in section_names:
            pattern = rf'{name}.*?(?=\n\s*[–ê-–ØA-Z]|\n\n|$)'
            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
            if match:
                return match.group(0)
        return None

# Add the resume parser to DocumentProcessor
class DocumentProcessor:
    def __init__(self):
        self.resume_parser = ResumeParser()
        self.uploaded_files = {}

    def upload_files(self):
        print("–ó–∞–≥—Ä—É–∑–∏—Ç–µ —Ñ–∞–π–ª—ã (DOCX –∏–ª–∏ RTF):")
        uploaded = files.upload()
        self.uploaded_files = uploaded
        return uploaded

    def upload_files(self, file_paths=None):
        """
        –ó–∞–≥—Ä—É–∂–∞–µ—Ç —Ñ–∞–π–ª—ã –ø–æ –∑–∞–¥–∞–Ω–Ω—ã–º –ø—É—Ç—è–º.
        :param file_paths: –°–ø–∏—Å–æ–∫ –ø—É—Ç–µ–π –∫ —Ñ–∞–π–ª–∞–º (—Å—Ç—Ä–æ–∫–∏) –∏–ª–∏ None (–µ—Å–ª–∏ –∑–∞–≥—Ä—É–∂–∞—Ç—å –∏–∑ —Ç–µ–∫—É—â–µ–π –¥–∏—Ä–µ–∫—Ç–æ—Ä–∏–∏)
        :return: –°–ª–æ–≤–∞—Ä—å —Å –∏–º–µ–Ω–∞–º–∏ —Ñ–∞–π–ª–æ–≤ –∏ –∏—Ö —Å–æ–¥–µ—Ä–∂–∏–º—ã–º
        """
        if file_paths is None:
            # –ï—Å–ª–∏ –Ω–µ —É–∫–∞–∑–∞–Ω—ã –ø—É—Ç–∏ ‚Äî –∏—â–µ–º –≤—Å–µ .docx –∏ .rtf –≤ —Ç–µ–∫—É—â–µ–π –ø–∞–ø–∫–µ
            file_paths = []
            for ext in ['docx', 'rtf']:
                file_paths.extend([f for f in os.listdir('.') if f.lower().endswith(ext)])

        uploaded = {}
        for path in file_paths:
            if not os.path.exists(path):
                print(f"–§–∞–π–ª –Ω–µ –Ω–∞–π–¥–µ–Ω: {path}")
                continue

            try:
                filename = os.path.basename(path)
                print(f"–û–±—Ä–∞–±–∞—Ç—ã–≤–∞—é —Ñ–∞–π–ª: {filename}")

                if path.lower().endswith('.docx'):
                    doc = Document(path)
                    text = ' '.join([para.text for para in doc.paragraphs])
                elif path.lower().endswith('.rtf'):
                    text = rtf_process(path)
                else:
                    print(f"–ù–µ–ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ–º—ã–π —Ñ–æ—Ä–º–∞—Ç: {path}")
                    continue

                uploaded[filename] = {
                    'path': path,
                    'text': text
                }
                print(f"–ó–∞–≥—Ä—É–∂–µ–Ω: {filename}")

            except Exception as e:
                print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ —á—Ç–µ–Ω–∏–∏ {path}: {e}")

        self.uploaded_files = uploaded
        return uploaded

    def process_resume(self, file_name: str) -> Candidate:
        if file_name in self.uploaded_files:
            return self.resume_parser.parse_file(self.uploaded_files[file_name], file_name)
        else:
            raise ValueError(f"–§–∞–π–ª {file_name} –Ω–µ –Ω–∞–π–¥–µ–Ω")

# Create the processor instance used by the cells below
processor = DocumentProcessor()

In [25]:
# Load the input files from disk
print("–ó–∞–≥—Ä—É–∑–∏—Ç–µ –≤–∞—à–∏ —Ñ–∞–π–ª—ã (DOCX):")
input_paths = ['/kaggle/input/111111111111111111/1  .rtf']
uploaded = processor.upload_files(input_paths)
print("–ó–∞–≥—Ä—É–∂–µ–Ω—ã —Ñ–∞–π–ª—ã:", list(uploaded))

–ó–∞–≥—Ä—É–∑–∏—Ç–µ –≤–∞—à–∏ —Ñ–∞–π–ª—ã (DOCX):
–û–±—Ä–∞–±–∞—Ç—ã–≤–∞—é —Ñ–∞–π–ª: 1  .rtf
–û—à–∏–±–∫–∞ –ø—Ä–∏ —á—Ç–µ–Ω–∏–∏ /kaggle/input/111111111111111111/1  .rtf: File is not a zip file
–ó–∞–≥—Ä—É–∂–µ–Ω—ã —Ñ–∞–π–ª—ã: []


In [None]:
# Process the vacancy files and print a formatted report for each one
print("–û–±—Ä–∞–±–æ—Ç–∫–∞ –≤–∞–∫–∞–Ω—Å–∏–π...")
# A file counts as a vacancy when its lower-cased name contains one of these keywords.
vacancy_files = [f for f in uploaded.keys() if any(kw in f.lower() for kw in ['–≤–∞–∫–∞–Ω—Å', '–æ–ø–∏—Å–∞–Ω', 'vacanc', 'job'])]

for vacancy_file in vacancy_files:
    try:
        # Banner with the name of the file being processed.
        print(f"\n{'='*80}")
        print(f"üîç –û–ë–†–ê–ë–û–¢–ö–ê –í–ê–ö–ê–ù–°–ò–ò: {vacancy_file}")
        print(f"{'='*80}")

        vacancy = processor.process_vacancy(vacancy_file)

        # Scalar fields; a placeholder is printed when a field is empty.
        print("‚úÖ –û–°–ù–û–í–ù–ê–Ø –ò–ù–§–û–†–ú–ê–¶–ò–Ø:")
        print(f"   üìã –î–æ–ª–∂–Ω–æ—Å—Ç—å: {vacancy.title or '–ù–µ —É–∫–∞–∑–∞–Ω–∞'}")
        print(f"   üèôÔ∏è –ì–æ—Ä–æ–¥: {vacancy.city or '–ù–µ —É–∫–∞–∑–∞–Ω'}")
        print(f"   üíº –¢–∏–ø –∑–∞–Ω—è—Ç–æ—Å—Ç–∏: {vacancy.employment_type or '–ù–µ —É–∫–∞–∑–∞–Ω'}")
        print(f"   üìÖ –ì—Ä–∞—Ñ–∏–∫ —Ä–∞–±–æ—Ç—ã: {vacancy.work_schedule or '–ù–µ —É–∫–∞–∑–∞–Ω'}")
        print(f"   üéì –£—Ä–æ–≤–µ–Ω—å –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è: {vacancy.education_level or '–ù–µ —É–∫–∞–∑–∞–Ω'}")
        print(f"   ‚è≥ –¢—Ä–µ–±—É–µ–º—ã–π –æ–ø—ã—Ç: {vacancy.experience_required or '–ù–µ —É–∫–∞–∑–∞–Ω'}")

        # Numbered list of mandatory requirements.
        print(f"\nüìã –û–°–ù–û–í–ù–´–ï –¢–†–ï–ë–û–í–ê–ù–ò–Ø ({len(vacancy.requirements)}):")
        for i, req in enumerate(vacancy.requirements, 1):
            print(f"   {i:2d}. {req}")

        # The optional "advantages" list is printed only when it is non-empty.
        if vacancy.advantages:
            print(f"\n‚≠ê –ë–£–î–ï–¢ –ü–†–ï–ò–ú–£–©–ï–°–¢–í–û–ú ({len(vacancy.advantages)}):")
            for i, advantage in enumerate(vacancy.advantages, 1):
                print(f"   {i:2d}. {advantage}")

        # Numbered list of responsibilities.
        print(f"\nüìù –û–ë–Ø–ó–ê–ù–ù–û–°–¢–ò ({len(vacancy.responsibilities)}):")
        for i, resp in enumerate(vacancy.responsibilities, 1):
            print(f"   {i:2d}. {resp}")

    except Exception as e:
        # Report the failure with a traceback and continue with the next file.
        print(f"‚ùå –û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–µ {vacancy_file}: {e}")
        import traceback
        traceback.print_exc()

In [None]:
# Process the resume files and print a summary for each candidate
print("\n–û–±—Ä–∞–±–æ—Ç–∫–∞ —Ä–µ–∑—é–º–µ...")
# A file counts as a resume when its lower-cased name contains one of these keywords.
resume_files = [f for f in uploaded.keys() if any(kw in f.lower() for kw in ['—Ä–µ–∑—é–º–µ', 'cv', 'resume', '–æ–±—Ä–∞–∑–µ—Ü'])]

for resume_file in resume_files:
    try:
        print(f"\nüîç –û–±—Ä–∞–±–∞—Ç—ã–≤–∞—é —Ä–µ–∑—é–º–µ: {resume_file}")
        candidate = processor.process_resume(resume_file)

        print("‚úÖ –î–ê–ù–ù–´–ï –ö–ê–ù–î–ò–î–ê–¢–ê:")
        print(f"üìÖ –û–±—â–∏–π –æ–ø—ã—Ç: {candidate.experience_total or '–ù–µ —É–∫–∞–∑–∞–Ω'}")

        # The header shows the total count; at most the first 5 positions are listed.
        print(f"\nüíº –î–æ–ª–∂–Ω–æ—Å—Ç–∏ ({len(candidate.positions)}):")
        for i, pos in enumerate(candidate.positions[:5], 1):
            print(f"   {i}. {pos}")

        # The header shows the total count; at most the first 15 skills are listed.
        print(f"\nüõ†Ô∏è  –ù–∞–≤—ã–∫–∏ ({len(candidate.skills)}):")
        for i, skill in enumerate(candidate.skills[:15], 1):
            print(f"   {i}. {skill}")

        # Each language entry is a dict with 'language' and 'level' keys.
        print(f"\nüåê –Ø–∑—ã–∫–∏ ({len(candidate.languages)}):")
        for i, lang in enumerate(candidate.languages, 1):
            print(f"   {i}. {lang['language']} - {lang['level']}")

    except Exception as e:
        # Report the failure and continue with the next file.
        print(f"‚ùå –û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–µ {resume_file}: {e}")