In [1]:
#%pip install python-docx

In [27]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import os
import torch
import docx
from sklearn.metrics.pairwise import cosine_similarity
import re

In [8]:
# Checkpoint identifier passed to `from_pretrained`; it is reused in a later cell
# when the model weights are loaded.
model_path = 'OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k'
# Tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [9]:
# `get_embedding` calls the tokenizer with padding=True, so a pad token must be set.
# Fall back to the end-of-sequence token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [4]:
# 4-bit bitsandbytes quantization; float16 is used as the compute dtype.
# Only quantization options belong here: `torch_dtype` and `use_flash_attention_2`
# are `from_pretrained` arguments, and BitsAndBytesConfig ignores unknown keyword
# arguments, so passing them here had no effect.
config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [5]:
# Load the causal LM using the quantization settings held in `config`.
# device_map="auto" leaves weight placement to the library.
# torch_dtype is given here because it is a `from_pretrained` argument; it has no
# effect when passed to BitsAndBytesConfig.
# NOTE(security): trust_remote_code=True allows Python code shipped with the
# checkpoint repository to run locally. Keep it only if the checkpoint requires
# custom modelling code and the repository is trusted.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    quantization_config=config
)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [23]:
def read_docx(file_path):
    """Parse the topic rules stored in a .docx file.

    A paragraph that starts with a digit and contains a dot opens a new subtopic.
    The paragraphs that follow it may carry three kinds of semicolon-separated
    lists, each recognised by its fixed (Russian) prefix: keywords that are all
    required, keywords of which any one suffices, and words to exclude.

    Args:
        file_path: Path of the .docx file with the rules.

    Returns:
        dict mapping the subtopic heading text to a dict with the list-valued
        keys "keywords_all", "keywords_any" and "exclude".
    """
    doc = docx.Document(file_path)

    # Prefix of a rule paragraph -> key of the list it fills.
    prefix_to_field = (
        ("Ключевые слова одновременно необходимые для поиска (и):", "keywords_all"),
        ("Ключевые слова при поиске которых достаточно совпадения одного слова или словосочетания (или):", "keywords_any"),
        ("Исключить слова:", "exclude"),
    )

    topics = {}
    current_rules = None  # rules of the subtopic being filled; None until the first heading
    for para in doc.paragraphs:
        # Work on the stripped text so leading whitespace cannot hide a heading or a prefix.
        text = para.text.strip()
        if not text:
            continue

        if text[0].isdigit() and '.' in text:  # Subtopic
            current_rules = {
                "keywords_all": [],
                "keywords_any": [],
                "exclude": []
            }
            topics[text] = current_rules
            continue

        if current_rules is None:
            # A rule line before any subtopic heading has nothing to attach to.
            continue

        for prefix, field in prefix_to_field:
            if text.startswith(prefix):
                raw_items = text[len(prefix):].split(';')
                # Drop the empty entries produced by a trailing or doubled ';'.
                current_rules[field].extend(
                    item.strip() for item in raw_items if item.strip()
                )
                break
    return topics

def read_txt(file_path):
    """Return the full contents of the UTF-8 text file at `file_path`."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return text

def get_embedding(text):
    """Embed `text` as the mean of the model's last-layer hidden states.

    The module-level `tokenizer` and `model` are used. The hidden states are
    averaged over the real (non-padding) tokens. Position 0 alone is not a
    usable embedding for a causal LM: with causal attention the first position
    sees only itself, so its state is (nearly) the same for every input.

    Args:
        text: String to embed (anything the tokenizer accepts).

    Returns:
        numpy float32 array; one-dimensional (hidden size) for a single string.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Pool in float32: summing many float16 states can overflow.
    last_hidden = outputs.hidden_states[-1].float()
    # The mask may live on a different device than the last layer's output.
    mask = inputs["attention_mask"].unsqueeze(-1).to(
        device=last_hidden.device, dtype=last_hidden.dtype
    )
    token_count = mask.sum(dim=1).clamp(min=1.0)  # guard against an all-padding row
    pooled = (last_hidden * mask).sum(dim=1) / token_count
    return pooled.squeeze().cpu().numpy()

def classify_document(content, topics):
    """Pick the topic whose keywords are most similar to `content`.

    The document and each topic's keyword text are embedded with
    `get_embedding` and compared by cosine similarity.

    Args:
        content: Document text.
        topics: Mapping of topic name to a rules dict. Keywords are taken from
            the "keywords_all" and "keywords_any" lists (the keys produced by
            `read_docx`); a "keywords" list is also accepted. The "exclude"
            list is not used here.

    Returns:
        The name of the best-matching topic, or "Unclassified" when no topic
        has any keywords or none scores above -1.
    """
    content_embedding = get_embedding(content)
    max_similarity = -1
    best_topic = "Unclassified"

    for topic, rules in topics.items():
        keywords = [
            *rules.get("keywords", []),
            *rules.get("keywords_all", []),
            *rules.get("keywords_any", []),
        ]
        topic_keywords = " ".join(keyword for keyword in keywords if keyword)
        if not topic_keywords:
            # Nothing to compare against; an empty text would only add noise.
            continue

        topic_embedding = get_embedding(topic_keywords)
        similarity = cosine_similarity([content_embedding], [topic_embedding])[0][0]

        # Strict comparison: on a tie the topic seen first is kept.
        if similarity > max_similarity:
            max_similarity = similarity
            best_topic = topic

    return best_topic

def sanitize_folder_name(name):
    """Return `name` with the characters that are invalid in a folder name removed."""
    # The characters are deleted outright, not replaced by a placeholder.
    forbidden = '<>:"/\\|?*'
    deletion_table = str.maketrans('', '', forbidden)
    return name.translate(deletion_table)

def distribute_documents(doc_folder, docx_file):
    """Classify every .txt file under `doc_folder` and move it into its topic folder.

    Topic folders are created directly inside `doc_folder` and named after the
    sanitized topic; files that get the "Unclassified" label go to
    `doc_folder/unclassified`. Destination folders at the top level are not
    walked, so files that were already sorted are neither re-read nor
    re-classified. If a file of the same name already exists in the
    destination, a numeric suffix is appended instead of overwriting it.

    NOTE: a later cell defines another function with this same name; whichever
    cell ran last is the one that gets called.

    Args:
        doc_folder: Root folder holding the documents.
        docx_file: Path of the .docx file with the topic rules.
    """
    topics = read_docx(docx_file)
    unclassified_folder = os.path.join(doc_folder, 'unclassified')
    os.makedirs(unclassified_folder, exist_ok=True)

    # Names of the folders this function writes into (direct children of doc_folder).
    destination_names = {'unclassified'}
    destination_names.update(sanitize_folder_name(topic) for topic in topics)

    for root, dirs, files in os.walk(doc_folder):
        if root == doc_folder:
            # Prune in place so os.walk does not descend into our own output folders.
            dirs[:] = [d for d in dirs if d not in destination_names]

        for filename in files:
            if not filename.endswith(".txt"):
                continue
            file_path = os.path.join(root, filename)
            content = read_txt(file_path)
            topic = classify_document(content, topics)

            if topic == "Unclassified":
                target_folder = unclassified_folder
            else:
                target_folder = os.path.join(doc_folder, sanitize_folder_name(topic))
                os.makedirs(target_folder, exist_ok=True)

            target_path = os.path.join(target_folder, filename)
            if os.path.abspath(target_path) == os.path.abspath(file_path):
                continue  # already where it belongs

            # Never overwrite an existing file: pick the first free "<stem>_<n><ext>".
            stem, ext = os.path.splitext(filename)
            suffix = 1
            while os.path.exists(target_path):
                target_path = os.path.join(target_folder, f"{stem}_{suffix}{ext}")
                suffix += 1
            os.rename(file_path, target_path)

In [29]:
# This function was written to resume classification after an interrupted run.
def distribute_documents(doc_folder, docx_file):
    """Classify the .txt files that are still in the NNNN_NN source folders.

    Only direct subfolders of `doc_folder` whose name is four digits, an
    underscore and two digits are walked, so files already moved into topic
    folders are left alone. Each file is moved to `doc_folder/<sanitized topic>`;
    files labelled "Unclassified" go to `doc_folder/unclassified`, the same
    folder the non-resuming variant defined in an earlier cell uses. If a file
    of the same name already exists in the destination, a numeric suffix is
    appended instead of overwriting it.

    Args:
        doc_folder: Root folder holding the source folders and the topic folders.
        docx_file: Path of the .docx file with the topic rules.
    """
    topics = read_docx(docx_file)
    folder_pattern = re.compile(r'^\d{4}_\d{2}$')
    for folder in os.listdir(doc_folder):
        folder_path = os.path.join(doc_folder, folder)
        # Skip anything that is not a directory named in the NNNN_NN format.
        if not (os.path.isdir(folder_path) and folder_pattern.match(folder)):
            continue

        for root, _, files in os.walk(folder_path):
            for filename in files:
                if not filename.endswith(".txt"):
                    continue
                file_path = os.path.join(root, filename)
                content = read_txt(file_path)
                topic = classify_document(content, topics)

                if topic == "Unclassified":
                    target_folder = os.path.join(doc_folder, 'unclassified')
                else:
                    target_folder = os.path.join(doc_folder, sanitize_folder_name(topic))
                os.makedirs(target_folder, exist_ok=True)

                target_path = os.path.join(target_folder, filename)
                if os.path.abspath(target_path) == os.path.abspath(file_path):
                    continue  # already where it belongs

                # Source folders may hold files with the same name; never overwrite,
                # pick the first free "<stem>_<n><ext>" instead.
                stem, ext = os.path.splitext(filename)
                suffix = 1
                while os.path.exists(target_path):
                    target_path = os.path.join(target_folder, f"{stem}_{suffix}{ext}")
                    suffix += 1
                os.rename(file_path, target_path)

In [None]:
# Usage example
# NOTE(review): both paths are absolute and machine-specific; adjust them before running.
doc_folder = r"C:\Users\User\Documents\data_full" # folder with the documents to distribute
docx_file = r"C:\Users\User\Documents\docs_from_load\2_5463026898172925745.docx" # docx file with the distribution rules
distribute_documents(doc_folder, docx_file)