In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from sentence_transformers import SentenceTransformer, util
import pdfplumber
import spacy
import string
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import recall_score
import pandas as pd
# Shared spaCy pipeline, loaded once at import time from the "ru_core_news_sm" package;
# get_final_df uses it to score the similarity between text windows and the reference phrase.
nlp = spacy.load("ru_core_news_sm")

In [2]:
# Грузим весь текст
def get_text_from_pdf(pdf_path: str) -> tuple[str, dict]:
    """Read a PDF from disk and return its text as one string plus page boundaries.

    Args:
        pdf_path: Path of the PDF file; it is opened with ``pdfplumber.open``.

    Returns:
        A ``(pdf_text, page_number)`` tuple.

        * ``pdf_text`` - the text of all pages concatenated in page order, with
          every newline replaced by a space.
        * ``page_number`` - maps the 0-based page index to the cumulative number
          of characters up to and including that page, i.e. the exclusive end
          offset of the page inside ``pdf_text``.
    """
    page_texts = []
    page_number = {}
    total_len = 0
    with pdfplumber.open(pdf_path) as pdf:
        for page_id, page in enumerate(pdf.pages):
            # A page without a text layer may yield None instead of a string
            # (observed with some pdfplumber versions); treat it as empty text.
            page_text = (page.extract_text() or "").replace("\n", " ")
            total_len += len(page_text)
            page_number[page_id] = total_len
            page_texts.append(page_text)
    # Join once instead of growing a string page by page.
    return "".join(page_texts), page_number

def get_loaded_pdf(pdf):
    """Extract the text and page boundaries from an already opened PDF object.

    Args:
        pdf: An opened PDF exposing ``pages``; each page must provide
            ``extract_text()`` (the caller in this file passes the object
            returned by ``pdfplumber.open``).

    Returns:
        A ``(pdf_text, page_number)`` tuple.

        * ``pdf_text`` - the text of all pages concatenated in page order, with
          every newline replaced by a space.
        * ``page_number`` - maps the 0-based page index to the cumulative number
          of characters up to and including that page, i.e. the exclusive end
          offset of the page inside ``pdf_text``.
    """
    page_texts = []
    page_number = {}
    total_len = 0
    for page_id, page in enumerate(pdf.pages):
        # A page without a text layer may yield None instead of a string
        # (observed with some pdfplumber versions); treat it as empty text.
        page_text = (page.extract_text() or "").replace("\n", " ")
        total_len += len(page_text)
        page_number[page_id] = total_len
        page_texts.append(page_text)
    # Join once instead of growing a string page by page.
    return "".join(page_texts), page_number

def get_page_id(symbol_id:int, page_number:dict) -> int:
    """Return the 0-based page on which a character offset falls.

    Args:
        symbol_id: Character offset inside the concatenated document text.
        page_number: Mapping ``page index -> cumulative character count`` with
            contiguous keys ``0 .. len(page_number) - 1``.

    Returns:
        The first page whose cumulative count is greater than or equal to
        ``symbol_id``, or ``-1`` when the offset lies beyond the last page
        (or the mapping is empty).
    """
    candidate_pages = (
        idx for idx in range(len(page_number)) if page_number[idx] >= symbol_id
    )
    return next(candidate_pages, -1)

In [3]:
%%time
# Версия создания финального df с NER. Будем накладывать NER после первого преобразования

# Minimum TF-IDF cosine similarity a text window must exceed to be kept by get_final_df.
THRESHOLD = 0.5

def get_nearest_similarities(text: str, standard_sub_text: str) -> pd.DataFrame:
    """Score every sliding window of ``text`` against a reference phrase.

    One window is taken at every character offset of ``text``; each window is
    one character longer than ``standard_sub_text``. A ``TfidfVectorizer`` is
    fitted on the windows, the reference is projected into the same space and
    the similarity is the linear kernel between the reference and each window.

    Args:
        text: Full document text to search in.
        standard_sub_text: Reference phrase to look for.

    Returns:
        A DataFrame with columns ``texts`` (the window) and
        ``cos_similarities`` (its score), restricted to windows with a score
        above zero. The frame index is the character offset at which the window
        starts. An empty frame with the same columns is returned when either
        input is empty or when no vocabulary can be built from the windows.
    """
    empty_result = pd.DataFrame({
        "texts": pd.Series(dtype="object"),
        "cos_similarities": pd.Series(dtype="float64"),
    })
    if not text or not standard_sub_text:
        # Nothing to compare; fitting the vectorizer on such input would raise.
        return empty_result

    window_len = len(standard_sub_text) + 1
    sub_texts = [text[start:start + window_len] for start in range(len(text))]

    tfidf = TfidfVectorizer()
    try:
        mx_tf = tfidf.fit_transform(sub_texts)
    except ValueError:
        # scikit-learn raises ValueError when the windows contain no tokens
        # (empty vocabulary), e.g. a text made only of punctuation.
        return empty_result
    entry = tfidf.transform([standard_sub_text])
    cosine_similarities = linear_kernel(entry, mx_tf).flatten()

    df = pd.DataFrame({"texts": sub_texts, "cos_similarities": cosine_similarities})
    return df[df.cos_similarities > 0]

# Преобразуем в удобный вид
def get_final_df(df, reference_seq, page_number, whole_text):
    """Reduce raw window scores to one refined row per detected match.

    Steps:
      1. Keep only windows whose ``cos_similarities`` exceeds ``THRESHOLD``.
      2. Windows that start close to each other form a cluster; only the last
         window of each cluster is kept (a window is "last" when the next kept
         window starts more than ``len(reference_seq)`` characters later, or
         when there is no next window).
      3. Score every kept window against the reference with spaCy
         (``ner_similarities``).
      4. Refine the start offset of every match inside a +/- ``padding``
         neighbourhood: first by looking for an offset where the text matches
         the beginning of the reference, otherwise by taking the offset with
         the best spaCy similarity.
      5. Build the difference mask between the reference and the refined text.

    Args:
        df: Output of ``get_nearest_similarities``; its index is the character
            offset of each window, with columns ``texts`` and
            ``cos_similarities``.
        reference_seq: Reference phrase.
        page_number: Mapping ``page index -> cumulative character count``.
        whole_text: Full document text the windows were cut from.

    Returns:
        A DataFrame with columns ``index`` (refined start offset), ``texts``
        (refined matched text), ``cos_similarities`` (overwritten with the
        refinement score: 1 for a prefix hit, otherwise the best spaCy
        similarity), ``starts_at``, ``ends_at``, ``start_page_id``,
        ``end_page_id`` (1-based pages), ``ner_similarities`` and ``mask``.
    """
    ref_len = len(reference_seq)
    ref_lower = reference_seq.lower()

    # Filter first: the page lookup below is a linear scan per row, so there is
    # no point in running it for windows that are discarded anyway.
    final_df = df[df["cos_similarities"] > THRESHOLD].copy()

    final_df["starts_at"] = final_df.index
    final_df["ends_at"] = final_df["starts_at"] + final_df["texts"].apply(len)
    final_df["start_page_id"] = final_df["starts_at"].apply(
        lambda symbol_id: get_page_id(symbol_id, page_number) + 1)
    final_df["end_page_id"] = final_df["ends_at"].apply(
        lambda symbol_id: get_page_id(symbol_id, page_number) + 1)

    final_df = final_df.reset_index()

    # shift(-1) yields NaN for the last row and "x < NaN" is False, so without
    # the explicit isna() check the last cluster would always be thrown away.
    next_start = final_df["index"].shift(-1)
    closes_cluster = next_start.isna() | (final_df["index"] + ref_len < next_start)
    # Copy so that the column assignments below do not write into a slice.
    final_df = final_df[closes_cluster].copy()

    standard_doc = nlp(reference_seq)
    final_df["ner_similarities"] = [
        doc.similarity(standard_doc) for doc in nlp.pipe(final_df["texts"])
    ]

    # Find where the phrase actually starts in the text.
    padding = max(10, int(ref_len * 0.4))
    window_overlap = max(min(15, int(ref_len * 0.1)), 1)
    ref_lower_doc = None  # parsed lazily, only if the fallback search is needed

    # Iterate over row labels (not over the "index" column) so that rewriting
    # the column during the loop cannot affect the iteration or the row lookup.
    for row_label in final_df.index:
        ind = int(final_df.at[row_label, "index"])
        min_new = max(0, ind - padding)
        max_new = min(len(whole_text), ind + padding)
        best_id = ind
        best_sim = 0
        prefix_found = False

        # First try to find an exact match with the beginning of the reference.
        # The scan does not stop at the first hit: the right-most offset wins.
        for i in range(min_new, max_new):
            if ref_lower.startswith(whole_text[i:i + window_overlap].lower()):
                best_sim = 1
                best_id = i
                prefix_found = True

        if not prefix_found:
            # No exact prefix match: pick the offset with the best similarity.
            if ref_lower_doc is None:
                ref_lower_doc = nlp(ref_lower)
            for i in range(min_new, max_new):
                sentence = whole_text[i:i + ref_len + 1].lower()
                similarity = nlp(sentence).similarity(ref_lower_doc)
                if similarity > best_sim:
                    best_sim = similarity
                    best_id = i

        best_text = whole_text[best_id:best_id + ref_len + 1]
        final_df.at[row_label, "cos_similarities"] = best_sim
        final_df.at[row_label, "texts"] = best_text
        final_df.at[row_label, "index"] = best_id
        # Keep the position/page columns in sync with the refined offset.
        final_df.at[row_label, "starts_at"] = best_id
        final_df.at[row_label, "ends_at"] = best_id + len(best_text)
        final_df.at[row_label, "start_page_id"] = get_page_id(best_id, page_number) + 1
        final_df.at[row_label, "end_page_id"] = (
            get_page_id(best_id + len(best_text), page_number) + 1)

    final_df["mask"] = final_df["texts"].apply(
        lambda matched: get_diff_mask(reference_seq, matched))

    return final_df

# reference_seq = """ВТОРОЙ ПУТЬ НА ПЕРЕГОНЕ КИЗИР-ЖУРАВЛЕВО КРАСНОЯРСКОЙ ЖЕЛЕЗНОЙ ДОРОГИ"""
# reference_seq = reference_seq.replace("\n", " ")
# fila_name="Кузб-183267_КРАС–ИЭИ1_изм.6.00256-21_КРЭ-26756"
# whole_text, page_number = get_text_from_pdf(f"ПД для ии/{fila_name}.pdf")

# df = get_nearest_similarities(whole_text, reference_seq)

# final_df = get_final_df(df, reference_seq, page_number, whole_text)
# final_df

Wall time: 0 ns


In [4]:
# ещё один вариант поиска расстояний

# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# vectorizer = TfidfVectorizer()
# padding = int(len(reference_seq) * 0.25)
# for ind in final_df["index"][14:]:
#     print(ind)
#     min_new = max(0, ind - padding)
#     max_new = min(len(whole_text), ind + padding)
#     for i in range(min_new, max_new):
#         sentence = whole_text[i:i+len(reference_seq) + 1].lower()
#         tfidf = vectorizer.fit_transform([sentence, reference_seq.lower()])
#         similarity = cosine_similarity(tfidf[0], tfidf[1])[0][0]
#         print(similarity, i,sentence, reference_seq)
#     break


# # fit and transform the sentences
# tfidf = vectorizer.fit_transform([sentence1, sentence2])

# # calculate the cosine similarity between the sentences
# similarity = cosine_similarity(tfidf[0], tfidf[1])[0][0]

In [5]:
# reference_seq = """ПЕРВЫЙ ПУТЬ НА ПЕРЕГОНИ КИЗИР-ЖУРАВЛЕВО КРАСНОЯРСКОЙ ЖЕЛЕЗНОЙ ДОРОГИ"""
# whole_text, page_number = get_text_from_pdf("ПД для ии/Кузб-183267_КРАС–ИЭИ1_изм.6.00256-21_КРЭ-26756.pdf")

# def get_nearest_similarities(text: str, standard_sub_text: str) -> list[str]:
#     text_len = len(text)
#     window_len = len(standard_sub_text)
#     sub_texts = []
#     for i in range(text_len):
#         sub_texts.append(text[i:i+window_len])
#     tfidf = TfidfVectorizer()
#     mx_tf = tfidf.fit_transform(sub_texts)
#     entry = tfidf.transform([standard_sub_text])
#     cosine_similarities = linear_kernel(entry, mx_tf).flatten()
#     df = pd.DataFrame({'texts': sub_texts})
#     df['cos_similarities'] = cosine_similarities
#     df = df[df.cos_similarities > 0]
#     #df['ner_similarities'] = df.texts.apply(lambda text: nlp(text).similarity(nlp(standard_sub_text)))
    
#     return df

#df = get_nearest_similarities(whole_text, reference_seq)

In [6]:
# # Преобразуем в удобный вид

# def get_final_df(df, reference_seq, page_number):
#     class Cumulative:
#         def __init__(self):
#             self.val = 0
#         def increase(self):
#             self.val += 1
#             return self.val

#     cumulative = Cumulative()
#     df["id"] = df.index
#     seq_len = len(reference_seq)
#     df["seq"] = df.id.apply(lambda x: x + 2*seq_len)
#     df["seq_lagged"] = df.seq.shift(1)
#     df["seq_id"] = (df.id > df.seq_lagged).apply(lambda x: cumulative.increase() if x else cumulative.val)
#     seq_ids = df[["texts", "cos_similarities", "seq_id"]].seq_id.unique()

#     best_similarities = []
#     for seq_id in seq_ids:
#         seq_df = df[df.seq_id==seq_id]
#         max_cos_seq_df = seq_df[seq_df.cos_similarities==seq_df.cos_similarities.max()]
#         best_similarities.append(max_cos_seq_df[["texts", "cos_similarities"]].head())

#     final_df = pd.concat(best_similarities)

#     final_df["starts_at"] = final_df.index
#     final_df["ends_at"] = final_df["starts_at"] + final_df["texts"].apply(lambda text: len(text))

#     final_df["start_page_id"] = final_df.starts_at.apply(lambda symbol_id: get_page_id(symbol_id, page_number))
#     final_df["end_page_id"] = final_df.ends_at.apply(lambda symbol_id: get_page_id(symbol_id, page_number))

# #     final_df = final_df[final_df["cos_similarities"] > 0.9]
#     return final_df

# # final_df = get_final_df(df, reference_seq, page_number)
# # final_df

In [7]:
# Нахождение различий между двумя текстами
def find_difference(ref_text, file_text):
    """Describe which kinds of character differences exist between two texts.

    The texts are compared position by position over the length of
    ``ref_text``. Each differing position is put into exactly one category,
    checked in this order:

    * case mismatch - the characters are equal after ``lower()``;
    * quote mismatch - both characters are quote marks (``'``, ``"``, ``«``, ``»``);
    * punctuation mismatch - both characters are in ``string.punctuation``;
    * character mismatch - anything else, including reference characters that
      have no counterpart because ``file_text`` is shorter.

    Characters of ``file_text`` beyond the length of ``ref_text`` are ignored.

    Args:
        ref_text: Reference text.
        file_text: Text found in the file.

    Returns:
        A message listing the detected categories separated by ", ", or the
        "no errors" message when every compared position is identical.
    """
    braces = ("'", '"', "«", "»")
    punctuation_marks = set(string.punctuation)

    letter_mismatch = []
    uppercase_mismatch = []
    braces_mismatch = []
    punctuation_mismatch = []
    for i, (ref_char, file_char) in enumerate(zip(ref_text, file_text)):
        if ref_char == file_char:
            continue
        if ref_char.lower() == file_char.lower():
            uppercase_mismatch.append(i)
        elif ref_char in braces and file_char in braces:
            braces_mismatch.append(i)
        elif ref_char in punctuation_marks and file_char in punctuation_marks:
            punctuation_mismatch.append(i)
        else:
            letter_mismatch.append(i)
    # A shorter file_text used to raise IndexError; the missing tail is now
    # reported as a character mismatch instead.
    letter_mismatch.extend(range(len(file_text), len(ref_text)))

    categories = (
        ("символьное несовпадение", letter_mismatch),
        ("ошибка в регистре", uppercase_mismatch),
        ("неправильные скобки", braces_mismatch),
        ("ошибка в пунктуации", punctuation_mismatch),
    )
    found = [label for label, positions in categories if positions]

    # Every category counts: quote-only or punctuation-only differences must
    # not be reported as "no errors".
    if not found:
        # NOTE(review): the literal below is misspelled; it is kept unchanged
        # so that the user-visible output stays the same.
        return "Ошибки отсутсвтуют"
    # Joining avoids the double space the message had when the first listed
    # category was not "символьное несовпадение".
    return "Были обнаружены ошибки видов: " + ", ".join(found)
        
#find_difference("ВТОРОЙ ПоТЬ НА ПЕРЕГОНЕ КИЗИР-ЖУРАВЛЕВО КРАСНОЯРСКОЙ ЖЕЛЕЗНОЙ ДОРОГИ", "Второй путь на перегоне Кизир-Журавлево Красноярской железной дороги")

In [8]:
def get_diff_mask(standard_text: str, matched_text: str) -> str:
    """Build a mask showing where the matched text agrees with the reference.

    Args:
        standard_text: Reference text; the mask always has its length.
        matched_text: Text found in the document.

    Returns:
        A string of ``len(standard_text)`` characters: at each position the
        matched character when it equals the reference character, otherwise
        ``"_"``. Positions the matched text does not reach (it is shorter than
        the reference, e.g. a match cut off at the end of the document) are
        also ``"_"``. Characters of ``matched_text`` beyond the reference
        length are ignored.
    """
    compared = "".join(
        matched_char if matched_char == standard_char else "_"
        for standard_char, matched_char in zip(standard_text, matched_text)
    )
    # Pad instead of raising IndexError when matched_text is too short.
    missing = max(0, len(standard_text) - len(matched_text))
    return compared + "_" * missing

def parse_model_results(df: pd.DataFrame, standard_text: str, k: float = 0.5) -> list[dict]:
    """Convert the rows of a match DataFrame into a list of report dictionaries.

    Args:
        df: Frame with one row per match; the columns ``texts``,
            ``cos_similarities``, ``ner_similarities``, ``start_page_id`` and
            ``end_page_id`` are read.
        standard_text: Reference text the matches are compared with.
        k: Not used by this function; kept so existing calls keep working.

    Returns:
        One dictionary per row, in row order, holding the matched text, its
        difference mask against ``standard_text``, both similarity scores and
        the distinct pages the match touches in ascending order.
    """
    reference = str(standard_text)
    matches_info = []
    for match_id in range(df.shape[0]):
        matched_object = df.iloc[match_id]
        matches_info.append({
            "Найденное совпадение": matched_object.texts,
            "Отличия": get_diff_mask(reference, str(matched_object.texts)),
            "Степень сходства Cosine similarity": matched_object.cos_similarities,
            "Степень сходства Spacy matching": matched_object.ner_similarities,
            # sorted() makes the page order deterministic; plain set iteration
            # could list the end page before the start page.
            "Страницы": sorted({matched_object.start_page_id, matched_object.end_page_id}),
        })
    return matches_info


In [None]:
import re
import pdfplumber
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QPushButton, QLineEdit, QLabel, QFileDialog, QTableWidget, QTableWidgetItem
import nltk

class MyWidget(QWidget):
    """Window that searches selected PDF files for a reference phrase.

    The user types the reference phrase into the line edit and presses the
    check button; a file dialog then asks for the PDF files. Every match found
    in a file becomes one table row with the file name, the page, the kinds of
    differences, the reference text, the found text with some surrounding
    context, and the difference mask.
    """

    # Number of context characters shown on each side of a match.
    CONTEXT_PADDING = 15

    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        """Create the child widgets, lay them out and apply the style sheet."""
        self.label = QLabel('Файлы:', self)
        self.file_label = QLabel(self)
        self.table = QTableWidget(self)
        self.table.setColumnCount(6)
        self.table.setHorizontalHeaderLabels(
            ['Имя файла', 'Страница', 'Виды ошибок', 'Эталонный текст', 'Найденный текст', 'Маска'])
        self.check_button = QPushButton('Проверить файлы', self)
        self.check_button.clicked.connect(self.check_files)
        self.remove_button = QPushButton('Убрать файлы', self)
        self.remove_button.clicked.connect(self.remove_files)
        self.search_input = QLineEdit(self)
        
        layout = QVBoxLayout()
        layout.addWidget(self.search_input)
        layout.addWidget(self.label)
        layout.addWidget(self.file_label)
        layout.addWidget(self.table)
        layout.addWidget(self.check_button)
        layout.addWidget(self.remove_button)

        self.setLayout(layout)

        self.setStyleSheet('''
            QWidget {
                background-color: #eef8fa;
                color: #6e6e6e;
            }
            QPushButton {
                background-color: #ffe5df;
                color: #6e6e6e;
                border: none;
                padding: 5px;
                margin: 5px;
            }
            QLineEdit {
                padding: 5px;
                margin: 5px;
            }
            QLabel {
                margin: 5px;
            }
            QTableWidget {
                margin: 5px;
            }
        ''')
        
    def choose_files(self):
        """Ask for PDF files, search each for the reference phrase and fill the table."""
        file_dialog = QFileDialog()
        file_dialog.setFileMode(QFileDialog.ExistingFiles)
        if not file_dialog.exec_():
            return

        file_names = file_dialog.selectedFiles()
        self.file_label.setText('\n'.join(file_names))
        self.table.setRowCount(0)

        reference_seq = self.search_input.text().replace("\n", "")
        if not reference_seq:
            # Nothing to search for: the similarity search cannot work with an
            # empty reference, so leave the table empty instead of failing.
            return

        for file_name in file_names:
            # Only the text extraction needs the file; close it before the
            # (potentially long) similarity computation.
            with pdfplumber.open(file_name) as pdf:
                whole_text, page_number = get_loaded_pdf(pdf)
            # Score every window of the document against the reference ...
            df = get_nearest_similarities(whole_text, reference_seq)
            # ... and reduce the scores to one refined row per match.
            final_df = get_final_df(df, reference_seq, page_number, whole_text)
            for _, row in final_df.iterrows():
                self._append_match_row(file_name, reference_seq, whole_text, row)

    def _append_match_row(self, file_name, reference_seq, whole_text, row):
        """Append one table row describing a single match taken from ``row``."""
        padding = self.CONTEXT_PADDING
        # Rows produced by iterrows() may carry the offset as a non-int scalar;
        # slicing needs a real integer.
        start = int(row["index"])
        match_end = start + len(reference_seq) + 1

        left_text = whole_text[max(0, start - padding):start] + "|"
        matched_text = whole_text[start:match_end]
        # Same amount of context on the right as on the left.
        right_text = "|" + whole_text[match_end:match_end + padding]
        res_text_ = left_text + matched_text + right_text

        res = find_difference(reference_seq, row["texts"])

        row_idx = self.table.rowCount()
        self.table.insertRow(row_idx)
        cell_values = (
            file_name,
            str(row["end_page_id"]),
            res,
            reference_seq,
            res_text_,
            row["mask"],
        )
        for column, value in enumerate(cell_values):
            self.table.setItem(row_idx, column, QTableWidgetItem(value))

    def remove_files(self):
        """Clear the list of chosen files and all result rows."""
        self.file_label.clear()
        self.table.setRowCount(0)

    def check_files(self):
        """Reset the table and start a new file selection and search."""
        self.table.setRowCount(0)
        self.choose_files()


if __name__ == '__main__':
    # Only one QApplication may exist per process. When this cell is run again
    # in the same kernel, reuse the existing instance instead of creating a
    # second one.
    app = QApplication.instance()
    if app is None:
        app = QApplication([])
    widget = MyWidget()
    widget.show()
    # Blocks until the window is closed.
    app.exec_()


