# Константы

In [None]:
import os

# OpenAI API key. Taken from the OPENAI_API_KEY environment variable so the
# secret does not have to be stored in the notebook; falls back to an empty
# string (the previous hard-coded value) when the variable is not set.
API_KEY = os.environ.get("OPENAI_API_KEY", "")

# Drive folders with the Whisper transcripts and with the transcripts merged
# with speaker labels.
TRANSCRIBED_FOLDER_PATH = '/content/drive/My Drive/ВКР/multiple-talk-2024-10-27/transcribed/'
FULLY_TRANSCRIBED_FOLDER_PATH = '/content/drive/My Drive/ВКР/multiple-talk-2024-10-27/fully_transcribed/'

# Text encodings used when writing and reading result files.
WINDOWS_1251_ENCODING = 'windows-1251'
UTF8_ENCODING = 'utf-8'

# Root folder of the source audio and folder with the reference transcripts.
AUDIO_FILE_FOLDER_PATH = '/content/drive/My Drive/ВКР/multiple-talk-2024-10-27/'
BASE_TRANSCRIBED_TEXT_PATH = '/content/drive/My Drive/ВКР/text/'

# Output folder for diarization results and the OpenAI transcription endpoint.
DIARIZED_OUTPUT_FOLDER_PATH = '/content/drive/My Drive/ВКР/multiple-talk-2024-10-27/diarized/'
OPEN_API_TRNSCRIBE_URL = 'https://api.openai.com/v1/audio/transcriptions'
# `headers` is kept as an alias of the same dict for backward compatibility.
OPEN_API_AUTH_HEADERS = headers = { 'Authorization': f'Bearer {API_KEY}' }

# Функции для работы с Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import auth
from googleapiclient.discovery import build
import requests
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth
import io

# Authenticate the Colab user, then build the Drive v3 API client that the
# Drive helper functions in this notebook use through `drive_service`.
auth.authenticate_user()
drive_service = build('drive', 'v3')

def get_folder_id(folder_path):
    """Resolve a slash-separated folder path to a Google Drive folder ID.

    The text after the final '/' is ignored, so a path must end with '/' for
    its last folder to be part of the lookup. Leading components that do not
    match any Drive folder (for example the local Colab mount prefix) are
    skipped until the first component resolves.

    Args:
        folder_path: Path such as '/content/drive/My Drive/dir/sub/'.

    Returns:
        The ID of the deepest folder in the path, or None when no component
        could be resolved or when a component below an already resolved
        folder does not exist.
    """
    # Drop the trailing component and empty parts produced by leading or
    # doubled slashes.
    components = [part for part in folder_path.split('/')[:-1] if part]
    parent_folder_id = None

    for folder in components:
        # Backslashes and single quotes must be escaped inside a Drive query literal.
        escaped_name = folder.replace('\\', '\\\\').replace("'", "\\'")
        query = (
            "mimeType='application/vnd.google-apps.folder' "
            f"and name='{escaped_name}' and trashed=false"
        )
        if parent_folder_id:
            query += f" and '{parent_folder_id}' in parents"

        results = drive_service.files().list(q=query, pageSize=10, fields="files(id, name)").execute()
        items = results.get('files', [])

        if items:
            parent_folder_id = items[0]['id']
        elif parent_folder_id:
            # A folder below an already resolved parent is missing. Returning the
            # parent's ID here would silently point callers at the wrong folder.
            return None
    return parent_folder_id

def list_files_in_folder(folder_id):
    """List every file whose parent is the given Drive folder.

    Follows `nextPageToken` until the listing is exhausted, so folders with
    more entries than one page holds are returned completely.

    Args:
        folder_id: Drive ID of the folder to list.

    Returns:
        A list of dicts with the 'id' and 'name' of each entry.
    """
    items = []
    page_token = None
    while True:
        results = drive_service.files().list(
            q=f"'{folder_id}' in parents",
            fields="nextPageToken, files(id, name)",
            pageSize=1000,
            pageToken=page_token,
        ).execute()
        items.extend(results.get('files', []))

        page_token = results.get('nextPageToken')
        if not page_token:
            break

    return items

def download_audio_file(file_id):
    """Download a Drive file into memory, printing progress per chunk.

    Args:
        file_id: Drive ID of the file to download.

    Returns:
        An io.BytesIO holding the file content. The buffer is not rewound;
        call seek(0) before reading from it.
    """
    request = drive_service.files().get_media(fileId=file_id)

    # The request is executed only through the chunked downloader below; calling
    # request.execute() as well would fetch the whole file a second time.
    downloaded_file = io.BytesIO()
    downloader = MediaIoBaseDownload(downloaded_file, request)

    done = False
    while not done:
        status, done = downloader.next_chunk()
        print(f"Download {int(status.progress() * 100)}%.")
    return downloaded_file

def save_text_to_drive(file_name, file_text):
    """Serialize `file_text` to indented JSON and write it to `file_name`.

    The file is written in the windows-1251 encoding and non-ASCII characters
    are kept as-is rather than escaped. A string argument is serialized as a
    JSON string literal, like any other value.
    """
    with open(file_name, 'w', encoding=WINDOWS_1251_ENCODING) as out_file:
        serialized = json.dumps(file_text, indent=4, ensure_ascii=False)
        out_file.write(serialized)

def get_audio_session_id(file_name):
    """Return the underscore-separated token that follows the first 'session' token.

    Args:
        file_name: Name to parse, e.g. 'call_session_abc_1.mp3' gives 'abc'.

    Returns:
        The token right after 'session' (it may be an empty string).

    Raises:
        ValueError: If there is no 'session' token or nothing follows it.
    """
    tokens = file_name.split('_')
    marker_pos = tokens.index('session') if 'session' in tokens else -1

    if marker_pos < 0 or marker_pos + 1 >= len(tokens):
        raise ValueError("Не удалось получить имя сессии диалога")
    return tokens[marker_pos + 1]

# Функции для работы с Open AI

In [None]:
import requests
import openai
import io
import json

def transcribe_audio(file_id, session_id):
    """Transcribe one Drive audio file with the OpenAI API and save the result.

    The file is downloaded into memory, posted to the transcription endpoint
    (model 'whisper-1', language 'ru', 'verbose_json' response) and the JSON
    response is written to '<TRANSCRIBED_FOLDER_PATH><session_id>.txt'.

    Args:
        file_id: Drive ID of the audio file.
        session_id: Session identifier used as the output file name.

    Raises:
        requests.HTTPError: If the API answers with an error status. Nothing is
            saved in that case, so an error body is never stored as a transcript.
    """
    downloaded_file = download_audio_file(file_id)
    downloaded_file.seek(0)

    # NOTE(review): the upload is named '.wav' but declared as 'audio/mpeg' —
    # confirm which format the source files really use.
    files = {
        'file': ('audio_file.wav', downloaded_file, 'audio/mpeg'),
        'model': (None, 'whisper-1'),
        'language': (None, 'ru'),
        'response_format': (None, 'verbose_json')
    }

    response = requests.post(OPEN_API_TRNSCRIBE_URL, headers=OPEN_API_AUTH_HEADERS, files=files)
    # Fail loudly on 4xx/5xx instead of persisting the error payload.
    response.raise_for_status()

    new_file_name = f"{TRANSCRIBED_FOLDER_PATH}{session_id}.txt"
    save_text_to_drive(new_file_name, response.json())


# Процесс транскрибации

In [None]:
def read_all_audio_files():
    """Collect the Drive file entries from the numbered audio sub-folders 1..7.

    Each non-empty folder's entries are printed and appended to the result.

    Returns:
        A list of file dicts ('id', 'name') from all sub-folders.

    Raises:
        ValueError: If a sub-folder cannot be resolved to a Drive folder ID.
    """
    collected = []

    for folder_number in range(1, 8):
        folder_id = get_folder_id(f'{AUDIO_FILE_FOLDER_PATH}{folder_number}/')
        if not folder_id:
            raise ValueError('Папка не найдена')

        folder_files = list_files_in_folder(folder_id)
        if not folder_files:
            continue

        print("Файлы в папке:")
        collected.extend(folder_files)
        for entry in folder_files:
            print(f"ID: {entry['id']}, Имя: {entry['name']}")
    return collected

Считываем все файлы в один список

In [None]:
# Gather every audio file entry; stop early when nothing was found.
loaded_files = read_all_audio_files()
if len(loaded_files) < 1:
    raise ValueError('Не найдены аудил файлы')

# Total number of files, used by the progress output of the transcription loop.
total = len(loaded_files)

Транскрибация файлов

In [None]:
# Transcribe every loaded audio file, reporting progress after each one.
counter = 0
for file in loaded_files:
    try:
        # The session id is parsed inside the try block so that a file name
        # without a session marker is reported and skipped instead of aborting
        # the whole batch.
        session_part = get_audio_session_id(file['name'])
        transcribe_audio(file['id'], session_part)
    except Exception as e:
        print(f"Возникла ошибка: {e}")
    counter += 1
    print(f'Обработано: {counter} из {total}')

# Оценка транскрибации

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import json
import re

def preprocess_base_text(text):
    """Normalize a reference transcript for word-level comparison.

    Removes H:MM:SS / HH:MM:SS timestamps, strips every character that is
    neither a word character nor whitespace, lower-cases the text and
    collapses whitespace runs into single spaces.
    """
    without_timestamps = re.sub(r'\b\d{1,2}:\d{2}:\d{2}\b', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_timestamps).strip()
    words_only = re.sub(r'[^\w\s]', '', single_spaced)
    tokens = words_only.strip().lower().split()
    return ' '.join(tokens)

def preprocess_transcribed_text(text):
    """Extract and normalize the transcript from a transcription JSON document.

    Parses `text` as JSON, takes its "text" field, strips every character that
    is neither a word character nor whitespace, lower-cases the result and
    collapses whitespace runs into single spaces.
    """
    transcript = json.loads(text).get("text")
    words_only = re.sub(r'[^\w\s]', '', transcript)
    tokens = words_only.strip().lower().split()
    return ' '.join(tokens)

def calculate_metrics(reference, hypothesis):
    """Compute precision, recall and F1 over the sets of unique words.

    Both texts are split on whitespace and compared as sets, so word order
    and repetitions do not affect the result.

    Args:
        reference: Reference text.
        hypothesis: Text being evaluated.

    Returns:
        A (precision, recall, f1) tuple; a metric whose denominator is zero
        is reported as 0.
    """
    reference_vocab = set(reference.split())
    hypothesis_vocab = set(hypothesis.split())

    matched = len(reference_vocab & hypothesis_vocab)
    spurious = len(hypothesis_vocab - reference_vocab)
    missed = len(reference_vocab - hypothesis_vocab)

    predicted_total = matched + spurious
    expected_total = matched + missed

    precision = matched / predicted_total if predicted_total > 0 else 0
    recall = matched / expected_total if expected_total > 0 else 0

    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

def evaluate_text(reference_text, hypothesis_text):
    """Print the reference text and return its metrics against the hypothesis.

    Returns:
        A list of three strings — precision, recall and F1 — each formatted
        with two decimal places.
    """
    print(reference_text)

    scores = calculate_metrics(reference_text, hypothesis_text)
    return [f'{score:.2f}' for score in scores]

def get_all_files(folder_path):
    """Return the Drive file entries of the folder at `folder_path`.

    Prints how many entries were found.

    Raises:
        ValueError: If the path cannot be resolved to a Drive folder ID.
    """
    folder_id = get_folder_id(folder_path)
    if not folder_id:
        raise ValueError("Папка не найдена")

    folder_files = list_files_in_folder(folder_id)
    print(f'{len(folder_files)} files in the folder')
    return list(folder_files)

def read_file_text(file_id, encoding):
    """Download a Drive file's content and decode it with `encoding`."""
    raw_content = drive_service.files().get_media(fileId=file_id).execute()
    return raw_content.decode(encoding)



Загрузка эталонных и транскрибированных текстов

In [None]:
# Drive entries of the produced transcripts and of the reference texts.
transcribed_files = get_all_files(TRANSCRIBED_FOLDER_PATH)
base_files = get_all_files(BASE_TRANSCRIBED_TEXT_PATH)

317 files in the folder
204 files in the folder


Оценка качества транскрибации

In [None]:
import pandas as pd

columns = ['Имя файла', 'Точность', 'Полнота', 'F1-мера']

# Index the reference texts by name once; setdefault keeps the first entry for
# a duplicated name, matching a first-match linear scan.
base_files_by_name = {}
for base_file in base_files:
    base_files_by_name.setdefault(base_file['name'], base_file)

# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every iteration.
result_rows = []

for test_file in transcribed_files:
    try:
        transcribed_text = preprocess_transcribed_text(read_file_text(test_file['id'], WINDOWS_1251_ENCODING))

        # Drive prefixes copied files with 'Копия '; drop it before matching names.
        base_text_file = base_files_by_name.get(test_file['name'].replace('Копия ', ''))

        if base_text_file is None:
            print(f'Не найден эталонный текст для {test_file["name"]}')
            continue

        base_text = preprocess_base_text(read_file_text(base_text_file['id'], UTF8_ENCODING))

        precision, recall, f1 = calculate_metrics(base_text, transcribed_text)

        result_rows.append({'Имя файла': test_file['name'],
                            'Точность': f'{precision:.2f}',
                            'Полнота': f'{recall:.2f}',
                            'F1-мера': f'{f1:.2f}'})
    except Exception as e:
        # Best effort: a file that cannot be read or parsed is reported and skipped.
        print(f"Ошибка получения файла {e}")

df = pd.DataFrame(result_rows, columns=columns)

Вывод результатов оценки качества транскрибации

In [None]:
# Median of each metric across all evaluated files. No rows are filtered out,
# so files that scored 0 are part of the medians.
# NOTE(review): confirm that zero-score rows are meant to be included.
precision, recall, f1 = (
    df[metric_column].astype(float).median()
    for metric_column in ('Точность', 'Полнота', 'F1-мера')
)
print(f'Точность: {precision}')
print(f'Полнота: {recall}')
print(f'F1-мера: {f1}')

Точность: 0.8
Полнота: 0.74
F1-мера: 0.75


При анализе транскрибации на основе представленных метрик (точность, полнота и F1-мера) получились следующие значения:

Общие характеристики метрик:

*   Точность (Precision): 0.8
*   Полнота (Recall): 0.74
*   F1-мера: 0.75

**Заключение:**

В целом, результаты проверки качества транскрибации показывают, что модель демонстрирует хорошие показатели точности.

# Диаризация

In [None]:
# Install the packages needed for audio conversion and speaker diarization.
! pip install pydub
! pip install pyannote.audio
! pip install SpeechRecognition
# NOTE(review): this installs pyannote-audio again, from the `develop` branch
# archive, on top of the release installed above — confirm which version is
# intended and pin it.
! pip install -qq https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip

In [None]:
import os
from getpass import getpass

from pyannote.audio import Pipeline
import torch

# The Hugging Face access token is a secret and must not be stored in the
# notebook: read it from the HF_TOKEN environment variable, or ask for it
# interactively when the variable is not set.
# NOTE(review): the token that used to be embedded in this cell has been
# exposed in the notebook and should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1", use_auth_token=HF_TOKEN)
# Use the GPU when one is available; otherwise fall back to the CPU instead of failing.
pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

config.yaml:   0%|          | 0.00/500 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu121. Bad things might happen unless you revert torch to 1.x.


hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)


<pyannote.audio.pipelines.speaker_diarization.SpeakerDiarization at 0x7e8c9eb47d60>

In [None]:
def convert_to_wav(mp3_audio_data):
    """Convert MP3 audio to a WAV file named 'output.wav'.

    Args:
        mp3_audio_data: The MP3 source passed to AudioSegment.from_file.

    Returns:
        The output file name, 'output.wav'. The same name is used on every
        call, so each conversion overwrites the previous one.
    """
    print("Конвертация файла в wav формат")

    output_file = "output.wav"
    AudioSegment.from_file(mp3_audio_data, format="mp3").export(output_file, format="wav")

    print(f'Файл успешно сконвертирован в WAV: {output_file}')
    return output_file

In [None]:
class DiarizedText:
    """A speech fragment: speaker label, start and end positions, and text."""

    # Names of the instance attributes, in the order used by to_dict().
    _FIELDS = ('name', 'start', 'end', 'text')

    def __init__(self, speaker, start, end, text):
        # The speaker label is stored under the attribute `name`.
        self.name, self.start, self.end, self.text = speaker, start, end, text

    def to_string(self):
        """Print '<name>: <text>'. Nothing is returned."""
        print(f'{self.name}: {self.text}')

    def full_to_string(self):
        """Return a one-line description with speaker, time range and text."""
        return f'{self.name}: {self.start} - {self.end}. text : {self.text}'

    def to_dict(self):
        """Return the fragment as a dict with 'name', 'start', 'end' and 'text' keys."""
        return {field: getattr(self, field) for field in self._FIELDS}

class DiarizedTextEncoder(json.JSONEncoder):
    """JSON encoder that serializes DiarizedText objects as plain dicts."""

    def default(self, obj):
        """Return the dict form of a DiarizedText; defer everything else to the base encoder."""
        if not isinstance(obj, DiarizedText):
            return super().default(obj)
        # Same 'name' / 'start' / 'end' / 'text' mapping that DiarizedText.to_dict builds.
        return obj.to_dict()

# Процесс диаризации

In [None]:
# Reload the list of audio files for the diarization run.
loaded_files = read_all_audio_files()

In [None]:
import json
from pydub import AudioSegment

# Diarize every audio file and store the speaker turns per session.
for file in loaded_files:
    print(f'Диаризация {file["name"]}')
    session_name = get_audio_session_id(file['name'])

    # Download into memory and rewind before handing the buffer to the converter.
    downloaded_file = download_audio_file(file['id'])
    downloaded_file.seek(0)
    output_path = convert_to_wav(downloaded_file)

    diarization = pipeline(output_path)

    # One entry per speaker turn; times are formatted to one decimal place and
    # the text is left empty at this stage.
    diarized_list = [
        DiarizedText(speaker, f'{turn.start:.1f}', f'{turn.end:.1f}', '')
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]
    diarized_texts_dict = [diarized.to_dict() for diarized in diarized_list]

    # NOTE(review): save_text_to_drive serializes its argument with json.dumps,
    # so passing an already serialized string stores JSON inside a JSON string.
    json_string = json.dumps(diarized_texts_dict, ensure_ascii=False, indent=4)
    save_text_to_drive(f'{DIARIZED_OUTPUT_FOLDER_PATH}{session_name}.txt', json_string)

Сопоставление результатов диаризации и транскрибации

In [None]:
# Drive entries of the transcripts and of the diarization results to be merged.
transcribed_files = get_all_files(TRANSCRIBED_FOLDER_PATH)
diarized_files = get_all_files(DIARIZED_OUTPUT_FOLDER_PATH)

317 files in the folder
261 files in the folder


In [None]:
def get_diarized_file(file_name):
    """Return the first entry of `diarized_files` named `file_name`, or None if there is none."""
    return next(
        (candidate for candidate in diarized_files if candidate['name'] == file_name),
        None,
    )
# Merge every transcript with the diarization of the same name and save the
# speaker-attributed result.
count = 0
for transcribed_file in transcribed_files:
    pared_diarized_file = get_diarized_file(transcribed_file['name'])
    if pared_diarized_file is None:
        print(f'Не найдена диаризация дла {transcribed_file["name"]}')
        continue

    transcribed_json = read_file_text(transcribed_file['id'], WINDOWS_1251_ENCODING)
    diarized_json = read_file_text(pared_diarized_file['id'], UTF8_ENCODING)

    transcribed_data = json.loads(transcribed_json)

    # Diarization files are stored as a JSON string that itself contains JSON
    # (the diarization step serializes before calling save_text_to_drive, which
    # serializes again). Decode with the JSON parser rather than by stripping
    # escape sequences by hand, and accept a plainly encoded file as well.
    diarized_data = json.loads(diarized_json)
    if isinstance(diarized_data, str):
        diarized_data = json.loads(diarized_data)

    # Transcript segments carry text but no speaker; diarization turns carry a
    # speaker but no text.
    parsed_segments = [
        DiarizedText('', segment['start'], segment['end'], segment['text'])
        for segment in transcribed_data['segments']
    ]
    diarized_rows = [
        DiarizedText(entry['name'], entry['start'], entry['end'], '')
        for entry in diarized_data
    ]

    speakers = []
    processed_segments = []

    for speaker in diarized_rows:
        # Pass 1: give the turn the first unused segment that fully contains it.
        processed = False
        for segment in parsed_segments:
            if segment in processed_segments:
                continue
            if float(speaker.start) >= float(segment.start) and float(speaker.end) < float(segment.end):
                speakers.append(DiarizedText(speaker.name, speaker.start, speaker.end, segment.text))
                processed_segments.append(segment)
                processed = True
                break

        if processed:
            continue

        # Pass 2: no containing segment was found, so every unused segment that
        # starts before this turn is attributed to this speaker.
        # NOTE(review): there is no `break` here, so one turn can claim several
        # segments — confirm this is intended.
        for segment in parsed_segments:
            if segment in processed_segments:
                continue
            if float(speaker.start) > float(segment.start):
                speakers.append(DiarizedText(speaker.name, speaker.start, speaker.end, segment.text))
                processed_segments.append(segment)

    for s in speakers:
        print(s.full_to_string())

    # The result is serialized here and again inside save_text_to_drive; this
    # double encoding is kept so that already stored files and their readers
    # stay compatible.
    json_string = json.dumps(speakers, cls=DiarizedTextEncoder, indent=4, ensure_ascii=False)
    save_text_to_drive(f'{FULLY_TRANSCRIBED_FOLDER_PATH}{transcribed_file["name"]}', json_string)
    count += 1
    print(count)

76
87
: 0.0 - 5.0. text :  Алло. Как вы?
: 5.0 - 9.0. text :  Займись частным распытом. У вас к нему срочный вопрос?
: 9.0 - 11.0. text :  Очень срочный.
: 11.0 - 21.0. text :  Пока что абонент вне сети, но я записала для него ваш ответ. Попросите его вам перезвонить, когда он вернется в сеть.
: 22.0 - 35.0. text :  Да, конечно, перезвоните. Я по поводу возможности встречи онлайн моего руководства с вами. Тема встречи это верификация жребщиков без участия ваших сотрудников.
: 35.0 - 45.0. text :  Да я не договорил, блин. Тема встречи это верификация жребщиков без участия ваших сотрудников. Как мы сделали это в Манимейде, Лайнзайме, Веббанкире и других средствах.
: 45.0 - 49.0. text :  Подскажите, вы звоните по личному или по деловому вопросу?
: 49.0 - 56.0. text :  Вот этот вопрос можно передать добителю.
: 56.0 - 62.0. text :  Может быть передать абоненту в какое время можно вам перезвонить?
: 62.0 - 67.0. text :  Скажите погромче, я вас не услышала.
: 67.0 - 71.0. text :  Повторите п

# Оценка диаризации

In [None]:
# Drive entries of the speaker-attributed transcripts and of the reference texts.
fully_transcribed_files = get_all_files(FULLY_TRANSCRIBED_FOLDER_PATH)
base_files = get_all_files(BASE_TRANSCRIBED_TEXT_PATH)

261 files in the folder
204 files in the folder


In [None]:
import re
import pandas as pd

columns = ['Имя файла', 'Разница с эталоном']

# Index the reference texts by name once; setdefault keeps the first entry for
# a duplicated name, matching a first-match linear scan.
base_files_by_name = {}
for search_base_file in base_files:
    base_files_by_name.setdefault(search_base_file['name'], search_base_file)

# Reference texts are split into utterances at their H:MM:SS timestamps.
timestamp_pattern = re.compile(r'\b\d{1,2}:\d{2}:\d{2}\b')

# Rows are collected in a list and turned into a DataFrame once at the end.
result_rows = []
for transcribed in fully_transcribed_files:
    base_file = base_files_by_name.get(transcribed['name'])
    if base_file is None:
        # No reference text with the same name: nothing to compare against.
        continue

    base_text = read_file_text(base_file['id'], UTF8_ENCODING)
    base_utterances = [
        chunk.replace('\n', '').replace('\r', '')
        for chunk in timestamp_pattern.split(base_text)
    ]
    base_utterances = [chunk for chunk in base_utterances if chunk != '']

    # Stored files hold a JSON string that itself contains JSON. Decode with the
    # JSON parser (twice when needed) rather than by stripping escape sequences
    # by hand, which breaks on texts containing quotes or backslashes.
    transcribed_data = json.loads(read_file_text(transcribed['id'], WINDOWS_1251_ENCODING))
    if isinstance(transcribed_data, str):
        transcribed_data = json.loads(transcribed_data)

    # Absolute difference between the number of reference utterances and the
    # number of diarized fragments.
    diff = abs(len(base_utterances) - len(transcribed_data))
    result_rows.append({'Имя файла': transcribed['name'],
                        'Разница с эталоном': f'{diff}'})

df = pd.DataFrame(result_rows, columns=columns)


In [None]:
# The statistic computed here is the median, so it is labelled as such.
median_diff = df['Разница с эталоном'].astype(float).median()
mean_diff = median_diff  # previous variable name, kept for backward compatibility
print(f'Median diff {median_diff}')

# The column holds numbers formatted as strings; sort by numeric value so that
# e.g. '10' ranks above '9' instead of being ordered lexicographically.
df = df.sort_values(
    'Разница с эталоном',
    key=lambda column: column.astype(float),
    ascending=False,
)
df

# Получение эмбеддингов

Основные классы для работы

In [None]:
import asyncio
import os
from typing import List, Dict

import numpy as np
from openai import OpenAI
from openai import AsyncOpenAI
from sklearn.preprocessing import StandardScaler
from umap import UMAP
import hdbscan
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize


class TextProcessor:
    """Async wrapper around the OpenAI embeddings and chat-completions APIs."""

    def __init__(self, api_key: str = ""):
        self._client = AsyncOpenAI(api_key=api_key)
        self._embed_model = "text-embedding-3-small"
        self._chat_model =  "gpt-3.5-turbo"
        self.dimentions = 256   # size requested for every embedding vector
        self.chunk_size = 128   # number of texts sent per embeddings request

    async def get_text_embedding(self, text_list: List[str]) -> np.ndarray:
        """Embed the given texts, sending requests in concurrent chunks.

        Newlines are replaced with spaces and empty texts are sent as 'other'.
        The caller's list is not modified.

        Args:
            text_list: Texts to embed.

        Returns:
            One row per input text, in input order. An empty input gives a
            (0, dimentions) array. If a request fails, the error is printed and
            a 1-D zero vector of length `dimentions` is returned, so callers
            must check `ndim`.
        """
        # Clean into a new list rather than writing back into the argument.
        cleaned_texts = [text.replace("\n", " ") or 'other' for text in text_list]

        if not cleaned_texts:
            # np.stack cannot stack an empty sequence.
            return np.zeros((0, self.dimentions))

        try:
            tasks = []
            for i in range(0, len(cleaned_texts), self.chunk_size):
                text_chunk = cleaned_texts[i:i + self.chunk_size]

                task = asyncio.create_task(self._client.embeddings.create(
                    input=text_chunk,
                    model=self._embed_model,
                    dimensions=self.dimentions,
                ))
                tasks.append(task)
            # gather() keeps the order of the tasks, so rows line up with the inputs.
            tasks_results = await asyncio.gather(*tasks)

            embeddings = []
            for result in tasks_results:
                for data in result.data:
                    embeddings.append(np.array(data.embedding))

        except Exception as e:
            print(e)
            return np.zeros(self.dimentions)  # TODO!!!
        return np.stack(embeddings, axis=0)

    async def get_text_completions(self, system_prompt: str, text_dict: Dict[str, str], return_json=False):
        """Run one chat completion per text, concurrently.

        Empty texts are reported and sent as 'empty'. The caller's dict is not
        modified.

        Args:
            system_prompt: System message used for every request.
            text_dict: Mapping of document id to the user prompt text.
            return_json: When true, a JSON-object response format is requested.

        Returns:
            A dict mapping each document id to the concatenated content of all
            choices in its response; a choice without content contributes an
            empty string.
        """
        prompts = {}
        for key, text in text_dict.items():
            if not text:
                print('Found empty text prompt for document!')
                text = 'empty'
            prompts[key] = text

        text_ids = list(prompts.keys())
        text_list = list(prompts.values())

        response_format = None
        if return_json:
            response_format = {"type": "json_object"}

        tasks = []
        for text in text_list:
            task = asyncio.create_task(self._client.chat.completions.create(
                model=self._chat_model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": text}
                ],
                response_format=response_format,
            ))
            tasks.append(task)

        tasks_results = await asyncio.gather(*tasks)

        predictions = []
        for response in tasks_results:
            # message.content can be None; treat it as an empty string.
            answer = ''.join(choice.message.content or '' for choice in response.choices)
            predictions.append(answer)

        return {text_id: pred for text_id, pred in zip(text_ids, predictions)}


class DimentionReduction:
    """Projects embeddings onto two components, standardizing before and after.

    PCA is used when there are fewer than 10 rows; UMAP otherwise.
    """

    def __init__(self):
        umap_settings = dict(
            n_components=2,
            n_neighbors=5,
            min_dist=0.01,
            spread=4.0,
            metric='euclidean',
            random_state=42,
        )
        self.umap = UMAP(**umap_settings)
        self.pca = PCA(n_components=2)
        self.scaler = StandardScaler()

    def apply(self, embeddings: np.ndarray) -> np.ndarray:
        """Return the standardized 2-component projection of `embeddings`."""
        standardized = self.scaler.fit_transform(embeddings)

        # Fewer than 10 rows go through PCA, everything else through UMAP.
        reducer = self.pca if len(standardized) < 10 else self.umap

        # The same scaler object is refitted on the projected data.
        return self.scaler.fit_transform(reducer.fit_transform(standardized))


class ClustersReduction:
    """HDBSCAN clustering of L2-normalized points; a label of -1 marks an outlier."""

    def __init__(self):
        hdbscan_settings = dict(
            min_samples=3,
            min_cluster_size=2,
            metric='euclidean',
            cluster_selection_epsilon=0.1,
        )
        self._hdb = hdbscan.HDBSCAN(**hdbscan_settings)

    def apply(self, embeddings: np.ndarray):
        """Fit HDBSCAN on the L2-normalized rows and return the cluster labels."""
        unit_rows = normalize(embeddings, norm='l2')
        return self._hdb.fit(unit_rows).labels_

Основной процесс

In [None]:
# Класс для основного процесса
class MainProcessor:
    """Runs the pipeline: load texts, embed, reduce dimensions, cluster, print.

    `folder_path` is stored on the instance, but `load_text_files` reads from
    BASE_TRANSCRIBED_TEXT_PATH and does not consult it.
    """

    def __init__(self, api_key: str, folder_path: str):
        self.folder_path = folder_path
        self.text_processor = TextProcessor(api_key)
        self.dimension_reduction = DimentionReduction()
        self.clusters_reduction = ClustersReduction()

    def load_text_files(self) -> List[str]:
        """Read every file of the reference-text folder as UTF-8 and return the contents."""
        text_files_content = [
            read_file_text(base_file['id'], UTF8_ENCODING)
            for base_file in get_all_files(BASE_TRANSCRIBED_TEXT_PATH)
        ]

        print(f"Количество загруженных текстов: {len(text_files_content)}")
        return text_files_content


    async def process(self):
        """Embed and cluster the loaded texts, printing the labels and a preview per cluster.

        Returns early, after printing a message, when no texts were loaded or
        when the embeddings are empty or not 2-dimensional.
        """
        text_data = self.load_text_files()
        if not text_data:
            print("Не найдено текстовых файлов в указанной папке.")
            return

        embeddings = await self.text_processor.get_text_embedding(text_data)
        if embeddings.size == 0 or embeddings.ndim != 2:
            print("Не удалось получить эмбеддинги или эмбеддинги имеют неправильную размерность.")
            return

        print(f"Количество полученных эмбеддингов: {embeddings.shape[0]}")

        reduced_embeddings = self.dimension_reduction.apply(embeddings)
        cluster_labels = self.clusters_reduction.apply(reduced_embeddings)
        print(f"Кластеры: {cluster_labels}")

        # Group the texts by their cluster label, in order of first appearance.
        cluster_results = {}
        for text, cluster in zip(text_data, cluster_labels):
            if cluster not in cluster_results:
                cluster_results[cluster] = []
            cluster_results[cluster].append(text)

        print("Результаты кластеризации:")
        for cluster, texts in cluster_results.items():
            print(f"\nКластер {cluster}:")
            # Only the first 100 characters of each text are shown.
            for text in texts:
                print(f"- {text[:100]}...")

Запуск процесса

In [None]:
async def main():
    api_key = API_KEY
    folder_path = BASE_TRANSCRIBED_TEXT_PATH[:-1]

    processor = MainProcessor(api_key, folder_path)
    await processor.process()

await main()

138 files in the folder
Количество загруженных текстов: 138
Количество полученных эмбеддингов: 138


  return 1.0 / (1.0 + a * x ** (2 * b))
  warn(


Кластеры: [ 6  4  2  6  0  4  5  0  8  4  6  3  4  1  6  6  1  0  1  3  7 -1  0  7
  4  6  8  3  4  4  3  1  4  6  4 -1  3  4  3  4  4  1  6  8  4  1  1  0
  4  4  0  1  1  4  4  2  3  6  4  1  2  3  4  0 -1  0  0 -1  3  4  1  1
  2  5  4 -1  5  4  0  1  4  4  4  6  4 -1  5  8  6  4  4  6  0  6  1  1
  6  4  6  6 -1  6  8  1  8  3  2  4 -1  4  6  2  5  6  4  4  8  4  0 -1
  4  8  0  1  3  8  8  6  6  4  4  1  1  8  4  4 -1  3]
Результаты кластеризации:

Кластер 6:
- ['  SPEAKER_00: ', '  SPEAKER_04:  Доброе утро, студия Комакс, не слышала.', '  SPEAKER_00:  Продолж...
- ['  SPEAKER_00:  Пусть я глубокий, здравствуйте.']...
- ['  SPEAKER_01:  Добрый вечер, студия Гоу Боксинг. Меня зовут Анна. Чем могу помочь?', '  SPEAKER_00...
- ['  SPEAKER_00:  Спасибо.', '  SPEAKER_01:  Администратор Гоу-Боксинг, добрый день, слушаю вас.', ' ...
- ['  SPEAKER_03: ', '  SPEAKER_02:  Здравствуйте, студия Гоу Боксинг, слушаю вас. Добрый вечер. Можно...
- ['  SPEAKER_02:  Добрый день, студия Глобоксинг, 



# Аналитика по темам групп

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import re

def clean_text(text):
    """Strip a leading list number and a trailing '-transcri...' suffix.

    A prefix such as '1. ' at the start of the string is removed, then the
    first '-transcri' and everything after it up to the end of that line.
    """
    without_number = re.sub(r'^\d+\.\s+', '', text)
    return re.sub(r'-transcri.*', '', without_number)

# NOTE(review): `df_input` is not created anywhere in the visible notebook; it
# has to be defined before this cell can run.
df = df_input.copy()

df['Тема'] = df['Тема'].apply(clean_text)
df['Краткая суть'] = df['Краткая суть'].fillna('').apply(clean_text)
df['Очень краткая суть'] = df['Очень краткая суть'].fillna('').apply(clean_text)
df['dialogue'] = df['Тема']

model = SentenceTransformer('all-MiniLM-L6-v2')
X_embeddings = model.encode(df['dialogue'], show_progress_bar=True)

# Silhouette analysis over k = 2..9 to choose the number of clusters.
silhouette_scores = []
K = range(2, 10)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_embeddings)
    labels = kmeans.labels_
    silhouette_avg = silhouette_score(X_embeddings, labels)
    silhouette_scores.append(silhouette_avg)

plt.figure(figsize=(8, 4))
plt.plot(K, silhouette_scores, 'bx-')
plt.xlabel('Количество кластеров')
plt.ylabel('Средний силуэтный коэффициент')
plt.title('Силуэтный анализ для определения оптимального числа кластеров')
plt.show()

# Pick the k with the highest average silhouette score.
optimal_k = K[silhouette_scores.index(max(silhouette_scores))]
print(f"Оптимальное количество кластеров: {optimal_k}")

# Cluster with KMeans using the chosen number of clusters.
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(X_embeddings)
df['initial_cluster'] = kmeans.labels_

# Use the clusters as labels for classification.
# NOTE(review): the split and the classifiers below are created but never used
# in this cell.
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, df['initial_cluster'], test_size=0.3, random_state=42)

classifiers = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'MLPClassifier': MLPClassifier(max_iter=1000, random_state=42)
}

# Everything below groups by 'final_group', which this cell never creates.
# Fall back to the KMeans clusters when the input frame does not provide it.
# NOTE(review): confirm how 'final_group' is supposed to be derived.
if 'final_group' not in df.columns:
    df['final_group'] = df['initial_cluster']

group_themes = {}
# Top keywords are kept per group so that the final report prints each group's
# own keywords rather than those of the last group processed.
group_keywords = {}
for group_num in set(df['final_group']):
    group_texts = df[df['final_group'] == group_num]['dialogue'].tolist()
    vectorizer_group = TfidfVectorizer()
    X_group = vectorizer_group.fit_transform(group_texts)
    feature_names = vectorizer_group.get_feature_names_out()
    tfidf_scores = X_group.sum(axis=0).A1
    sorted_items = sorted(zip(tfidf_scores, feature_names), reverse=True)

    # The three highest-scoring words name the group.
    top_words = [word for _, word in sorted_items[:3]]
    theme_name = " | ".join(top_words)
    group_themes[group_num] = theme_name

    top_items = sorted_items[:10]
    group_keywords[group_num] = [word for _, word in top_items]
    # Items are (score, word) pairs, so unzipping yields the scores first.
    scores, words = zip(*top_items)

    plt.figure(figsize=(10, 5))
    sns.barplot(x=list(scores), y=list(words), palette="coolwarm")
    plt.title(f'Топ 10 ключевых слов для группы {group_num} ({group_themes[group_num]})')
    plt.xlabel('TF-IDF балл')
    plt.ylabel('Ключевые слова')
    plt.show()

    print(f"\nСодержимое группы {group_num} ({group_themes[group_num]}):")
    group_df = df[df['final_group'] == group_num]
    print(group_df[['dialogue']])

df['Тема группы'] = df['final_group'].map(group_themes)

plt.figure(figsize=(10, 6))
sns.barplot(x=df['Тема группы'].value_counts().index, y=df['Тема группы'].value_counts().values, palette="viridis")
plt.title('Распределение данных по финальным группам (с названиями тем)')
plt.xlabel('Название группы')
plt.ylabel('Количество диалогов')
plt.xticks(rotation=45)
plt.show()

# 2-D PCA projection of the embeddings, coloured by group theme.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_embeddings)
plt.figure(figsize=(10, 7))
sns.scatterplot(x=principal_components[:, 0], y=principal_components[:, 1], hue=df['Тема группы'], palette="viridis", s=100, legend="full")
plt.title("Финальные группы данных с названиями тем")
plt.xlabel("Главная компонента 1")
plt.ylabel("Главная компонента 2")
plt.legend(title='Тема группы', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

print("\nАналитика по темам групп:")
for group_num, theme_name in group_themes.items():
    count_in_group = df[df['final_group'] == group_num].shape[0]
    print(f"Тема: {theme_name}")
    print(f"Количество диалогов: {count_in_group}")
    print(f"Ключевые слова: {', '.join(group_keywords[group_num])}")
    print('-' * 40)


NameError: name 'df_input' is not defined

In [None]:
# NOTE(review): hdbscan is imported by an earlier cell; on a fresh kernel this
# install has to run before that import — consider moving it to the top.
!pip install hdbscan

Collecting hdbscan
  Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hdbscan
Successfully installed hdbscan-0.8.40
