In [1]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModel, Trainer, TrainingArguments, BertForMaskedLM, AutoModelForCausalLM, LlamaTokenizer, LlamaForCausalLM
from transformers import AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from datasets import Dataset
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm, trange
import numpy as np
from scipy.spatial.distance import cosine
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GenerationConfig
import os
import json
from peft import PeftModel, PeftConfig
from langchain_core.embeddings import Embeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
import pandas as pd
from pathlib import Path

In [3]:
!nvidia-smi

Sun Oct 27 06:20:38 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-SXM3-32GB           Off |   00000000:05:00.0 Off |                    0 |
| N/A   35C    P0             62W /  350W |       1MiB /  32768MiB |      3%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache()

–ü–æ–¥–∫–ª—é—á–µ–Ω–∏–µ –∫ –±–∞–∑–µ –¥–∞–Ω–Ω—ã—Ö:

In [5]:
DATASET_DIRECTORY_PATH: Path = Path("test_data")
CONVERTED_DATASET_DIRECTORY_PATH: Path = Path("test_data_working")
DATASET_TASKS_FILE_PATH: Path = Path("test.csv")

QDRANT_URL = "http://213.171.5.51:6333"
SIZE = 1024

MODEL_DIRECTORY = "models2"

FULL_COLLECTION_NAME = "GLOBAL"

In [6]:
from converter import convert_dataset

dataset_documents_files_paths = convert_dataset(DATASET_DIRECTORY_PATH,
    output_directory_path=CONVERTED_DATASET_DIRECTORY_PATH,
    converting_suffixes_list=[])

copying –¢–æ–º 2 –ü–î–í –≠–∫–æ –ê–≥—Ä–æ.docx
copying –¢–æ–º 1 –ò–Ω–≤–µ–Ω—Ç–∞—Ä–∏–∑–∞—Ü–∏—è –≠–∫–æ –ê–≥—Ä–æ.docx


In [7]:
from text import get_text_blocks

dataset_documents_texts_blocks = dict[str, list[str]]()
dataset_documents_texts_blocks[FULL_COLLECTION_NAME] = []

for dataset_document_file_path in dataset_documents_files_paths:
    dataset_document_text_blocks = get_text_blocks(dataset_document_file_path)

    dataset_document_file_name = dataset_document_file_path.stem
    dataset_documents_texts_blocks[dataset_document_file_name] = \
        dataset_document_text_blocks
    
    dataset_documents_texts_blocks[FULL_COLLECTION_NAME] += \
        dataset_document_text_blocks

In [8]:
class CustomEmbedding(Embeddings):
    def __init__(self, directory: str = "models2"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = BertForMaskedLM.from_pretrained(directory)
        self.tokenizer = AutoTokenizer.from_pretrained(directory)
        self.model.to(self.device)

    def embed_query(self, text: str) -> list[float]:
        return extract_features(text, self.model, self.tokenizer).tolist()

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        return list(map(lambda text: self.embed_query(text), texts))


def extract_features(text, model, tokenizer):
    model.eval()

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=512,
        chunk_overlap=64
    )

    chunks = text_splitter.split_text(text)

    features = []

    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        last_hidden_state = outputs.hidden_states[-1]

        attention_mask = inputs['attention_mask']
        mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size())
        sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
        mean_embeddings = sum_embeddings / torch.clamp(mask_expanded.sum(1), min=1e-9)

        features.append(mean_embeddings)

    return torch.stack(features).mean(dim=0).squeeze(0).cpu()

embedding = CustomEmbedding(MODEL_DIRECTORY)

# –ø–æ–ª—É—á–µ–Ω–∏–µ –∫–æ–Ω—Ç–µ–∫—Å—Ç–∞ –æ –∑–∞–ø—Ä–æ—Å–µ
def get_context(question: str, qdrant_collection_name: str, k: int = 15) -> list[str]:
    # –ø—Ä–æ—Å—Ç–æ –ø–æ–¥–∫–ª—é—á–∞–µ–º—Å—è –∫ –±–∞–∑–µ –¥–∞–Ω–Ω—ã—Ö –æ—Ç–≤–µ—Ç–æ–≤
    client = QdrantClient(url=QDRANT_URL)

    if not client.collection_exists(qdrant_collection_name):
        client.create_collection(qdrant_collection_name, vectors_config=VectorParams(size=SIZE, distance=Distance.COSINE))

    embedding = CustomEmbedding()

    qdrant = QdrantVectorStore(
        client=client,
        collection_name=qdrant_collection_name,
        embedding=embedding,
    )

    # –¥–æ—Å—Ç–∞–µ–º k –±–ª–∏–∂–∞–π—à–∏—Ö –ø–æ —Å–º—ã—Å–ª—É —á–∞—Å—Ç–µ–π –∏ –¥–æ–±–∞–≤–ª—è–µ–º –≤ –∫–æ–Ω—Ç–µ–∫—Å—Ç –Ω–∞ –¥–∞–ª—å–Ω–µ–π—à—É—é –æ–±—Ä–∞–±–æ—Ç–∫—É
    return [i.page_content for i in qdrant.similarity_search(question, k=k)]


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From üëâv4.50üëà onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [9]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_qdrant import QdrantVectorStore

qdrant_client = QdrantClient(url=QDRANT_URL)

for collection_name, collection_text_blocks in dataset_documents_texts_blocks.items():
    if not qdrant_client.collection_exists(collection_name):
        print(f"Creating collection \"{collection_name}\"")
        qdrant_client.create_collection(collection_name,
            vectors_config=VectorParams(size=SIZE, distance=Distance.COSINE))
    
        collection_qdrant_vector_store = QdrantVectorStore(
            client=qdrant_client,
            collection_name=collection_name,
            embedding=embedding
        )

        print(f"Uploading text blocks into collection \"{collection_name}\"")
        collection_qdrant_vector_store.add_texts(collection_text_blocks)


In [17]:
LLM_MODEL_NAME = "IlyaGusev/saiga_7b_lora"
DEFAULT_MESSAGE_TEMPLATE = "<s>{role}\n{content}</s>"
DEFAULT_RESPONSE_TEMPLATE = "<s>bot\n"
DEFAULT_SYSTEM_PROMPT = "–¢—ã ‚Äî –°–∞–π–≥–∞, —Ä—É—Å—Å–∫–æ—è–∑—ã—á–Ω—ã–π –∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∏–π –∞—Å—Å–∏—Å—Ç–µ–Ω—Ç. –¢—ã —Ä–∞–∑–≥–æ–≤–∞—Ä–∏–≤–∞–µ—à—å —Å –ª—é–¥—å–º–∏ –∏ –ø–æ–º–æ–≥–∞–µ—à—å –∏–º."


def get_prompt(context: list[str], question: str) -> str:
    prompt = (f"–ù–µ –ø–æ–≤—Ç–æ—Ä—è–π—Å—è. –¢–æ–ª—å–∫–æ —Ä—É—Å—Å–∫–∏–π —è–∑—ã–∫. –ù–µ –∑–∞–¥–∞–≤–∞–π –≤–æ–ø—Ä–æ—Å–æ–≤. –ö–æ–Ω—Ç–µ–∫—Å—Ç: {context}, –í–æ–ø—Ä–æ—Å: {question}, –û—Ç–≤–µ—Ç:")
    n = len(context)

    while len(prompt) >= 2048:
        prompt = (f"–ù–µ –ø–æ–≤—Ç–æ—Ä—è–π—Å—è. –¢–æ–ª—å–∫–æ —Ä—É—Å—Å–∫–∏–π —è–∑—ã–∫. –ù–µ –∑–∞–¥–∞–≤–∞–π –≤–æ–ø—Ä–æ—Å–æ–≤. –ö–æ–Ω—Ç–µ–∫—Å—Ç: {context[:n - 1]}, –í–æ–ø—Ä–æ—Å: {question}, –û—Ç–≤–µ—Ç:")
        n -= 1

    return prompt


class Conversation:
    def __init__(
            self,
            message_template=DEFAULT_MESSAGE_TEMPLATE,
            system_prompt=DEFAULT_SYSTEM_PROMPT,
            response_template=DEFAULT_RESPONSE_TEMPLATE
    ):
        self.message_template = message_template
        self.response_template = response_template
        self.messages = [{
            "role": "system",
            "content": system_prompt
        }]

    def add_user_message(self, message):
        self.messages.append({
            "role": "user",
            "content": message
        })

    def add_bot_message(self, message):
        self.messages.append({
            "role": "bot",
            "content": message
        })

    def get_prompt(self, tokenizer):
        final_text = ""
        for message in self.messages:
            message_text = self.message_template.format(**message)
            final_text += message_text
        final_text += DEFAULT_RESPONSE_TEMPLATE
        return final_text.strip()


def generate(model, tokenizer, prompt, generation_config):
    data = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    data = {k: v.to(model.device) for k, v in data.items()}
    output_ids = model.generate(
        **data,
        generation_config=generation_config
    )[0]
    output_ids = output_ids[len(data["input_ids"][0]):]
    output = tokenizer.decode(output_ids, skip_special_tokens=True)
    return output.strip()


config = PeftConfig.from_pretrained(LLM_MODEL_NAME)
llm_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map=device
)
llm_model = PeftModel.from_pretrained(
    llm_model,
    LLM_MODEL_NAME,
    torch_dtype=torch.float16
)
llm_model.eval()

llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, use_fast=False)
generation_config = GenerationConfig.from_pretrained(LLM_MODEL_NAME)
generation_config.temperature = 0.01
# generation_config.frequency_penalty = 1.3


# —Å–æ–±—Å—Ç–≤–µ–Ω–Ω–æ –≤—ã–∑–æ–≤ —Ñ—É–Ω–∫—Ü–∏–∏ –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏ –æ—Ç–≤–µ—Ç–∞ –≤ –õ–õ–ú –°–∞–π–≥–∞. –ü–æ–¥–∞–µ—Ç—Å—è –Ω–∞ –≤—Ö–æ–¥ –∑–∞–ø—Ä–æ—Å –∏ –∫–æ–Ω—Ç–µ–∫—Å—Ç
def get_answer(question: str, context: list[str]):
    prompt = get_prompt(context, question)

    return generate(llm_model, llm_tokenizer, prompt, generation_config)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# —Å–æ–±—Ä–∞–Ω–Ω—ã–π –≤–∞—Ä–∏–∞–Ω—Ç —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏
def pipeline(question: str, collection_name: str = "FULL_UPLOAD") -> str:
    context = get_context(question, collection_name)

    return get_answer(question, ",".join(context))

In [12]:
pipeline("–ù–∞ –æ—Å–Ω–æ–≤–∞–Ω–∏–∏ –∫–∞–∫–æ–≥–æ –¥–æ–∫—É–º–µ–Ω—Ç–∞ (–Ω–µ –ù–ü–ê) —Ä–∞–∑—Ä–∞–±–∞—Ç—ã–≤–∞—é—Ç—Å—è –∏ —É—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞—é—Ç—Å—è –Ω–æ—Ä–º–∞—Ç–∏–≤—ã –¥–æ–ø—É—Å—Ç–∏–º—ã—Ö –≤—ã–±—Ä–æ—Å–æ–≤ –∑–∞–≥—Ä—è–∑–Ω—è—é—â–∏—Ö –≤–µ—â–µ—Å—Ç–≤ –≤ –∞—Ç–º–æ—Å—Ñ–µ—Ä–Ω—ã–π –≤–æ–∑–¥—É—Ö?")

'–ù–æ—Ä–º–∞—Ç–∏–≤—ã –¥–æ–ø—É—Å—Ç–∏–º—ã—Ö –≤—ã–±—Ä–æ—Å–æ–≤ –∑–∞–≥—Ä—è–∑–Ω–∏—Ç–µ–ª–µ–π –≤ –∞—Ç–º–æ—Å—Ñ–µ—Ä–Ω—ã–π –≤–æ–∑–¥—É—Ö —É—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞—é—Ç—Å—è –Ω–∞ –æ—Å–Ω–æ–≤–µ –ü–æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∏–π –ü—Ä–∞–≤–∏—Ç–µ–ª—å—Å—Ç–≤–∞ –†–æ—Å—Å–∏–π—Å–∫–æ–π –§–µ–¥–µ—Ä–∞—Ü–∏–∏ –æ—Ç 19.08.2004 ‚Ññ 653 ¬´–û–± —É—Ç–≤–µ—Ä–∂–¥–µ–Ω–∏–∏ –ü—Ä–∞–≤–∏–ª —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–æ–≥–æ —Ä–µ–≥—É–ª–∏—Ä–æ–≤–∞–Ω–∏—è –ø–æ –∫–æ–Ω—Ç—Ä–æ–ª—é –∑–∞ –≤—ã–±—Ä–æ—Å–∞–º–∏ –∑–∞–≥—Ä—è–∑–Ω—è—é—â–∏—Ö –≤–µ—â–µ—Å—Ç–≤ –≤ –∞—Ç–º–æ—Å—Ñ–µ—Ä—É¬ª, –∞ —Ç–∞–∫–∂–µ –¥—Ä—É–≥–∏—Ö –Ω–æ—Ä–º–∞—Ç–∏–≤–Ω—ã—Ö –ø—Ä–∞–≤–æ–≤—ã—Ö –∞–∫—Ç–æ–≤.'

## –ß—Ç–µ–Ω–∏–µ –∏–∑ —Ñ–∞–π–ª–æ–≤ –∏ –æ–±—Ä–∞–±–æ—Ç–∫–∞ csv

In [13]:
test_df = pd.read_csv("test.csv", sep="\t")

In [14]:
test_df

Unnamed: 0,‚Ññ –ø/–ø,–í–æ–ø—Ä–æ—Å,–û—Ç–≤–µ—Ç,–î–æ–∫—É–º–µ–Ω—Ç
0,1,"–û–±—ä—è—Å–Ω–∏—Ç—å, —á—Ç–æ —Ç–∞–∫–æ–µ –∏—Å—Ç–æ—á–Ω–∏–∫ –≤—ã–±—Ä–æ—Å–æ–≤, –∏—Å—Ç–æ—á–Ω...",,–ù–µ—Ç
1,2,–£–∫–∞–∑–∞—Ç—å —ç—Ç–∞–ø—ã —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏ –ø—Ä–æ–µ–∫—Ç–∞ –Ω–∞—á–∏–Ω–∞—è —Å –ø–æ–ª...,,–ù–µ—Ç
2,3,–†–∞—Å–ø–∏—Å–∞—Ç—å —Å–æ—Å—Ç–∞–≤ —Ç–æ–º–∞ –ü–î–í,,–ù–µ—Ç
3,4,–ö–∞–∫ –ø—Ä–∏—Å–≤–∞–∏–≤–∞—é—Ç—Å—è –Ω–æ–º–µ—Ä–∞ –∏—Å—Ç–æ—á–Ω–∏–∫–æ–≤ –≤—ã–±—Ä–æ—Å–æ–≤ –ø...,,–ù–µ—Ç
4,5,–ß—Ç–æ —Ç–∞–∫–æ–µ –≥–∞–∑–æ–æ—á–∏—Å—Ç–Ω—ã–µ —É—Å—Ç–∞–Ω–æ–≤–∫–∏? –ü—Ä–∏–≤–µ–¥–∏—Ç–µ –∏—Ö...,,–ù–µ—Ç
...,...,...,...,...
87,88,"–ö–∞–∫–æ–≤—ã –∏—Å—Ç–æ—á–Ω–∏–∫–∏ –≤—ã–±—Ä–æ—Å–æ–≤, –∏–º–µ—é—â–∏–µ –ø—Ä–æ–∏–∑–≤–æ–ª—å–Ω—É...",,"–ö–Ω–∏–≥–∞ 1 - –ò–Ω–≤–µ–Ω—Ç–∞—Ä–∏–∑–∞—Ü–∏—è –≠–∫–æ –ê–≥—Ä–æ, –¢–∞–±–ª–∏—Ü–∞ 4.4"
88,89,–°–∏–ª—å–Ω–µ–µ –ª–∏ –∂–∏–¥–∫–∏–µ –∏ –≥–∞–∑–æ–æ–±—Ä–∞–∑–Ω—ã–µ –∑–∞–≥—Ä—è–∑–Ω—è—é—â–∏–µ ...,,"–ö–Ω–∏–≥–∞ 1 - –ò–Ω–≤–µ–Ω—Ç–∞—Ä–∏–∑–∞—Ü–∏—è –≠–∫–æ –ê–≥—Ä–æ, –¢–∞–±–ª–∏—Ü–∞ 4.8"
89,90,–ö–∞–∫–æ–π –≥–æ–¥–æ–≤–æ–π –≤—ã–±—Ä–æ—Å –≤ —Ç–æ–Ω–Ω–∞—Ö –∑–µ—Ä–Ω–æ–≤–æ–π –ø—ã–ª–∏?\n,,"–ö–Ω–∏–≥–∞ 1 - –ò–Ω–≤–µ–Ω—Ç–∞—Ä–∏–∑–∞—Ü–∏—è –≠–∫–æ –ê–≥—Ä–æ, –¢–∞–±–ª–∏—Ü–∞ 1.1.2"
90,91,–°–æ–∑–¥–∞—ë—Ç—Å—è –ª–∏ —Ö–ª–æ–ø–∫–æ–≤–∞—è –ø—ã–ª—å?\n,,"–ö–Ω–∏–≥–∞ 1 - –ò–Ω–≤–µ–Ω—Ç–∞—Ä–∏–∑–∞—Ü–∏—è –≠–∫–æ –ê–≥—Ä–æ, –¢–∞–±–ª–∏—Ü–∞ 1.1.2"


In [20]:
from csv import DictReader as CsvDictReader

answers = []

with open(DATASET_TASKS_FILE_PATH, mode="r") as dataset_tasks_file:
    dataset_tasks_file_reader = CsvDictReader(dataset_tasks_file, delimiter="\t")
    for task in dataset_tasks_file_reader:
        print(task)
        question = task["–í–æ–ø—Ä–æ—Å"]
        document = task["–î–æ–∫—É–º–µ–Ω—Ç"][:7]
        
        if document == "–ö–Ω–∏–≥–∞ 1":
            collection_name = "–¢–æ–º 1 –ò–Ω–≤–µ–Ω—Ç–∞—Ä–∏–∑–∞—Ü–∏—è –≠–∫–æ –ê–≥—Ä–æ"
        elif document == "–ö–Ω–∏–≥–∞ 2":
            collection_name = "–¢–æ–º 2 –ü–î–í –≠–∫–æ –ê–≥—Ä–æ"
        else:
            collection_name = "FULL_UPLOAD"
    
        context = get_context(question, collection_name)
        answer = get_answer(question, ",".join(context))
    
        print(answer)
        print("--------------------------")
        answers.append(answer)

{'‚Ññ –ø/–ø': '1', '–í–æ–ø—Ä–æ—Å': '–û–±—ä—è—Å–Ω–∏—Ç—å, —á—Ç–æ —Ç–∞–∫–æ–µ –∏—Å—Ç–æ—á–Ω–∏–∫ –≤—ã–±—Ä–æ—Å–æ–≤, –∏—Å—Ç–æ—á–Ω–∏–∫ –≤—ã–¥–µ–ª–µ–Ω–∏—è.', '–û—Ç–≤–µ—Ç': '', '–î–æ–∫—É–º–µ–Ω—Ç': '–ù–µ—Ç'}
–ò—Å—Ç–æ—á–Ω–∏–∫ –≤—ã–±—Ä–æ—Å–æ–≤ ‚Äì —ç—Ç–æ –º–µ—Å—Ç–æ, –≥–¥–µ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç –≤—ã–¥–µ–ª–µ–Ω–∏–µ –≤—Ä–µ–¥–Ω—ã—Ö –≤–µ—â–µ—Å—Ç–≤ –≤ –∞—Ç–º–æ—Å—Ñ–µ—Ä—É. –ò—Å—Ç–æ—á–Ω–∏–∫–æ–º –≤—ã–¥–µ–ª–µ–Ω–∏—è —è–≤–ª—è—é—Ç—Å—è —Ä–∞–∑–ª–∏—á–Ω—ã–µ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—á–µ—Å–∫–∏–µ –ø—Ä–æ—Ü–µ—Å—Å—ã, —Ç–∞–∫–∏–µ –∫–∞–∫ –ø—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–æ —ç–ª–µ–∫—Ç—Ä–æ—ç–Ω–µ—Ä–≥–∏–∏, —Ö–∏–º–∏—á–µ—Å–∫–∏–µ –∑–∞–≤–æ–¥—ã, –∞–≤—Ç–æ–º–æ–±–∏–ª—å–Ω—ã–µ –∑–∞–≤–æ–¥—ã –∏ —Ç.–¥.
--------------------------
{'‚Ññ –ø/–ø': '2', '–í–æ–ø—Ä–æ—Å': '–£–∫–∞–∑–∞—Ç—å —ç—Ç–∞–ø—ã —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏ –ø—Ä–æ–µ–∫—Ç–∞ –Ω–∞—á–∏–Ω–∞—è —Å –ø–æ–ª—É—á–µ–Ω–∏—è –≤ —Ä–∞–±–æ—Ç—É.', '–û—Ç–≤–µ—Ç': '', '–î–æ–∫—É–º–µ–Ω—Ç': '–ù–µ—Ç'}
1. –°–æ–∑–¥–∞–Ω–∏–µ –ø—Ä–æ–µ–∫—Ç–∞. 2. –†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏—Ö —Ç—Ä–µ–±–æ–≤–∞–Ω–∏–π. 3. –í—ã–±–æ—Ä —Ç–µ—Ö–Ω–æ–ª–æ–≥

In [19]:
ans_copy = answers

In [21]:
len(answers)

92

In [22]:
ans_df = pd.DataFrame(range(1,93)
ans_df["answer"] = pd.DataFrame(answers)
ans_df.columns = []
‚Ññ –ø/–ø

Unnamed: 0,0
0,"–ò—Å—Ç–æ—á–Ω–∏–∫ –≤—ã–±—Ä–æ—Å–æ–≤ ‚Äì —ç—Ç–æ –º–µ—Å—Ç–æ, –≥–¥–µ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç ..."
1,1. –°–æ–∑–¥–∞–Ω–∏–µ –ø—Ä–æ–µ–∫—Ç–∞. 2. –†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏—Ö...
2,–°–æ—Å—Ç–∞–≤ —Ç–æ–º–∞ –ü–î–í –≤–∫–ª—é—á–∞–µ—Ç –≤ —Å–µ–±—è —Å–ª–µ–¥—É—é—â–∏–µ –¥–æ–∫—É...
3,–ù–æ–º–µ—Ä–∞ –∏—Å—Ç–æ—á–Ω–∏–∫–æ–≤ –≤—ã–±—Ä–æ—Å–æ–≤ –ø—Ä–∏—Å–≤–∞–∏–≤–∞—é—Ç—Å—è –≤ —Å–æ–æ...
4,–ì–∞–∑–æ–æ—á–∏—Å—Ç–Ω—ã–µ —É—Å—Ç–∞–Ω–æ–≤–∫–∏ - —ç—Ç–æ —Å–ø–µ—Ü–∏–∞–ª—å–Ω—ã–µ –º–µ—Ö–∞–Ω...
...,...
87,–ò–∑ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤ —Ä–∞—Å—á–µ—Ç–∞ –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ–≥–æ —Ä–∞–∑–æ–≤–æ–≥–æ ...
88,–ñ–∏–¥–∫–∏–µ –∏ –≥–∞–∑–æ–æ–±—Ä–∞–∑–Ω—ã–µ –∑–∞–≥—Ä—è–∑–Ω–∏—Ç–µ–ª–∏ –º–æ–≥—É—Ç –±—ã—Ç—å ...
89,–í –≥–æ–¥–æ–≤–æ–π –≤—ã–±–æ—Ä –º–æ–∂–Ω–æ –±—ã–ª–æ –±—ã —É–∫–∞–∑–∞—Ç—å —Ç–æ–ª—å–∫–æ –æ...
90,"–ù–µ—Ç, —Å–æ–∑–¥–∞–µ—Ç—Å—è –Ω–µ —Ö–ª–æ–ø–∫–æ–≤–∞—è –ø—ã–ª—å, –∞ –æ–±—ã—á–Ω–∞—è –ø—ã–ª—å."


In [None]:
ans_df