In [1]:
# !pip install requirements.txt

### Train model

In [2]:
import warnings

from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.output_parsers import PydanticOutputParser, RetryWithErrorOutputParser
from langchain.prompts import (
    ChatPromptTemplate, 
    FewShotChatMessagePromptTemplate,
    HumanMessagePromptTemplate,
    PromptTemplate,
    SemanticSimilarityExampleSelector,
    SystemMessagePromptTemplate
)
from langchain.vectorstores import Chroma
from pydantic import BaseModel, Field, model_validator

In [3]:
from constants import OPENAI_API_KEY

In [4]:
llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0.0)

In [5]:
class Sentence(BaseModel):
    incorrect: str = Field(description="sentence with grammatical and punctuation errors")
    clean: str = Field(description="sentence without grammatical and punctuation errors")
    
    @model_validator(mode='before')
    def clean_sentence_should_differ_from_incorrect_sentence(cls, data):
        if data['clean'] == data['incorrect']:
            warnings.warn(f"Not fixed the issue in sentence {data['incorrect']}!")
        return data

parser = PydanticOutputParser(pydantic_object=Sentence)

retry_parser = RetryWithErrorOutputParser.from_llm(
    parser=parser, llm=llm
)

In [6]:
import re

def read_source(source):
    with open(f"./data/gec-only/{source}.txt") as f:
        return list(filter(None, re.sub('# \d\d\d\d', '', f.read()).split('\n')))

In [7]:
incorrect, clean = (
    read_source(source="train.src"),
    read_source(source="train.tgt")
)

In [8]:
import random

def train_validation_split(incorrect, clean, ratio = 0.2, subset = 1.0):
    size = int(subset * len(incorrect))
    indices = set(range(size))
    train_indices = set(random.sample(list(indices), k=int((1 - ratio) * size)))
    valid_indices = indices.difference(train_indices)
    train_incorrect, train_clean = [incorrect[i] for i in train_indices], [clean[i] for i in train_indices]
    valid_incorrect, valid_clean = [incorrect[i] for i in valid_indices], [clean[i] for i in valid_indices]
    return train_incorrect, train_clean, valid_incorrect, valid_clean

In [9]:
train_incorrect, train_clean, valid_incorrect, valid_clean = train_validation_split(incorrect, clean, subset=0.0003)

In [10]:
examples = [
    {
        "input": incorrect_sentence,
        "output": clean_sentence
    }
    for incorrect_sentence, clean_sentence in zip(valid_incorrect, valid_clean)
]

to_vectorize = [" ".join(example.values()) for example in examples]

example_selector = SemanticSimilarityExampleSelector(
    vectorstore=Chroma.from_texts(
        to_vectorize, 
        OpenAIEmbeddings(), 
        metadatas=examples
    ),
    k=2,
)

In [11]:
system_prompt = SystemMessagePromptTemplate.from_template("You are a helpful assistant that makes grammatical and punctuation error correction in Ukrainian sentences.")

In [12]:
few_shot_prompt = FewShotChatMessagePromptTemplate(
    input_variables=["input"],
    example_selector=example_selector,
    example_prompt=ChatPromptTemplate.from_messages(
        [("human", "{input}"), ("ai", "{output}")]
    )
)

In [13]:
human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        template="{input} \n {format_instructions}",
        input_variables=["input"],
        partial_variables={"format_instructions": retry_parser.get_format_instructions()},
    )
)

In [14]:
prompt = ChatPromptTemplate.from_messages(
    [
        system_prompt,
        few_shot_prompt,
        human_prompt
    ]
)
prompt.format_prompt(input="Я не знаю, чи це, правильне речення").to_string()

'System: You are a helpful assistant that makes grammatical and punctuation error correction in Ukrainian sentences.\nHuman: Моє бачення Instagram\nAI: Моє бачення Instagram\nHuman: Якщо цікаво подивитися відразу на результат, то щиро прошу за цим посиланням – https://www.instagram.com/yevhenii_kanivets/\nAI: Якщо цікаво подивитися відразу на результат, то щиро прошу за цим посиланням — https://www.instagram.com/yevhenii_kanivets/\nHuman: Я не знаю, чи це, правильне речення \n The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"incorrect": {"description": "sentence with 

In [15]:
parser.parse(llm(prompt.format_prompt(input="Я не знаю, чи це, правильне речення").to_string()))

Sentence(incorrect='Я не знаю, чи це, правильне речення', clean='Я не знаю, чи це правильне речення')

### Evaluate model

In [16]:
from langchain.evaluation import ExactMatchStringEvaluator

evaluator = ExactMatchStringEvaluator(
    ignore_case=False,
    ignore_numbers=True,
    ignore_punctuation=False,
)

In [17]:
def evaluate(valid_incorrect, valid_clean):
    correct = 0
    for input, reference in zip(valid_incorrect, valid_clean):
        prompt_value = prompt.format_prompt(input=input)
        completion = llm(prompt_value.to_string())
        sentence = retry_parser.parse_with_prompt(completion, prompt_value)
        score = evaluator.evaluate_strings(prediction=sentence.clean, reference=reference).get("score")
        correct += score
    accuracy = correct / len(valid_clean)
    return accuracy

In [18]:
print(f"Accuracy: {evaluate(valid_incorrect, valid_clean) * 100}%")

Accuracy: 100.0%




### Deploy

In [19]:
test_incorrect, test_clean = (
    read_source(source="valid.src"),
    read_source(source="valid.tgt")
)

In [20]:
# TODO: Run llm on test_incorrect and save them in "valid.tgt" file