Skip to content

Commit

Permalink
Add experimental sentence learning
Browse files Browse the repository at this point in the history
  • Loading branch information
enzet committed Aug 1, 2023
1 parent 207ae1d commit d18d0b2
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 6 deletions.
8 changes: 8 additions & 0 deletions emmio/learn/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pathlib import Path
from typing import Iterator

from emmio.language import Language
from emmio.learn.config import LearnConfig
from emmio.learn.core import Learning

Expand Down Expand Up @@ -48,3 +49,10 @@ def compute_pressure(self):
See ``Learning.compute_pressure``.
"""
return sum(x.compute_pressure() for x in self.learnings.values())

def get_learnings_by_language(self, language: Language) -> list[Learning]:
    """
    Collect the learning processes that target the given language.

    :param language: language to compare against each learning's
        ``learning_language``
    :return: matching learnings, in the iteration order of
        ``self.learnings``; empty list if none match
    """
    matching: list[Learning] = []
    for candidate in self.learnings.values():
        if candidate.learning_language == language:
            matching.append(candidate)
    return matching
7 changes: 5 additions & 2 deletions emmio/sentence/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
__author__ = "Sergey Vartanov"
__email__ = "me@enzet.ru"

from emmio.user.data import UserData


@dataclass
class Sentence:
Expand Down Expand Up @@ -62,8 +64,9 @@ def filter_by_word(
"""
raise NotImplementedError()

# Default length: this implementation unconditionally reports 0.
def __len__(self) -> int:
return 0
def get_most_known(self, user_data: UserData) -> list[SentenceTranslations]:
    """
    Get top most known sentences.

    This is only a stub: it always raises and is expected to be overridden.

    :param user_data: user data passed through to the implementation
    :raises NotImplementedError: always
    """
    raise NotImplementedError


@dataclass
Expand Down
63 changes: 59 additions & 4 deletions emmio/sentence/tatoeba.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
import json
import logging
import os
from dataclasses import dataclass
from dataclasses import dataclass, field
from os.path import join
from pathlib import Path

from emmio.language import Language
from emmio.sentence.core import Sentence, SentenceTranslations, Sentences
from emmio.sentence.database import SentenceDatabase
from emmio.ui import progress_bar
from emmio.user.data import UserData
from emmio.util import download

__author__ = "Sergey Vartanov"
Expand All @@ -25,6 +26,12 @@ class TatoebaSentences(Sentences):
language_2: Language
database: SentenceDatabase

links: dict[int, set[int]] = field(default_factory=dict)
"""
Mapping from sentence identifiers in language 2 to sets of sentence
identifiers in language 1.
"""

def __post_init__(self):
links_cache_path: Path = (
self.path / "cache" / f"links_{self.language_1.get_part3()}_"
Expand Down Expand Up @@ -165,7 +172,10 @@ def fill_cache(self, file_name: str) -> None:
logging.info("Writing word cache...")
json.dump(self.cache, output_file)

def filter_(
# Length is not supported here: calling ``len()`` on this object always
# raises ``NotImplementedError``.
def __len__(self):
raise NotImplementedError()

def filter_by_word(
self,
word: str,
ids_to_skip: set[int],
Expand Down Expand Up @@ -215,5 +225,50 @@ def filter_(
)
return result

# No length is defined for this collection; always raises
# ``NotImplementedError``.
def __len__(self):
raise NotImplementedError()
def get_most_known(self, user_data: UserData) -> list[SentenceTranslations]:
    """
    Interactively drill sentences consisting mostly of known words.

    Every sentence of language 2 is split on single spaces.  Each word is
    lowercased, one trailing ``.``, ``?`` or ``։`` is removed, and the word
    is checked with ``user_data.is_known``.  Sentences with more than one
    word and more than 90% known words are kept and processed in order of
    descending rate.

    For each kept sentence the texts of linked language 1 sentences (if
    any) are printed, followed by the rate and the sentence with every
    character except space, ``.``, ``?`` and ``,`` replaced by ``*``.  Input
    is then read until the user types the sentence exactly; an empty line
    prints the sentence and moves on.

    NOTE(review): the annotation promises a list, but the method has no
    ``return`` statement and therefore yields ``None``.

    :param user_data: source of the ``is_known`` check for each word
    """
    language = self.language_2
    candidates: dict[str, Sentence] = self.database.get_sentences(
        language, self.path / "cache"
    )

    # First pass: rate every sentence by the share of known words.
    scored: list[tuple[str, float]] = []
    for identifier, candidate in candidates.items():
        tokens: list[str] = candidate.text.split(" ")
        known_count: int = 0
        for token in tokens:
            token = token.lower()
            if token.endswith((".", "?", "։")):
                token = token[:-1]
            if user_data.is_known(token, language):
                known_count += 1
        if len(tokens) > 1 and known_count / len(tokens) > 0.9:
            scored.append((identifier, (known_count + 1) / len(tokens)))

    # Second pass: show the best rated sentences first and quiz the user.
    for identifier, score in sorted(scored, key=lambda pair: -pair[1]):
        text = self.database.get_sentence(language, identifier).text
        masked: str = "".join(
            char if char in " .?," else "*" for char in text
        )

        # Links are looked up by the string form of the identifier.
        link_key = str(identifier)
        if link_key in self.links and self.links[link_key]:
            for translation_id in self.links[link_key]:
                print(
                    " ",
                    self.database.get_sentence(
                        self.language_1, translation_id
                    ).text,
                )
        print(f"{score:.2f} {masked}")

        # Keep asking until the exact text is typed; an empty answer
        # reveals the sentence instead.
        while (answer := input()) != text:
            if answer == "":
                print(text)
                break

0 comments on commit d18d0b2

Please sign in to comment.