## Extracting terms, their translations and definitions

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd '/content/drive/My Drive/HSE/projects/thesis'

/content/drive/My Drive/HSE/projects/thesis


### Extracting terms and their definitions from the Linguistic Encyclopedic Dictionary

In [3]:
import requests
from bs4 import BeautifulSoup, NavigableString
import pandas as pd
from urllib.parse import urljoin
from tqdm import tqdm
import re
import json

In [None]:
def clean_text(text):
    """
    Replaces line breaks with spaces, removes \xa0 and \xad characters.
    """
    text = text.replace('\n', ' ')
    text = text.replace('\xa0', ' ').replace('\xad', '')
    return text

def post_process_text(text):
    """
    Removes spaces before punctuation marks.
    """
    text = re.sub(r'\s+([,.:;!?])', r'\1', text)
    return text

def extract_text_with_spaces(element):
    """
    Extracts and clears text from HTML, preserving the spaces before the text in <a> tags.
    """
    text_parts = []
    for child in element.children:
        if isinstance(child, NavigableString):
            cleaned_text = clean_text(child)
            text_parts.append(cleaned_text)
        elif child.name == 'a':
            link_text = clean_text(child.get_text(strip=True))
            text_parts.append(link_text)
        else:
            # Recursively process other elements, if any
            text_parts.append(extract_text_with_spaces(child))
    return ''.join(text_parts)

def extract_text_from_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p', class_=lambda x: x not in ['podpisj', 'navigation'])
    text = ' '.join(extract_text_with_spaces(p) for p in paragraphs)
    return text

def extract_terms_and_texts(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    terms_texts = []
    h3 = soup.find('h3', text="Алфавитная словарная часть")
    if h3:
        clist_div = h3.find_next_sibling('div', class_='clist')
        if clist_div:
            for a_tag in tqdm(clist_div.find_all('a')):
                term = a_tag.get_text(strip=True)
                link = urljoin(base_url, a_tag['href'])  # Convert relative URL to absolute
                text = extract_text_from_page(link)  # Extract text from a page
                terms_texts.append((term, link, text))
    return terms_texts

In [None]:
extract_text_from_page('https://tapemark.narod.ru/les/009a.html')

'один из абхазско-адыгских языков. Распространён главным образом в Карачаево-Черкесской АО. Число говорящих свыше 27 тыс. чел. (1979, перепись). Имеет 2 диалекта: тапантский (лежит в основе литературного языка) и ашхарский (по фонетической системе и грамматическому строю близок к абхазскому языку). Отличается от абхазского языка фонетикой (переход дв, тв, тӀв в джв, чв, чӀв, сохранение спирантов гӀ, гӀв и смычных хъ, хъв и др.), образованием указательных местоимений, порядковых и кратных (кратностных) числительных, употреблением послелогов, формами времён и наклонений, отрицательными формами глагола, образованием деепричастий, наречий и т. д. Для А. я. характерны многочисленные заимствования из кабардино-черкесского языка. Письменность создана в 1932—33 на основе латинской графики, с 1938 переведена на русскую графическую основу. Литературный язык после Октябрьской революции 1917 получил интенсивное развитие.'

In [None]:
base_url = 'https://tapemark.narod.ru/les/'
terms_texts = extract_terms_and_texts(base_url)
for term, link, text in terms_texts[:5]:  # Display the first 5 terms and texts for example
    print(f"Термин: {term}, Ссылка: {link}\nТекст: {text[:100]}...")

  h3 = soup.find('h3', text="Алфавитная словарная часть")
100%|██████████| 1399/1399 [15:57<00:00,  1.46it/s]

Термин: Абазинский язык, Ссылка: https://tapemark.narod.ru/les/009a.html
Текст: один из абхазско-адыгских языков. Распространён главным образом в Карачаево-Черкесской АО. Число гов...
Термин: Аббревиатура, Ссылка: https://tapemark.narod.ru/les/009b.html
Текст: (итал. abbreviatura, от лат. abbrevio — сокращаю) — существительное, состоящее из усечённых слов, вх...
Термин: Аблаут, Ссылка: https://tapemark.narod.ru/les/009c.html
Текст: (нем. Ablaut) (апофония) — разновидность чередования гласных, фонетически не обусловленного и выража...
Термин: Абсолютная конструкция, Ссылка: https://tapemark.narod.ru/les/010a.html
Текст: см. Эргативный строй....
Термин: Абхазский язык, Ссылка: https://tapemark.narod.ru/les/010b.html
Текст: один из абхазско-адыгских языков. Распространён преимущественно в Абхазской АССР, за пределами СССР ...





### Extracting translations and definitions from the Wikipedia API

In [None]:
term = 'Абазинский язык'

search_url = f"https://ru.wikipedia.org/w/api.php"
params = {
    'action': 'query',
    'list': 'search',
    'srsearch': term,
    'format': 'json'
}
response = requests.get(search_url, params=params)
results = response.json().get('query', {}).get('search', [])
if results:
    print(results[0])
    print(type(results[0]))

{'ns': 0, 'title': 'Абазинский язык', 'pageid': 14994, 'size': 24105, 'wordcount': 983, 'snippet': '<span class="searchmatch">Абази́нский</span> <span class="searchmatch">язы́к</span> (абаза бызшва) — <span class="searchmatch">язык</span> абазин, относится к абхазо-<span class="searchmatch">абазинской</span> ветви абхазо-адыгской группы <span class="searchmatch">языков</span> Кавказа. Распространён в Карачаево-Черкесии', 'timestamp': '2023-12-20T22:33:47Z'}
<class 'dict'>


In [5]:
def search_wikipedia(term, lang='ru'):
    search_url = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'list': 'search',
        'srsearch': term,
        'format': 'json'
    }
    response = requests.get(search_url, params=params)
    results = response.json().get('query', {}).get('search', [])
    if results:
        return results[0]['title']
    else:
        return None

def get_interlanguage_link(title, from_lang='ru', to_lang='en'):
    url = f"https://{from_lang}.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'titles': title,
        'prop': 'langlinks',
        'lllang': to_lang,
        'format': 'json'
    }
    response = requests.get(url, params=params)
    pages = response.json().get('query', {}).get('pages', {})
    for page_id, page in pages.items():
        langlinks = page.get('langlinks', [])
        if langlinks:
            return langlinks[0]['*']
    return None

def get_wikipedia_summary(title, lang='ru'):
    url = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'format': 'json',
        'titles': title,
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }
    response = requests.get(url, params=params)
    pages = response.json().get('query', {}).get('pages', {})
    for page_id, page in pages.items():
        return page.get('extract', None)
    return None

### Data collection

In [None]:
url = 'https://tapemark.narod.ru/les/'
data = []

for term, link, ru_meaning in tqdm(terms_texts):
    ru_title = search_wikipedia(term, 'ru')
    en_title = get_interlanguage_link(ru_title, 'ru', 'en') if ru_title else None
    ru_summary = get_wikipedia_summary(ru_title, 'ru') if ru_title else None
    en_summary = get_wikipedia_summary(en_title, 'en') if en_title else None
    data.append({
        'Термин': term,
        'Определение (RU)': ru_meaning,
        'Перевод (EN)': en_title,
        'Отрывок из Википедии (RU)': ru_summary,
        'Отрывок из Википедии (EN)': en_summary,
        'Ссылка (RU)': f"https://ru.wikipedia.org/wiki/{'_'.join(ru_title.split())}" if ru_title else None,
        'Ссылка (EN)': f"https://en.wikipedia.org/wiki/{'_'.join(en_title.split())}" if en_title else None
    })

100%|██████████| 1399/1399 [14:03<00:00,  1.66it/s]


In [None]:
df = pd.DataFrame(data)
df.to_csv('linguistics.csv', index=False)

### Physics Terms

In [None]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.content, 'html.parser')

spans = soup.find_all("span", title="физическое")

for span in spans:
    parent_li = span.find_parent("li")
    if parent_li:
        print(parent_li.get_text())

In [None]:
import requests
from bs4 import BeautifulSoup

def get_term_details(term_url):
    response = requests.get(term_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find English translation
    en_translation = soup.find("span", lang="en")
    ru_title = search_wikipedia(term, 'ru')
    if en_translation:
        translation_text = en_translation.get_text()
    else:
        translation_text = get_interlanguage_link(ru_title, 'ru', 'en') if ru_title else None
    end_index = translation_text.find('(') if translation_text else None
    if end_index == -1:
        end_index = len(translation_text)
    if translation_text:
        translation_text = translation_text[:end_index]

    # Find the definition
    definition_span = soup.find("span", title="физическое")
    examples = soup.find_all("span", class_="example-block")
    if definition_span:
        parent_li = definition_span.find_parent("li")
        definition_text = parent_li.get_text() if parent_li else None
    elif len(examples) == 1:
        parent_li = examples[0].find_parent("li")
        definition_text = parent_li.get_text() if parent_li else None
    else:
        definition_text = get_wikipedia_summary(ru_title, 'ru') if ru_title else None

    return translation_text, definition_text

def get_terms_from_pages(base_url, start_path):
    current_path = start_path
    terms = []

    while current_path:
        current_url = base_url + current_path
        print(f"Обрабатываем {current_url}")
        response = requests.get(current_url)
        soup = BeautifulSoup(response.content, 'html.parser')

        mw_pages_div = soup.find("div", id="mw-pages")
        if mw_pages_div:
            category_div = mw_pages_div.find("div", class_="mw-category mw-category-columns")
            if category_div:
                terms_links = category_div.find_all("a", href=True)
                for link in terms_links:
                    if '/wiki/' in link['href']:
                        terms.append((link.text, base_url + link['href']))

        next_page_link = mw_pages_div.find("a", string="Следующая страница") if mw_pages_div else None
        if next_page_link and next_page_link.get("href"):
            current_path = next_page_link.get("href")
        else:
            current_path = None

    return terms

In [None]:
data = []

base_url = "https://ru.wiktionary.org"
start_path = "/wiki/Категория:Физические_термины/ru"
terms = get_terms_from_pages(base_url, start_path)

for term, link in tqdm(terms):
    translation, definition = get_term_details(link)
    data.append({
        'Термин': term,
        'Определение (RU)': definition,
        'Перевод (EN)': translation,
        'Ссылка (RU)': f"https://ru.wiktionary.org/wiki/{'_'.join(term.split())}",
        'Ссылка (EN)': f"https://en.wiktionary.org/wiki/{'_'.join(translation.split(', ')[0].split())}" if translation else None
    })

Обрабатываем https://ru.wiktionary.org/wiki/Категория:Физические_термины/ru
Обрабатываем https://ru.wiktionary.org/w/index.php?title=%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:%D0%A4%D0%B8%D0%B7%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%B8%D0%B5_%D1%82%D0%B5%D1%80%D0%BC%D0%B8%D0%BD%D1%8B/ru&pagefrom=%D0%B2%D1%8B%D1%80%D0%BE%D0%B6%D0%B4%D0%B5%D0%BD%D0%BD%D1%8B%D0%B9%0A%D0%B2%D1%8B%D1%80%D0%BE%D0%B6%D0%B4%D0%B5%D0%BD%D0%BD%D1%8B%D0%B9#mw-pages
Обрабатываем https://ru.wiktionary.org/w/index.php?title=%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:%D0%A4%D0%B8%D0%B7%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%B8%D0%B5_%D1%82%D0%B5%D1%80%D0%BC%D0%B8%D0%BD%D1%8B/ru&pagefrom=%D0%B8%D0%B7%D0%BE%D1%84%D0%BE%D1%82%D0%B0%0A%D0%B8%D0%B7%D0%BE%D1%84%D0%BE%D1%82%D0%B0#mw-pages
Обрабатываем https://ru.wiktionary.org/w/index.php?title=%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:%D0%A4%D0%B8%D0%B7%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%B8%D0%B5_%D1%82%D0%B5%D1%80%D0%BC%D0%B8%D0%BD%D1%8B/ru&pagefro

100%|██████████| 1282/1282 [09:38<00:00,  2.22it/s]


In [None]:
df = pd.DataFrame(data)
df.to_csv('physics.csv', index=False)

In [None]:
csv_file = 'physics.csv'
df = pd.read_csv(csv_file)

term_definitions = {}

for index, row in df.iterrows():
    term_en = row['Перевод (EN)']
    term_ru = row['Термин']
    excerpt_en = get_wikipedia_summary(term_en, lang='en')
    excerpt_ru = get_wikipedia_summary(term_ru, lang='ru')
    term_definitions[term_en] = [excerpt_en, excerpt_ru]

In [23]:
with open('term_definitions.json', 'w') as file:
    json.dump(term_definitions, file)

In [5]:
with open('term_definitions.json', 'r') as file:
    term_definitions = json.load(file)

In [10]:
import sqlite3

conn = sqlite3.connect('phys_ParaCrawl.db')
cursor = conn.cursor()

# Update the terms table with the fetched definitions
for term, definition in term_definitions.items():
    cursor.execute('''
        UPDATE terms
        SET excerpt_en = ?, excerpt_ru = ?
        WHERE translation_en = ?
    ''', (definition[0], definition[1], term))

conn.commit()
conn.close()

print("Database updated successfully.")

Database updated successfully.


## Creating a database with terms

In [4]:
import sqlite3
import csv

#db_path = 'ling_ParaCrawl.db'
#db_path = 'ling_ted.db'
#db_path = 'phys_ParaCrawl.db'
#db_path = 'phys_ted.db'
#db_path = 'ling_cyberleninka.db'
db_path = 'phys_cyberleninka.db'

conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Creating a table with terms
cursor.execute('''
CREATE TABLE IF NOT EXISTS terms (
    id INTEGER PRIMARY KEY,
    term TEXT,
    definition_ru TEXT,
    translation_en TEXT,
    excerpt_ru TEXT,
    excerpt_en TEXT,
    link_ru TEXT,
    link_en TEXT
)
''')

# Creating a table with sentences and their translations
cursor.execute('''
CREATE TABLE IF NOT EXISTS sentences (
    id INTEGER PRIMARY KEY,
    context_ru TEXT,
    context_en TEXT,
    lemmas_ru TEXT,
    lemmas_en TEXT
)
''')

# Creating a table with terms and their contexts
cursor.execute('''
CREATE TABLE IF NOT EXISTS contexts (
    id INTEGER PRIMARY KEY,
    term_id INTEGER,
    sentence_id INTEGER,
    FOREIGN KEY (term_id) REFERENCES terms (id),
    FOREIGN KEY (sentence_id) REFERENCES sentences (id)
)
''')

conn.commit()

In [None]:
def import_terms_from_csv(csv_file_path):
    with open(csv_file_path, encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            cursor.execute('''
            INSERT INTO terms (term, definition_ru, translation_en, excerpt_ru, excerpt_en, link_ru, link_en)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', (row['Термин'], row['Определение (RU)'], row['Перевод (EN)'], row['Отрывок из Википедии (RU)'], row['Отрывок из Википедии (EN)'], row['Ссылка (RU)'], row['Ссылка (EN)']))
    conn.commit()

csv_path = 'linguistics.csv'
import_terms_from_csv(csv_path)
conn.close()

In [5]:
def import_terms_from_csv(csv_file_path):
    with open(csv_file_path, encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            cursor.execute('''
            INSERT INTO terms (term, definition_ru, translation_en, link_ru, link_en)
            VALUES (?, ?, ?, ?, ?)
            ''', (row['Термин'], row['Определение (RU)'], row['Перевод (EN)'], row['Ссылка (RU)'], row['Ссылка (EN)']))
    conn.commit()

csv_path = 'physics.csv'
import_terms_from_csv(csv_path)
conn.close()

## Извлечение контекстов и их переводов для выбранных терминов

### Функция лемматизации текста

In [4]:
#! pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=784da1e545170296e2bd23becd273aa2de705aeeb56d4578a4fb11e82ac27b00
  Stored in directory: /root/.

In [5]:
from collections import defaultdict
import nltk
import spacy
import pymorphy2
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')

# Loading SpaCy models
nlp_en = spacy.load("en_core_web_sm")

# Initializing tools for Russian language
morph_ru = pymorphy2.MorphAnalyzer()
tokenizer = RegexpTokenizer('\w+')

# Stop words for both languages
stops_ru = set(stopwords.words('russian'))
stops_en = set(stopwords.words('english'))

def lemmatize(text, lang='en'):
    # For English language
    if lang == 'en':
        doc = nlp_en(text)
        lemmas = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and token.lemma_ != '-PRON-']
    # For Russian language
    elif lang == 'ru':
        tokens = tokenizer.tokenize(text.lower())
        lemmas = [morph_ru.parse(word)[0].normal_form for word in tokens if word not in stops_ru]
    else:
        raise ValueError("Unsupported language")
    return lemmas

text_en = "Dogs are running in the park."
text_ru = "Собаки бегают в парке."

lemmas_en = lemmatize(text_en, lang='en')
lemmas_ru = lemmatize(text_ru, lang='ru')

print("EN lemmas:", lemmas_en)
print("RU lemmas:", lemmas_ru)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


EN lemmas: ['dogs', 'run', 'park']
RU lemmas: ['собака', 'бегать', 'парка']


### Reading a TMX file

In [None]:
import xml.etree.ElementTree as ET

def write_sentences_to_file(tmx_file_path, output_file_path):
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for event, elem in ET.iterparse(tmx_file_path, events=('end',)):
            if elem.tag.endswith('tu'):
                source_text, target_text = None, None
                for tuv in elem:
                    if tuv.tag.endswith('tuv'):
                        lang = tuv.attrib['{http://www.w3.org/XML/1998/namespace}lang']
                        text = tuv.find('.//{*}seg').text
                        if lang.startswith('ru'):
                            source_text = text
                        elif lang.startswith('en'):
                            target_text = text

                if source_text and target_text:
                    output_file.write(f"{source_text}\t{target_text}\n")

                elem.clear()

#tmx_file_path = 'paracrawl_ru_en.tmx'
tmx_file_path = 'en-ru-ted.tmx'
output_file_path = 'ted_sentences.txt'
write_sentences_to_file(tmx_file_path, output_file_path)

### Data to csv

In [None]:
def get_last_processed_id(checkpoint_path):
    try:
        with open(checkpoint_path, 'r', encoding='utf-8') as f:
            last_id = int(f.read().strip())
    except FileNotFoundError:
        last_id = 0  # If the file is not found, start from the beginning
    return last_id

def save_last_processed_id(checkpoint_path, last_id):
    with open(checkpoint_path, 'w', encoding='utf-8') as f:
        f.write(str(last_id))

In [6]:
import csv
from tqdm import tqdm

def data_to_csv(file_path, csv_path, checkpoint_path):
    start_id = get_last_processed_id(checkpoint_path) + 1  # Continue from the last saved ID

    with open(file_path, 'r', encoding='utf-8') as file, \
         open(csv_path, 'a', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter='\t')

        # csv_writer.writerow(['id', 'context_ru', 'context_en', 'lemmas_ru', 'lemmas_en'])

        next(file)  # Skip the header

        flush_every = 1000  # Set the interval for forced recording
        line_count = 0  # Counter of processed lines

        for _ in range(start_id - 1):
            next(file)

        for i, line in enumerate(tqdm(file, initial=start_id), start=start_id):
            context_ru, context_en = line.strip().split('\t', 1)
            lemmas_ru = ' '.join(lemmatize(context_ru, lang='ru'))
            lemmas_en = ' '.join(lemmatize(context_en))
            csv_writer.writerow([i, context_ru, context_en, lemmas_ru, lemmas_en])

            line_count += 1
            if line_count % flush_every == 0:
                csv_file.flush()  # Force writing data to disk
                save_last_processed_id(checkpoint_path, i)  # Save the last processed ID
        csv_file.flush()

        save_last_processed_id(checkpoint_path, i)  # Save last line ID on exit

In [7]:
#file_path = 'output_sentences.txt'
#file_path = 'ted_sentences.txt'
#file_path = 'translated_ling.txt'
file_path = 'translated_phys.txt'
#csv_path = 'PC_contexts.csv'
#csv_path = 'ted_contexts.csv'
#csv_path = 'CL_ling_contexts.csv'
csv_path = 'CL_phys_contexts.csv'
checkpoint_path = 'checkpoint.txt'
data_to_csv(file_path, csv_path, checkpoint_path)

5918it [01:42, 57.92it/s]


### CSV to DB

In [8]:
import sqlite3
from tqdm import tqdm
import csv

def csv_to_db(csv_path, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    total_lines = sum(1 for line in open(csv_path, 'r', encoding='utf-8')) - 1  # Subtract one header line

    with open(csv_path, 'r', encoding='utf-8') as csv_file:
        reader = csv.reader(csv_file, delimiter='\t')
        next(reader)  # Skip the header

        for line in tqdm(reader, total=total_lines, unit='line'):
            i, context_ru, context_en, lemmas_ru, lemmas_en = line
            cursor.execute('INSERT INTO sentences (id, context_ru, context_en, lemmas_ru, lemmas_en) VALUES (?, ?, ?, ?, ?)',
             (i, context_ru, context_en, lemmas_ru, lemmas_en))

            if (total_lines + 1) % 1000 == 0:
                conn.commit()

    conn.commit()
    conn.close()

In [9]:
#db_path = 'ling_ParaCrawl.db'
#db_path = 'ling_ted.db'
#db_path = 'phys_ParaCrawl.db'
#db_path = 'phys_ted.db'
#db_path = 'ling_cyberleninka.db'
db_path = 'phys_cyberleninka.db'
#csv_path = 'PC_contexts.csv'
#csv_path = 'ted_contexts.csv'
#csv_path = 'CL_ling_contexts.csv'
csv_path = 'CL_phys_contexts.csv'
csv_to_db(csv_path, db_path)

100%|██████████| 5916/5916 [00:00<00:00, 18940.81line/s]


In [18]:
import sqlite3

def count_sentences(db_path, table_name):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    query = f'SELECT COUNT(*) FROM {table_name}'
    cursor.execute(query)
    count = cursor.fetchone()[0]

    cursor.close()
    conn.close()

    return count

In [None]:
#db_path = 'ling_ParaCrawl.db'
#db_path = 'phys_ParaCrawl.db'
db_path = 'ling_cyberleninka.db'
table_name = 'sentences'
sentence_count = count_sentences(db_path, table_name)
print(f"Total sentences in the table '{table_name}': {sentence_count}")

### Applying lemmatization and creating a reverse index



In [10]:
from collections import defaultdict

def create_reverse_index_from_db(sentences_with_ids):
    reverse_index = defaultdict(list)

    for sentence_id, lemmas_ru in sentences_with_ids:
        for lemma in lemmas_ru.split():
            reverse_index[lemma].append(sentence_id)

    return reverse_index

### Using a reverse index


In [11]:
import sqlite3

def get_source_text_with_ids(db_path, limit=100000, offset=0):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    cursor.execute('SELECT id, lemmas_ru FROM sentences LIMIT ? OFFSET ?', (limit, offset))
    sentences_with_ids = cursor.fetchall()

    cursor.close()
    conn.close()

    return sentences_with_ids

In [12]:
# Retrieving data from the database
#db_path = 'ling_ParaCrawl.db'
#db_path = 'ling_ted.db'
#db_path = 'phys_ParaCrawl.db'
#db_path = 'phys_ted.db'
#db_path = 'ling_cyberleninka.db'
db_path = 'phys_cyberleninka.db'

sentences_with_ids = get_source_text_with_ids(db_path, limit=500000)
print(f"Loaded {len(sentences_with_ids)} pairs of IDs and Russian sentences.")

Loaded 5916 pairs of IDs and Russian sentences.


In [13]:
def process_all_data_and_collect(db_path):
    batch_size = 500000
    offset = 0
    all_sentences = []

    while True:
        data_batch = get_source_text_with_ids(db_path, limit=batch_size, offset=offset)

        if not data_batch:
            break

        all_sentences.extend(data_batch)

        offset += batch_size
        print(f"Processed {offset} rows so far...")

    return all_sentences

In [None]:
db_path = 'phys_ParaCrawl.db'
all_sentences = process_all_data_and_collect(db_path)
print(f"Total sentences collected: {len(all_sentences)}")

Processed 500000 rows so far...
Processed 1000000 rows so far...
Processed 1500000 rows so far...
Processed 2000000 rows so far...
Processed 2500000 rows so far...
Processed 3000000 rows so far...
Processed 3500000 rows so far...
Processed 4000000 rows so far...
Processed 4500000 rows so far...
Processed 5000000 rows so far...
Processed 5500000 rows so far...
Total sentences collected: 5377348


In [14]:
reverse_index = create_reverse_index_from_db(all_sentences)

In [15]:
print(len(reverse_index))

10957


In [16]:
from tqdm import tqdm

#db_path = 'ling_ParaCrawl.db'
#db_path = 'ling_ted.db'
#db_path = 'phys_ParaCrawl.db'
#db_path = 'phys_ted.db'
#db_path = 'ling_cyberleninka.db'
db_path = 'phys_cyberleninka.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

cursor.execute('SELECT id, term FROM terms')
terms = cursor.fetchall()

contexts_to_insert = []

for term_id, term in tqdm(terms):
    # Lemmatize the phrase
    term_lemmas = lemmatize(term)
    lemmatized_term = ' '.join(term_lemmas)

    # Find all unique sentences that contain at least one word from the phrase
    potential_sentence_ids = set()
    for word in term_lemmas:
        if word in reverse_index:
            potential_sentence_ids.update(reverse_index[word])

    # Check each potential sentence for the entire phrase
    for sentence_id in potential_sentence_ids:
        cursor.execute('SELECT lemmas_ru FROM sentences WHERE id = ?', (sentence_id,))
        sentence = cursor.fetchone()[0]
        if lemmatized_term in sentence:
            contexts_to_insert.append((term_id, sentence_id))

cursor.executemany('INSERT INTO contexts (term_id, sentence_id) VALUES (?, ?)', contexts_to_insert)

conn.commit()
conn.close()

100%|██████████| 1282/1282 [00:19<00:00, 67.47it/s]


In [19]:
#db_path = 'ling_ParaCrawl.db'
#db_path = 'phys_ParaCrawl.db'
#db_path = 'phys_ted.db'
#db_path = 'ling_cyberleninka.db'
db_path = 'phys_cyberleninka.db'
table_name = 'contexts'
sentence_count = count_sentences(db_path, table_name)
print(f"Total sentences in the table '{table_name}': {sentence_count}")

Total sentences in the table 'contexts': 10068


### Filtering contexts by translations



In [64]:
import sqlite3

#db_path = 'ling_ParaCrawl.db'
#db_path = 'ling_ted.db'
#db_path = 'phys_ParaCrawl.db'
#db_path = 'phys_ted.db'
db_path = 'ling_cyberleninka.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [66]:
cursor.execute("SELECT id, translation_en FROM terms")
data_from_db = cursor.fetchall()

In [68]:
lemmatized_terms = [(id, ' '.join(lemmatize(translation_en))) for id, translation_en in data_from_db]

cursor.execute("CREATE TEMPORARY TABLE en_lemmas (id INTEGER, lemmas_en TEXT)")
cursor.executemany("INSERT INTO en_lemmas (id, lemmas_en) VALUES (?, ?)", lemmatized_terms)

<sqlite3.Cursor at 0x7c5688eb43c0>

In [69]:
cursor.execute("""
SELECT c.*
FROM contexts c
JOIN terms t ON c.term_id = t.id
JOIN sentences s ON c.sentence_id = s.id
JOIN en_lemmas el ON t.id = el.id
WHERE s.lemmas_en LIKE '%' || el.lemmas_en || '%'
""")

<sqlite3.Cursor at 0x7c5688eb43c0>

In [70]:
filtered_contexts = cursor.fetchall()

if filtered_contexts:
    cursor.execute("DELETE FROM contexts")

    placeholders = ', '.join(['?'] * len(filtered_contexts[0]))
    insert_query = f"INSERT INTO contexts VALUES ({placeholders})"

    cursor.executemany(insert_query, filtered_contexts)

    conn.commit()

In [71]:
cursor.execute("SELECT * FROM contexts")
new_contexts = cursor.fetchall()

In [72]:
len(new_contexts)

7307

### Перевод тематических текстов

In [6]:
import re
from tqdm import tqdm

def preprocess_article(article):
    # Removing square brackets and their content
    article = re.sub(r'\[.*?\]', '', article)

    # Removing lines that start with a dotted number and end with a date
    article = re.sub(r'^\d+\..*?\d{4}\.', '', article, flags=re.MULTILINE)

    # Removing lines containing only Latin characters, punctuation and numbers
    article = re.sub(r'^[a-zA-Z0-9\s.,!?;-]*$', '', article, flags=re.MULTILINE)

    # Removing short lines (less than 10 characters)
    article = re.sub(r'^.{1,9}$', '', article, flags=re.MULTILINE)

    # Search and remove all text up to "Key words:" or "Ключевые слова:", including that line up to the first dot
    match = re.search(r'\s(Key words:.*?\.|\nКлючевые слова:.*?\.)', article, flags=re.DOTALL)
    if match:
        start_pos = match.end()
        article = article[start_pos:]

    # Removing text starting with "СПИСОК ЛИТЕРАТУРЫ" or "ЛИТЕРАТУРА", including these lines
    article = re.split(r'\sСПИСОК ЛИТЕРАТУРЫ\n\s*|\sЛИТЕРАТУРА\n\s*|\sЛИТЕРАТУРА\n\s*|\sБИБЛИОГРАФИЧЕСКИЙ СПИСОК\n\s*|\sПРИМЕЧАНИЯ\n\s*', article, flags=re.IGNORECASE | re.DOTALL, maxsplit=1)[0]

    # Removing empty paragraphs
    article = re.sub(r'\n\s*\n', '\n', article)

    # Concatenate lines where the line does not end with punctuation with the next line
    article = re.sub(r'([^\.\?\!])\n', r'\1 ', article)

    # Joining words separated by a hyphen
    article = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', article)
    article = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', article)

    # Removing extra line breaks and spaces
    article = re.sub(r'\s+', ' ', article).strip()

    return article

In [8]:
def process_file(file_path, output_path):
    with open(file_path, 'r', encoding='utf-8') as file, \
         open(output_path, 'w', encoding='utf-8') as output_file:

        article = ""
        article_started = False  # Flag to track the beginning of the article
        for line in tqdm(file):
            if line.strip() == "=====":
                if article_started and article:
                    processed_article = preprocess_article(article)
                    output_file.write(processed_article + "\n\n")
                    article = ""
                article_started = True  # Start a new article after meeting the separator
            else:
                if article_started:
                    article += line

        if article_started and article:
            processed_article = preprocess_article(article)
            output_file.write(processed_article + "\n\n")

#file_path = 'all_cyberleninka_lang2.txt'
file_path = 'all_cyberleninka_physics2.txt'
#output_path = 'preprocessed_ling.txt'
output_path = 'preprocessed_phys.txt'

process_file(file_path, output_path)

10605680it [05:41, 31016.60it/s]


In [9]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')

#input_file_path = 'preprocessed_ling.txt'
input_file_path = 'preprocessed_phys.txt'
#intermediate_file_path = 'sentences_ling.txt'
intermediate_file_path = 'sentences_phys.txt'

with open(input_file_path, 'r', encoding='utf-8') as input_file, \
     open(intermediate_file_path, 'w', encoding='utf-8') as intermediate_file:
    for line in input_file:
        line = line.strip()
        if line:  # Skip empty lines
            sentences = sent_tokenize(line, language='russian')
            for sentence in sentences:
                intermediate_file.write(sentence + "\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def is_valid_sentence(sentence):
    if re.search(r'\b(?:19|20)\d{2}\b', sentence):  # Looking for years
        return False
    if re.search(r'\b№\s*\d+', sentence):  # Looking for numbers
        return False
    if re.search(r'\bС\.\s*\d+', sentence):  # Looking for links to pages
        return False
    if re.search(r'^\d+\.$', sentence.strip()):  # Looking for lines consisting only of a number with a dot
        return False
    if re.match(r'^\d+\.', sentence.strip()):  # Looking for lines starting with a number and a period
        return False
    return True

#input_file_path = 'sentences_ling.txt'
input_file_path = 'sentences_phys.txt'
#output_file_path = 'filtered_ling.txt'
output_file_path = 'filtered_phys.txt'

with open(input_file_path, 'r', encoding='utf-8') as input_file, \
     open(output_file_path, 'w', encoding='utf-8') as output_file:
    for line in input_file:
        if is_valid_sentence(line):
            output_file.write(line)

In [4]:
!pip install nltk transformers



In [5]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import torch

model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

src_lang = "ru"
tgt_lang = "en"
tokenizer.src_lang = src_lang

def translate(text):
    encoded = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang))
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

In [None]:
translate('Особого внимания в газете требуют к себе такие речевые образования, как заголовки, подзаголовки, вводки, рубрики.')

'Special attention in the newspaper requires such language education as headlines, subtitles, introductions, rubrics.'

In [6]:
from tqdm import tqdm
import os

#translated_file_path = 'translated_ling.txt'
translated_file_path = 'translated_phys.txt'

lines_already_translated = 0
if os.path.exists(translated_file_path):
    with open(translated_file_path, 'r', encoding='utf-8') as file:
        lines_already_translated = sum(1 for _ in file)

In [7]:
lines_already_translated

8071

In [None]:
#filtered_file_path = 'filtered_ling.txt'
filtered_file_path = 'filtered_phys.txt'

with open(filtered_file_path, 'r', encoding='utf-8') as intermediate_file, \
     open(translated_file_path, 'a', encoding='utf-8') as translated_file:
    if lines_already_translated > 0:
        for _ in range(lines_already_translated):
            next(intermediate_file)  # Skip already translated lines
    for sentence in tqdm(intermediate_file):
        sentence = sentence.strip()
        if sentence:
            translation = translate(sentence)
            translated_file.write(sentence + "\t" + translation + "\n")

1973it [6:49:43, 20.63s/it]

### Statistics

In [5]:
import sqlite3
import pandas as pd
import os
from tqdm import tqdm

def compute_statistics(db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # 1. Count the total number of tokens in Russian lemmas
    query = "SELECT lemmas_ru FROM sentences"
    cursor.execute(query)
    russian_lemmas = cursor.fetchall()

    print('Russian lemmas\n')
    total_russian_tokens = 0
    for row in tqdm(russian_lemmas):
        lemmas = row[0]
        if lemmas:
            tokens = lemmas.split()
            total_russian_tokens += len(tokens)

    # 2. Count the total number of tokens in English lemmas
    query = "SELECT lemmas_en FROM sentences"
    cursor.execute(query)
    english_lemmas = cursor.fetchall()

    print('English lemmas\n')
    total_english_tokens = 0
    for row in tqdm(english_lemmas):
        lemmas = row[0]
        if lemmas:
            tokens = lemmas.split()
            total_english_tokens += len(tokens)

    # 3. Calculate the average number of contexts per term
    query = "SELECT term_id, COUNT(*) as context_count FROM contexts GROUP BY term_id"
    cursor.execute(query)
    contexts_per_term = cursor.fetchall()
    total_terms_with_contexts = len(contexts_per_term)
    total_contexts = sum([count for term_id, count in contexts_per_term])

    average_contexts_per_term = total_contexts / total_terms_with_contexts if total_terms_with_contexts > 0 else 0

    # 4. Calculate the average number of tokens in Russian lemmas and English lemmas
    query = "SELECT lemmas_ru, lemmas_en FROM sentences"
    cursor.execute(query)
    lemmas = cursor.fetchall()

    total_russian_token_count = 0
    total_english_token_count = 0
    russian_lemma_count = 0
    english_lemma_count = 0

    print('Average # of lemmas\n')
    for lemmas_ru, lemmas_en in tqdm(lemmas):
        if lemmas_ru:
            russian_tokens = lemmas_ru.split()
            total_russian_token_count += len(russian_tokens)
            russian_lemma_count += 1
        if lemmas_en:
            english_tokens = lemmas_en.split()
            total_english_token_count += len(english_tokens)
            english_lemma_count += 1

    average_russian_tokens = total_russian_token_count / russian_lemma_count if russian_lemma_count > 0 else 0
    average_english_tokens = total_english_token_count / english_lemma_count if english_lemma_count > 0 else 0

    # 5. Count the number of terms that have contexts
    query = "SELECT COUNT(DISTINCT term_id) FROM contexts"
    cursor.execute(query)
    terms_with_contexts_count = cursor.fetchone()[0]

    conn.close()

    stats = {
        'Database': [db_path],
        'Total Russian Tokens': [total_russian_tokens],
        'Total English Tokens': [total_english_tokens],
        'Average Contexts per Term': [average_contexts_per_term],
        'Average Russian Tokens per Lemma': [average_russian_tokens],
        'Average English Tokens per Lemma': [average_english_tokens],
        'Number of Terms with Contexts': [terms_with_contexts_count]
    }

    return stats

def save_statistics_to_csv(stats, csv_output):
    df_stats = pd.DataFrame(stats)

    if os.path.exists(csv_output):
        df_existing = pd.read_csv(csv_output)
        df_combined = pd.concat([df_existing, df_stats], ignore_index=True)
        df_combined.to_csv(csv_output, index=False)
    else:
        df_stats.to_csv(csv_output, index=False)

    print(f"Statistics saved to {csv_output}")

In [6]:
db_paths = ['phys_ted.db', 'ling_cyberleninka.db', 'phys_cyberleninka.db']  # List of database paths
csv_output = 'all_stats.csv'

for db_path in db_paths:
    stats = compute_statistics(db_path)
    save_statistics_to_csv(stats, csv_output)

Russian lemmas



100%|██████████| 380267/380267 [00:00<00:00, 695326.27it/s]


English lemmas



100%|██████████| 380267/380267 [00:00<00:00, 1146006.29it/s]


Average # of lemmas



100%|██████████| 380267/380267 [00:00<00:00, 458250.73it/s]


Statistics saved to all_stats.csv
Russian lemmas



100%|██████████| 11031/11031 [00:00<00:00, 385048.00it/s]


English lemmas



100%|██████████| 11031/11031 [00:00<00:00, 634825.71it/s]


Average # of lemmas



100%|██████████| 11031/11031 [00:00<00:00, 277071.67it/s]

Statistics saved to all_stats.csv





Russian lemmas



100%|██████████| 5916/5916 [00:00<00:00, 243324.50it/s]


English lemmas



100%|██████████| 5916/5916 [00:00<00:00, 465352.06it/s]


Average # of lemmas



100%|██████████| 5916/5916 [00:00<00:00, 232658.58it/s]

Statistics saved to all_stats.csv



