In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from transliterate import translit
from googletrans import Translator

import gzip

## Load Data

In [8]:
METADATA_DIR = '/home/zxia15/data_zxia15/russian-semantics/work/metadata_russian_documents.jsonl.gz'
DUMP_FILTERED_METADATA_DIR = '/home/zxia15/data_zxia15/russian-semantics/work/filtered_metadata_russian_documents.jsonl.gz'
metadata_df = pd.read_json(METADATA_DIR, lines=True)
metadata_df.head()

Unnamed: 0,title,htid,russian text ratio,author,year,form
0,"I͡Ubileĭnyĭ sbornik Literaturnago fonda, 1859-...",mdp.39015028061110,99.376453,,,
1,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a.",mdp.39015009128698,98.944547,"Merezhkovsky, Dmitry Sergeyevich,",1941.0,
2,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a.",pst.000001808827,98.550891,"Merezhkovsky, Dmitry Sergeyevich,",1941.0,
3,Poėty i poėtessy /,mdp.39015006976008,99.458652,"Aĭkhenvalʹd, I︠U︡. I.",1928.0,nonfiction
4,Poėty i poėtessy /,uc1.b3798780,99.17379,"Aĭkhenvalʹd, I︠U︡. I.",1928.0,nonfiction


## Filtering 01: Get Rid of all Instances without years

In [3]:
filted_stage1_metadata_df = metadata_df[~metadata_df['year'].isna()]

In [4]:
filted_stage1_metadata_df.head()

Unnamed: 0,title,htid,russian text ratio,author,year,form
1,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a.",mdp.39015009128698,98.944547,"Merezhkovsky, Dmitry Sergeyevich,",1941.0,
2,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a.",pst.000001808827,98.550891,"Merezhkovsky, Dmitry Sergeyevich,",1941.0,
3,Poėty i poėtessy /,mdp.39015006976008,99.458652,"Aĭkhenvalʹd, I︠U︡. I.",1928.0,nonfiction
4,Poėty i poėtessy /,uc1.b3798780,99.17379,"Aĭkhenvalʹd, I︠U︡. I.",1928.0,nonfiction
5,Robert Maĭer /,uc1.b4253266,98.923383,"Zami͡atin, Evgeniĭ Ivanovich,",1937.0,nonfiction


In [5]:
len(metadata_df), len(filted_stage1_metadata_df)

(89353, 35306)

In [9]:
json_str = filted_stage1_metadata_df.to_json(orient='records', lines=True)

# Compress the JSON Lines string and write to a file
with gzip.open(DUMP_FILTERED_METADATA_DIR, 'wt', encoding='utf-8') as f:
    f.write(json_str)

## Filtering 02: Get Rid of all repeated instances

In [11]:
filted_stage1_metadata_df = pd.read_json(DUMP_FILTERED_METADATA_DIR, lines=True)
filted_stage1_metadata_df.head()

Unnamed: 0,title,htid,russian text ratio,author,year,form
0,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a.",mdp.39015009128698,98.944547,"Merezhkovsky, Dmitry Sergeyevich,",1941,
1,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a.",pst.000001808827,98.550891,"Merezhkovsky, Dmitry Sergeyevich,",1941,
2,Poėty i poėtessy /,mdp.39015006976008,99.458652,"Aĭkhenvalʹd, I︠U︡. I.",1928,nonfiction
3,Poėty i poėtessy /,uc1.b3798780,99.17379,"Aĭkhenvalʹd, I︠U︡. I.",1928,nonfiction
4,Robert Maĭer /,uc1.b4253266,98.923383,"Zami͡atin, Evgeniĭ Ivanovich,",1937,nonfiction


In [34]:
# check the instances of repetitions
filted_stage1_metadata_df['title_plus_year'] = filted_stage1_metadata_df.apply(lambda x : x['title'] + ' -- ' + str(x['year']), axis=1) 
title_counts = filted_stage1_metadata_df['title_plus_year'].value_counts()

In [35]:
# check all titles redundant over a particular threshold
thresholds = [0, 2, 5, 10, 20, 30, 50, 100, 200, 500]

for threshold in thresholds:
    filtered_counts = title_counts[title_counts > threshold]
    print(f"the number of titles with repeated works over {threshold} is {len(filtered_counts)}")

the number of titles with repeated works over 0 is 21651
the number of titles with repeated works over 2 is 1816
the number of titles with repeated works over 5 is 624
the number of titles with repeated works over 10 is 253
the number of titles with repeated works over 20 is 91
the number of titles with repeated works over 30 is 49
the number of titles with repeated works over 50 is 22
the number of titles with repeated works over 100 is 6
the number of titles with repeated works over 200 is 0
the number of titles with repeated works over 500 is 0


In [38]:
def convert_transliterated_russian_to_english(translit_rus : str) -> str:
    translit_rus_arr = translit_rus.split(' -- ')
    translit_rus_text, translit_rus_year = translit_rus_arr[0], translit_rus_arr[1]
    cyrillic_text = translit(translit_rus_text, 'ru')
    translator = Translator()
    translated = translator.translate(cyrillic_text, src='ru', dest='en')
    return f'{translated.text} -- {translit_rus_year}'

In [40]:
# check titles with repetition over 50
title_with_20_plus_reps = title_counts[title_counts > 20].reset_index()
title_with_20_plus_reps['title in english'] = title_with_20_plus_reps['title_plus_year'].map(lambda x : convert_transliterated_russian_to_english(x))
title_with_20_plus_reps

Unnamed: 0,title_plus_year,count,title in english
0,Works. -- 1901,151,Works. -- 1901
1,Polnoe sobranīe sochinenīĭ. -- 1881,139,The full assembly is composed. -- 1881
2,Polnoe sobranīe sochinenīĭ / -- 1900,123,Full collected capital composed. -- 1900
3,"Polnoe sobranīe sochinenīĭ, -- 1889",113,The full assembled capital is composed. -- 1889
4,Works. -- 1902,102,Works. -- 1902
5,Works. -- 1906,101,Works. -- 1906
6,Polnoe sobranie sochinenii . -- 1881,99,Complete collection of essay. -- 1881
7,Works. -- 1898,98,Works. -- 1898
8,Works. -- 1921,91,Works. -- 1921
9,"Polnoe sobranīe sochinenīĭ, -- 1910",72,The full assembled capital is composed. -- 1910


In [45]:
potential_repetitions = [14, 24, 25, 27, 29, 32, 40, 42, 49, 51, 52, 56, 57, 58, 59, 63, 74, 75, 76, 78, 79, 84, 86, 88]
potential_repetition_names = title_with_20_plus_reps.iloc[potential_repetitions]['title_plus_year']
potential_repetition_names = list(potential_repetition_names)

In [46]:
def get_query_index(target : str) -> int:
    try:
        return potential_repetition_names.index(target)
    except ValueError:
        return -1

In [47]:
# do the first layer of filtering
filted_stage1_metadata_df['query_num'] = filted_stage1_metadata_df['title_plus_year'].map(lambda x : get_query_index(x))
filted_stage1_metadata_df.head()

Unnamed: 0,title,htid,russian text ratio,author,year,form,title_plus_year,query_num
0,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a.",mdp.39015009128698,98.944547,"Merezhkovsky, Dmitry Sergeyevich,",1941,,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a. -- 1941",-1
1,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a.",pst.000001808827,98.550891,"Merezhkovsky, Dmitry Sergeyevich,",1941,,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a. -- 1941",-1
2,Poėty i poėtessy /,mdp.39015006976008,99.458652,"Aĭkhenvalʹd, I︠U︡. I.",1928,nonfiction,Poėty i poėtessy / -- 1928,-1
3,Poėty i poėtessy /,uc1.b3798780,99.17379,"Aĭkhenvalʹd, I︠U︡. I.",1928,nonfiction,Poėty i poėtessy / -- 1928,-1
4,Robert Maĭer /,uc1.b4253266,98.923383,"Zami͡atin, Evgeniĭ Ivanovich,",1937,nonfiction,Robert Maĭer / -- 1937,-1


In [72]:
# get the list of all indexes with no need for repetition change
filtered_index_01 = filted_stage1_metadata_df[filted_stage1_metadata_df['query_num'] == -1].index
filtered_index_01

Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
       ...
       35296, 35297, 35298, 35299, 35300, 35301, 35302, 35303, 35304, 35305],
      dtype='int64', length=34579)

In [73]:
# for all the rest, find the instance with the largest russian ratio and put it in
filtered_index_02 = []
for i in range(len(potential_repetition_names)):
    idx = i + 1
    filtering_df = filted_stage1_metadata_df[filted_stage1_metadata_df['query_num'] == idx]
    print(f"querying over title {potential_repetition_names[i]}, which has {len(filtering_df)} instances")
    if len(filtering_df) == 0:
        continue
    index_of_max_russian_ratio = filtering_df['russian text ratio'].idxmax()
    print(f"found idx {index_of_max_russian_ratio} has most russian text ratio: {filtering_df[filtering_df.index == index_of_max_russian_ratio]['russian text ratio']}")
    filtered_index_02.append(index_of_max_russian_ratio)

querying over title Sobranīe romanov, povi︠e︡steĭ i razskazov P.A. Boborykina. -- 1921, which has 45 instances
found idx 19291 has most russian text ratio: 19291    99.786291
Name: russian text ratio, dtype: float64
querying over title Istorīi͡a Rossīi s drevni͡eĭshikh vremen / -- 1879, which has 43 instances
found idx 5419 has most russian text ratio: 5419    99.822103
Name: russian text ratio, dtype: float64
querying over title Istorīi͡a Rossīi s drevni͡eĭshikh vremen. -- 1879, which has 41 instances
found idx 20311 has most russian text ratio: 20311    99.767382
Name: russian text ratio, dtype: float64
querying over title Polnoe sobranie sochinenīĭ F. M. Dostoevskago -- 1881, which has 41 instances
found idx 21805 has most russian text ratio: 21805    99.50727
Name: russian text ratio, dtype: float64
querying over title Polnoe sobranie sochinenīĭ A.F. Pisemskago. -- 1881, which has 38 instances
found idx 33231 has most russian text ratio: 33231    99.635408
Name: russian text ra

In [74]:
len(filtered_index_01), len(filtered_index_02)
filtered_index_01 = list(filtered_index_01)
filtered_index_01.extend(filtered_index_02)
filted_stage2_metadata_df = filted_stage1_metadata_df.iloc[filtered_index_01]

In [75]:
filted_stage2_metadata_df.head()

Unnamed: 0,title,htid,russian text ratio,author,year,form,title_plus_year,query_num
0,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a.",mdp.39015009128698,98.944547,"Merezhkovsky, Dmitry Sergeyevich,",1941,,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a. -- 1941",-1
1,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a.",pst.000001808827,98.550891,"Merezhkovsky, Dmitry Sergeyevich,",1941,,"Gogolʹ; tvorchestvo, zhiznʹ i religii͡a. -- 1941",-1
2,Poėty i poėtessy /,mdp.39015006976008,99.458652,"Aĭkhenvalʹd, I︠U︡. I.",1928,nonfiction,Poėty i poėtessy / -- 1928,-1
3,Poėty i poėtessy /,uc1.b3798780,99.17379,"Aĭkhenvalʹd, I︠U︡. I.",1928,nonfiction,Poėty i poėtessy / -- 1928,-1
4,Robert Maĭer /,uc1.b4253266,98.923383,"Zami͡atin, Evgeniĭ Ivanovich,",1937,nonfiction,Robert Maĭer / -- 1937,-1


In [84]:
len(filted_stage2_metadata_df), len(filtered_index_01)

(34602, 34602)

In [85]:
json_str = filted_stage2_metadata_df.to_json(orient='records', lines=True)

# Compress the JSON Lines string and write to a file
with gzip.open(DUMP_FILTERED_METADATA_DIR, 'wt', encoding='utf-8') as f:
    f.write(json_str)