In [1]:
from spacy.lang.ru import Russian
import pandas as pd

In [None]:
# Russian language pipeline with the 'sentencizer' component added.
# `process()` calls this object on raw text and iterates `doc.sents`.
nlp = Russian()
nlp.add_pipe('sentencizer')

In [2]:
# Separate Russian() language object; in the visible cells it is used only
# to obtain `tokenizer`, which the counting cells call directly on each
# sentence/line.
rus = Russian()
tokenizer = rus.tokenizer

In [3]:
def process_txt(x):
    """Return `x` with every newline character removed.

    A string that is exactly one newline (a blank line) is not cleaned:
    the function returns None for it.
    """
    if x == '\n':
        return None
    return x.replace('\n', '')

In [3]:
def process(section):
    """Split `section` into sentences using the module-level `nlp` pipeline.

    Each sentence is converted to a string and has its newline characters
    removed; sentences that end up empty are dropped.

    Returns a list in which every remaining sentence is wrapped in its own
    single-element list, e.g. [['First.'], ['Second.']].
    """
    stripped = (str(span).replace('\n', '') for span in nlp(section).sents)
    return [[text] for text in stripped if text]

In [4]:
def read_in_chunks_lines(file_object, chunk_size=5000000):
    """Lazily yield batches of lines from `file_object`.

    Each batch is the list returned by `file_object.readlines(chunk_size)`,
    i.e. `chunk_size` (default 5000000) is passed as the readlines size
    hint. Iteration stops at the first empty batch.
    """
    batch = file_object.readlines(chunk_size)
    while batch:
        yield batch
        batch = file_object.readlines(chunk_size)

In [9]:
%%time
# Corpus-wide totals for the CSV source, accumulated chunk by chunk.
words = 0
sents = 0
words_wo_punct = 0
words_wo_stops = 0
words_wo_stops_punct = 0

# The path is a raw string: the non-raw form relied on '\g' being an
# unrecognised escape sequence, which Python warns about. The runtime
# value of the path is unchanged.
for chunk in pd.read_csv(r'Исходники\geo_wac1.csv', header=None, names=['Text'], encoding='utf-8',
                         low_memory=True, chunksize=1000000):
    chunk = chunk.dropna()
    df_geo = chunk.Text.to_list()

    # process() returns one single-element list per sentence; unwrap them
    # while collecting so geo_sents is a flat list of sentence strings.
    geo_sents = []
    for text in df_geo:
        geo_sents.extend(item for sublist in process(text) for item in sublist)
    sents += len(geo_sents)

    for piece in geo_sents:
        if piece:
            doc = tokenizer(piece)
            # len(doc) is the token count; no need to build a list of texts.
            words += len(doc)
            # One pass over the tokens updates all three filtered counters.
            for token in doc:
                if not token.is_punct:
                    words_wo_punct += 1
                if not token.is_stop:
                    words_wo_stops += 1
                    if not token.is_punct:
                        words_wo_stops_punct += 1

print(f'Subtotal: {sents} sentences before deduplication')
print(f'Subtotal: {words} words before deduplication')
print(f'Subtotal: {words_wo_punct} tokens without punctuation')
print(f'Subtotal: {words_wo_stops} tokens without stops')
print(f'Subtotal: {words_wo_stops_punct} tokens without stops and punctuation')

Subtotal: 19440995 sentences before deduplication
Subtotal: 323572103 words before deduplication
Subtotal: 268214037 tokens without punctuation
Subtotal: 244854196 tokens without stops
Subtotal: 189496130 tokens without stops and punctuation
Wall time: 1h 32min 15s


In [10]:
# Drop the per-chunk objects left over from the CSV counting cell.
del chunk, geo_sents, df_geo

In [6]:
%%time
# Totals for the text file, where every non-blank line is counted as one
# sentence.
words = 0
sents = 0
words_wo_punct = 0
words_wo_stops = 0
words_wo_stops_punct = 0

# The path is a raw string: the non-raw form relied on '\w' being an
# unrecognised escape sequence, which Python warns about. The runtime
# value of the path is unchanged.
with open(r'Ready\wiki3.txt', encoding='utf-8') as f1:
    # `lines` is one batch of raw lines; it is named differently from the
    # inner loop variable so the two loops no longer share a name.
    for lines in read_in_chunks_lines(f1):
        geo_sents = [process_txt(x) for x in lines if x != '\n']
        sents += len(geo_sents)

        for piece in geo_sents:
            if piece:
                doc = tokenizer(piece)
                # len(doc) is the token count; no need to build a list of texts.
                words += len(doc)
                # One pass over the tokens updates all three filtered counters.
                for token in doc:
                    if not token.is_punct:
                        words_wo_punct += 1
                    if not token.is_stop:
                        words_wo_stops += 1
                        if not token.is_punct:
                            words_wo_stops_punct += 1

print(f'Subtotal: {sents} sentences after deduplication')
print(f'Subtotal: {words} words after deduplication')
print(f'Subtotal: {words_wo_punct} tokens without punctuation')
print(f'Subtotal: {words_wo_stops} tokens without stops')
print(f'Subtotal: {words_wo_stops_punct} tokens without stops and punctuation')

Subtotal: 5244463 sentences after deduplication
Subtotal: 106463131 words after deduplication
Subtotal: 80912566 tokens without punctuation
Subtotal: 92835325 tokens without stops
Subtotal: 67284760 tokens without stops and punctuation
Wall time: 8min 14s
