In [47]:
import multiprocessing
from pathlib import Path
import shutil
import os
import fileinput
import re
import time
import pandas as pd

In [9]:
def info(title):
    print(title)
    print('module name:', __name__)
    print('parent process:', os.getppid())
    print('process id:', os.getpid())

def f(name):
    info('function f')
    print('hello', name)

if __name__ == '__main__':
    info('main line')
    p = Process(target=f, args=('bob',))
    p.start()
    p.join()
    print('after join')

main line
module name: __main__
parent process: 5292
process id: 5512
after join
function f
module name: __main__
parent process: 5512
process id: 18001
hello bob


In [50]:
def freqs(in_file, out_path):

    freqs = {}
    
    file_name = os.path.basename(in_file)

    with open(in_file, 'r') as in_file:
        text = in_file.readlines()

        for line in text:
            line = line.split()
            for word in line:
                if word not in freqs.keys():
                    freqs[word] = 1
                else:
                    freqs[word] += 1

    freqs = dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))
    df = pd.DataFrame(freqs.items(), columns=['Lemma', 'Count'])

    with open(f"{out_path}/freqs_{file_name}.tsv", 'w+') as out_file:
        df.to_csv(out_file, sep='\t')

    return freqs

def split(in_file, out_path, split_len):
    
    file_name = os.path.basename(in_file)
    save_dir = f'{out_path}/{file_name}_parts'
    
    if os.path.isdir(save_dir):
        shutil.rmtree(save_dir)
    
    os.mkdir(save_dir)
       
    with open(in_file, 'r') as in_file:
        
        data = in_file.readlines()
        num_lines = len(data)
    
        text_data = []
        file_count = 0

        for sample in data:
            sample = sample.replace('\n', '')
            text_data.append(sample)
            if len(text_data) == split_len:
                save_path = f'{save_dir}/{file_name}_{file_count}.txt'
                
                with open(save_path, 'w+', encoding='utf-8') as fp:
                    fp.write('\n'.join(text_data))

                text_data = []
                file_count += 1
    
    paths = [str(x) for x in Path(f'{out_path}/{file_name}_parts').glob('**/*.txt')]
    return paths

def find_rare(freqs, threshold):

    rare = []
    for key,value in freqs.items():
        if value <= threshold:
            rare.append(key)
    rare_regex_string = ' '+' | '.join(rare)+' '
    rare_regex = re.compile(rare_regex_string)
    
    return rare_regex

def filter_rare(in_file, file_code, rare_regex):
    start = time.time()
    print(f"{file_code} - {os.getpid()} Filtering words from {in_file}")
    
    with fileinput.FileInput(in_file, inplace=True, backup='.bak') as file:
        for line in file:
            print(rare_regex.sub(' ', line), end='')
            
    print(f"{file_code} - {os.getpid()} end filtering {time.time()-start}")
    
def join_files(in_dir, out_path, file_name):
    files = [str(x) for x in Path(f'{in_dir}').glob('**/*.txt')]
    
    if os.path.isfile(f"{out_path}/joined_{file_name}.txt"):
        os.remove(f"{out_path}/joined_{file_name}.txt")
    
    with open(f"{out_path}/joined_{file_name}.txt", 'w+', encoding='utf-8') as out_file:
        for path in files:
            with open(path, 'r') as in_file:
                for line in in_file:
                    out_file.write(line)
                       
def run(in_file, out_path, split_len, threshold):
    freqs_dict = freqs(in_file, out_path)
    rare_regex = find_rare(freqs_dict, threshold)
    splits = split(in_file, out_path, split_len)
    
    pool = multiprocessing.Pool() #use all available cores, otherwise specify the number you want as an argument
    i=0
    for path in splits:
        pool.apply_async(filter_rare, args=(path, i, rare_regex))
        i+=1
    pool.close()
    pool.join()
    
    file_name = os.path.basename(in_file)
    save_dir = f'{out_path}/{file_name}_parts'
    
    join_files(save_dir, out_path, file_name)

In [51]:
run('/home/edo/projects/usage_change_ITA/data/days_2020/processed/days_2020_sentencesL_prepost.txt', './test', 5000, 10)

1 - 5363 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_169.txt
2 - 5364 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_125.txt
0 - 5362 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_134.txt
3 - 5365 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_188.txt
10 - 5372 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_119.txt
4 - 5366 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_141.txt
13 - 5375 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_47.txt
12 - 5374 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_183.txt
6 - 5368 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days

Process ForkPoolWorker-11:
Process ForkPoolWorker-4:
Process ForkPoolWorker-5:
Process ForkPoolWorker-16:
Process ForkPoolWorker-14:
Process ForkPoolWorker-10:
Process ForkPoolWorker-1:
Process ForkPoolWorker-3:
Process ForkPoolWorker-7:
Process ForkPoolWorker-12:
Process ForkPoolWorker-8:
Process ForkPoolWorker-2:
Process ForkPoolWorker-6:
Process ForkPoolWorker-9:
Process ForkPoolWorker-15:
Process ForkPoolWorker-13:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/edo/miniconda3/envs/align/lib/python3.

  File "/tmp/ipykernel_3106/3638725342.py", line 76, in filter_rare
    print(rare_regex.sub(' ', line), end='')
  File "/tmp/ipykernel_3106/3638725342.py", line 76, in filter_rare
    print(rare_regex.sub(' ', line), end='')
  File "/tmp/ipykernel_3106/3638725342.py", line 76, in filter_rare
    print(rare_regex.sub(' ', line), end='')
  File "/tmp/ipykernel_3106/3638725342.py", line 76, in filter_rare
    print(rare_regex.sub(' ', line), end='')
  File "/tmp/ipykernel_3106/3638725342.py", line 76, in filter_rare
    print(rare_regex.sub(' ', line), end='')
  File "/tmp/ipykernel_3106/3638725342.py", line 76, in filter_rare
    print(rare_regex.sub(' ', line), end='')
  File "/tmp/ipykernel_3106/3638725342.py", line 76, in filter_rare
    print(rare_regex.sub(' ', line), end='')
  File "/tmp/ipykernel_3106/3638725342.py", line 76, in filter_rare
    print(rare_regex.sub(' ', line), end='')
KeyboardInterrupt
  File "/tmp/ipykernel_3106/3638725342.py", line 76, in filter_rare
    print(

KeyboardInterrupt: 

32 - 6006 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_105.txt
33 - 6007 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_8.txt
35 - 6009 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_199.txt
34 - 6008 Filtering words from test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_91.txt


Process ForkPoolWorker-17:
Process ForkPoolWorker-20:
Traceback (most recent call last):
Process ForkPoolWorker-18:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-19:
  File "/home/edo/miniconda3/envs/align/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/edo/miniconda3/envs/align/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/edo/miniconda3/envs/align/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/edo/miniconda3/envs/align/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/edo/miniconda3/envs/align/lib/python3.8/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/tmp/ipykernel_3106/3638725342.py", line 76, in filter_rare
    print(rare_regex.sub(' ', line), end='')
KeyboardInterrupt
Tra

In [32]:
filter_rare(
    '/home/edo/projects/usage_change_ITA/test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_2.txt',
    'documento|pagare')

Filtering words from /home/edo/projects/usage_change_ITA/test/days_2020_sentencesL_prepost.txt_parts/days_2020_sentencesL_prepost.txt_2.txt. Pid: 3106
