In [1]:
import multiprocessing
from pathlib import Path
import shutil
import os
import fileinput
import re
import time
import pandas as pd

In [9]:
def info(title):
    print(title)
    print('module name:', __name__)
    print('parent process:', os.getppid())
    print('process id:', os.getpid())

def f(name):
    info('function f')
    print('hello', name)

if __name__ == '__main__':
    info('main line')
    p = Process(target=f, args=('bob',))
    p.start()
    p.join()
    print('after join')

main line
module name: __main__
parent process: 5292
process id: 5512
after join
function f
module name: __main__
parent process: 5512
process id: 18001
hello bob


In [16]:
def freqs(in_file, out_path):

    freqs = {}
    
    file_name = os.path.basename(in_file)

    with open(in_file, 'r') as in_file:
        text = in_file.readlines()

        for line in text:
            line = line.split()
            for word in line:
                if word not in freqs.keys():
                    freqs[word] = 1
                else:
                    freqs[word] += 1

    freqs = dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))
    df = pd.DataFrame(freqs.items(), columns=['Lemma', 'Count'])

    with open(f"{out_path}/freqs_{file_name}.tsv", 'w+') as out_file:
        df.to_csv(out_file, sep='\t')

    return freqs

def split(in_file, out_path, split_len):
    
    file_name = os.path.basename(in_file)
    save_dir = f'{out_path}/{file_name}_parts'
    
    if os.path.isdir(save_dir):
        shutil.rmtree(save_dir)
    
    os.mkdir(save_dir)
       
    with open(in_file, 'r') as in_file:
        
        data = in_file.readlines()
        num_lines = len(data)
    
        text_data = []
        file_count = 0

        for sample in data:
            sample = sample.replace('\n', '')
            text_data.append(sample)
            if len(text_data) == split_len:
                save_path = f'{save_dir}/{file_name}_{file_count}.txt'
                
                with open(save_path, 'w+', encoding='utf-8') as fp:
                    fp.write('\n'.join(text_data))

                text_data = []
                file_count += 1
    
    paths = [str(x) for x in Path(f'{out_path}/{file_name}_parts').glob('**/*.txt')]
    return paths

def find_rare(freqs, threshold, custom=[]):

    rare = []
    for key,value in freqs.items():
        if value <= threshold:
            rare.append(key)
        for word in custom:
            rare.append(word)
    rare_regex_string = '[^a-zA-Z0-9]'+' '+' | '.join(rare)+' '
    rare_regex = re.compile(rare_regex_string)
    
    return rare_regex

def filter_rare(in_file, file_code, rare_regex):
    start = time.time()
    print(f"{file_code} - {os.getpid()} Filtering words from {in_file}")
    
    with fileinput.FileInput(in_file, inplace=True, backup='.bak') as file:
        for line in file:
            print(rare_regex.sub(' ', line), end='')
            
            
    print(f"{file_code} - {os.getpid()} end filtering {time.time()-start}")
    
def join_files(in_dir, out_path, file_name):
    files = [str(x) for x in Path(f'{in_dir}').glob('**/*.txt')]
    
    if os.path.isfile(f"{out_path}/joined_{file_name}.txt"):
        os.remove(f"{out_path}/joined_{file_name}.txt")
    
    with open(f"{out_path}/joined_{file_name}.txt", 'w+', encoding='utf-8') as out_file:
        for path in files:
            with open(path, 'r') as in_file:
                for line in in_file:
                    out_file.write(line)
                       
def run(in_file, out_path, split_len, threshold, custom=[]):
    freqs_dict = freqs(in_file, out_path)
    rare_regex = find_rare(freqs_dict, threshold, custom)
    splits = split(in_file, out_path, split_len)
    
    pool = multiprocessing.Pool(16) #use all available cores, otherwise specify the number you want as an argument
    i=0
    for path in splits:
        pool.apply_async(filter_rare, args=(path, i, rare_regex))
        i+=1
    pool.close()
    pool.join()
    
    file_name = os.path.basename(in_file)
    save_dir = f'{out_path}/{file_name}_parts'
    
    join_files(save_dir, out_path, file_name)

In [3]:
custom = ['e', '#x200b']

run('/home/edo/projects/usage_change_ITA/data/days_2019/processed/days_2019_sentencesL.txt', './test', 5000, 15)
run('/home/edo/projects/usage_change_ITA/data/days_2020/processed/days_2020_sentencesL.txt', './test', 5000, 15)

0 - 3449 Filtering words from test/days_2019_sentencesL.txt_parts/days_2019_sentencesL.txt_0.txt
1 - 3450 Filtering words from test/days_2019_sentencesL.txt_parts/days_2019_sentencesL.txt_1.txt
1 - 3450 end filtering 8641.696554899216
0 - 3449 end filtering 8719.27451634407
1 - 4076 Filtering words from test/days_2020_sentencesL.txt_parts/days_2020_sentencesL.txt_1.txt
2 - 4077 Filtering words from test/days_2020_sentencesL.txt_parts/days_2020_sentencesL.txt_0.txt
0 - 4075 Filtering words from test/days_2020_sentencesL.txt_parts/days_2020_sentencesL.txt_2.txt
0 - 4075 end filtering 16079.07299733162
1 - 4076 end filtering 16155.073775291443
2 - 4077 end filtering 16242.472791433334


In [7]:
test_regex = re.compile('[^a-zA-Z0-9]'+' villafalletto| bartolomeo')

filter_rare(
    './test/test.txt', 0,
    test_regex)

0 - 5062 Filtering words from ./test/test.txt
0 - 5062 end filtering 0.0017948150634765625


In [18]:
custom = ['e', '#x200b', 'deleted', 'removed']

run('./test/test.txt', './test', 10, 0, custom)

0 - 5844 Filtering words from test/test.txt_parts/test.txt_1.txt1 - 5845 Filtering words from test/test.txt_parts/test.txt_0.txt

1 - 5845 end filtering 0.014800786972045898
0 - 5844 end filtering 0.02779865264892578
