In [3]:
import os
import re
import shutil
from pathlib import Path

import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
def split(in_file, out_path, split_len):
    """Split a text file into numbered chunk files of at most ``split_len`` lines.

    Chunks are written to ``{out_path}/{basename}_parts/{basename}_{index}.txt``
    (UTF-8, lines joined with ``\\n``, no trailing newline), where ``basename``
    is the base name of ``in_file``. An existing parts directory is deleted
    first so stale chunks from a previous run never leak into the result.

    Args:
        in_file: Path of the text file to split (read with the platform
            default encoding).
        out_path: Directory under which the ``*_parts`` directory is created;
            missing parent directories are created as well.
        split_len: Maximum number of lines per chunk; must be >= 1.

    Returns:
        The chunk paths as strings, in chunk order (index 0 first). The last
        chunk may hold fewer than ``split_len`` lines. An empty input file
        yields an empty list.

    Raises:
        ValueError: If ``split_len`` is smaller than 1.
    """
    if split_len < 1:
        raise ValueError(f'split_len must be >= 1, got {split_len!r}')

    file_name = os.path.basename(in_file)
    save_dir = Path(f'{out_path}/{file_name}_parts')

    if save_dir.is_dir():
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    paths = []
    chunk = []

    def write_chunk():
        # The chunk index is simply the number of chunks written so far.
        save_path = save_dir / f'{file_name}_{len(paths)}.txt'
        with open(save_path, 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(chunk))
        paths.append(str(save_path))
        chunk.clear()

    # Stream the input instead of loading every line into memory at once.
    with open(in_file, 'r') as source:
        for line in source:
            chunk.append(line.replace('\n', ''))
            if len(chunk) == split_len:
                write_chunk()

    # Flush the trailing partial chunk; without this the last
    # (line count % split_len) lines of the input would be dropped.
    if chunk:
        write_chunk()

    # Return the paths in write order rather than re-globbing the directory,
    # whose iteration order is not guaranteed.
    return paths

# Loaded spaCy pipelines, keyed by model name, so repeated lemmatize() calls
# (one per chunk) do not pay the model-loading cost every time.
_SPACY_PIPELINES = {}


def _get_pipeline(model_name='it_core_news_lg'):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name, disable=['parser', 'ner'])
    return _SPACY_PIPELINES[model_name]


def lemmatize(in_file, out_dir):
    """Lemmatize a text file with spaCy's ``it_core_news_lg`` model.

    The whole file is processed as one document and the lemmas of all tokens,
    joined by single spaces and followed by a newline, are written to
    ``{out_dir}/{basename(in_file)}.lemmas``.

    Args:
        in_file: Path of the UTF-8 text file to lemmatize.
        out_dir: Directory for the ``.lemmas`` file; created if missing.
    """
    nlp = _get_pipeline()

    file_name = os.path.basename(in_file)
    os.makedirs(out_dir, exist_ok=True)

    # Chunk files produced by split() are written as UTF-8, so read them back
    # as UTF-8 instead of relying on the platform default encoding.
    with open(in_file, 'r', encoding='utf-8') as source:
        text = source.read()

    doc = nlp(text)
    lemmas = ' '.join(token.lemma_ for token in doc)

    # Open the output only after processing succeeded, so a failure does not
    # leave an empty .lemmas file behind. The platform default encoding is
    # kept here because join_files() reads these files back the same way.
    with open(f'{out_dir}/{file_name}.lemmas', 'w') as out_file:
        print(lemmas, file=out_file)
    
def join_files(in_dir, out_path, file_name):
    """Concatenate every ``*.lemmas`` file under ``in_dir`` into one output file.

    The inputs are found recursively, concatenated in natural (numeric-aware)
    path order so that ``..._2.txt.lemmas`` precedes ``..._10.txt.lemmas``,
    written to ``{out_path}/joined_{file_name}.txt`` as UTF-8, and each input
    is deleted once it has been copied. An existing output file is overwritten.

    Args:
        in_dir: Directory searched recursively for ``*.lemmas`` files.
        out_path: Directory that receives the joined file.
        file_name: Label used in the output name ``joined_{file_name}.txt``.
    """
    def natural_key(path):
        # re.split with a capturing group alternates text / digit-run pieces,
        # so odd positions are always digit runs and compare as integers.
        pieces = re.split(r'(\d+)', path)
        return [int(piece) if idx % 2 else piece for idx, piece in enumerate(pieces)]

    # glob() yields entries in arbitrary order; sort so chunks are joined in
    # the order they were split.
    files = sorted(
        (str(x) for x in Path(f'{in_dir}').glob('**/*.lemmas')),
        key=natural_key,
    )

    # Mode 'w' truncates an existing file, so no explicit removal is needed.
    with open(f"{out_path}/joined_{file_name}.txt", 'w', encoding='utf-8') as out_file:
        for path in files:
            with open(path, 'r') as in_file:
                out_file.writelines(in_file)
            # Delete only after the handle is closed: removing a file that is
            # still open fails on Windows.
            os.remove(path)
                
def run(in_file, out_path, file_name, split_len):
    """Run the full pipeline: split ``in_file``, lemmatize each chunk, join the results.

    Args:
        in_file: Path of the text file to process.
        out_path: Directory used for the chunk files, the per-chunk ``.lemmas``
            files and the final joined output.
        file_name: Label passed to ``join_files`` for naming the joined file.
        split_len: Number of lines per chunk, passed to ``split``.
    """
    chunk_paths = split(in_file, out_path, split_len)

    for chunk_path in chunk_paths:
        lemmatize(chunk_path, out_path)

    # The .lemmas files were written into out_path, so it serves as both the
    # search directory and the destination of the joined file.
    join_files(out_path, out_path, file_name)

In [11]:
# Process ./test/days_2020_clean.txt in 5000-line chunks; the joined lemmas
# end up in ./test/joined_2020.txt.
run(
    in_file='./test/days_2020_clean.txt',
    out_path='./test',
    file_name='2020',
    split_len=5000,
)