In [1]:
import spacy
import shutil
import os
import random
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def split(in_file, out_path, split_len):
    
    file_name = os.path.basename(in_file)
    save_dir = f'{out_path}/{file_name}_parts'
    
    if os.path.isdir(save_dir):
        shutil.rmtree(save_dir)
    
    os.mkdir(save_dir)
       
    with open(in_file, 'r') as in_file:
        
        data = in_file.readlines()
        num_lines = len(data)
    
        text_data = []
        file_count = 0

        for sample in data:
            sample = sample.replace('\n', '')
            text_data.append(sample)
            if len(text_data) == split_len:
                save_path = f'{save_dir}/{file_name}_{file_count}.txt'
                
                with open(save_path, 'w+', encoding='utf-8') as fp:
                    fp.write('\n'.join(text_data))

                text_data = []
                file_count += 1
    
    paths = [str(x) for x in Path(f'{out_path}/{file_name}_parts').glob('**/*.txt')]
    return paths

def lemmatize(in_file, out_dir):
    load_model = spacy.load('it_core_news_lg', disable=['parser', 'ner'])
    
    file_name = os.path.basename(in_file)
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)
    
    with open(in_file, 'r') as in_file, open(f'{out_dir}/{file_name}.lemmas', 'w+') as out_file:
        text = in_file.read()
        doc = load_model(text)
        lemmas = ' '.join([token.lemma_ for token in doc])
        print(lemmas, file=out_file)
    
def join_files(in_dir, out_path, file_name):
    files = [str(x) for x in Path(f'{in_dir}').glob('**/*.lemmas')]
    
    if os.path.isfile(f"{out_path}/joined_{file_name}.txt"):
        os.remove(f"{out_path}/joined_{file_name}.txt")
    
    with open(f"{out_path}/joined_{file_name}.txt", 'w+', encoding='utf-8') as out_file:
        for path in files:
            with open(path, 'r') as in_file:
                for line in in_file:
                    out_file.write(line)
                os.remove(path)
                
def run(in_file, out_path, file_name, split_len):
    s = split(in_file, out_path, split_len)
    
    for p in s:
        lemmatize(p, out_path)
        print(p)
    join_files(out_path, out_path, file_name)

In [5]:
run('../data/2020_clean/days_2020/days_2020_clean.txt', '../data/2020_clean/days_2020/', '2020', 5000)

../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_358.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_108.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_99.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_235.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_342.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_43.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_339.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_163.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_79.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_357.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_176.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_351.txt
../data

../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_347.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_97.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_39.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_341.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_353.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_333.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_256.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_230.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_211.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_212.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_77.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_277.txt
../data

../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_173.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_219.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_179.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_150.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_37.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_278.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_161.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_157.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_175.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_318.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_29.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_282.txt
../dat

../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_17.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_316.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_28.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_294.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_201.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_133.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_258.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_122.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_308.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_355.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_0.txt
../data/2020_clean/days_2020/days_2020_clean.txt_parts/days_2020_clean.txt_332.txt
../data/

In [2]:
class LemmaTester:
    def __init__(self, year, n_examples, n_iterations):
        self.year = year
        self.n_examples = n_examples
        self.n_iterations = n_iterations
        
    def load_text(self):
        with open(f'../data/{self.year}_clean/days_{self.year}/days_{self.year}_spacy.txt', 'r') as in_file:
            text = in_file.read()
            text = text.split()
            
            return text
    def collect_examples(self, text):
        indexes = random.sample(range(len(text)), self.n_examples)
        examples = [text[index] for index in indexes]
        
        return examples
        
    def compare(self):
        with open('../data/parole.txt', 'r') as dictionary:
            dictionary = dictionary.readlines()
            unique = [word.strip('\n') for word in dictionary]
            text = self.load_text()
            examples = self.collect_examples(text)
            results = []
            errors = []
            for example in examples:
                if example in unique:
                    results.append(1)
                else:
                    results.append(0)
                    with open('../data/lemma_errors.txt', 'a+') as errors:
                        print(example, file=errors)
            perc = (sum(results)/self.n_examples)*100
        
        return perc
  
    def run(self):
                
        seeds = random.sample(range(9999), self.n_iterations)
        iterations = []
        
        for seed in seeds:
            random.seed(seed)
            iterations.append(self.compare())
        
        average = (sum(iterations)/len(iterations))
        
        print (f"Values for {self.n_iterations}:")
        print(iterations)
        print (f"Average value: {average}")
        
       

In [6]:
LemmaTester(2019, 100, 10).run()

Values for 10:
[85.0, 92.0, 87.0, 87.0, 81.0, 82.0, 90.0, 82.0, 89.0, 89.0]
Average value: 86.4
