In [1]:
import logging
import time
import os
import shutil
import re
from nltk.corpus import stopwords
import stanza
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import clear_output



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class Process():
    def __init__(self, year):
        self.year = year

    def clean(self, user_stopwords):
        

        directory = f'./data/days_{self.year}/raw'

        file_list =[]

        for entry in os.scandir(directory):
            if entry.path.endswith("_{}.txt".format(self.year)):
                file_list.append(entry)

        if os.path.isdir(f'./data/days_{self.year}/clean'):
            shutil.rmtree(f'./data/days_{self.year}/clean')
            os.mkdir(f'./data/days_{self.year}/clean')

        init_time = time.time()
        i=0
        for file in file_list:
            start_time = time.time()

            file_name = str(os.path.basename(file)).strip('.txt')
            print('Processing file: {}'.format(str(os.path.basename(file))))
            with open(file,'r', encoding = 'utf-8') as in_file:
                text = in_file.readlines()
                
                stopw = ' '+' |'.join(stopwords.words('italian'))
                stopw_regex = re.compile
                for line in text:
                    
                    #line = line.lower() #lowercasing
                    line = re.sub(r'http/S+', ' URL', line) #Removes URLs
                    #line = re.sub(r'[!"”$%&()*+,-.//:;<=>?@/[/]^_`{|}~…»•😀❤️😀🤔🤣😭😅🙄😉—]', '', line)   #Removes [] and other special chars
                    #line = re.sub(r'#x200B', '', line)
                    z=1

                    line=line.split()
                    for word in line:
                        if word in stopwords.words('italian') or word in user_stopwords:
                            line.remove(word)
                        if len(word)>24:
                            line.insert(line.index(word),'LONG')
                            line.remove(word)
                        if z%3000==0: #provides buffer for RAM usage
                            text.insert(z, '/n/n')
                        z+=1   

                    line = ' '.join(line)
                    with open('./data/days_{}/clean/{}.txt'.format(self.year, file_name), 'a+', encoding='utf-8') as out_file:
                        print (line, file=out_file)
                    
            i+=1
            elapsed_time = time.time() - start_time
            print ('Processed {} in {} - {}/{}'.format(file_name+'.txt', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)), i, len(file_list)))
            
        total_time = time.time() - init_time
        print ('Processed all {} of {} files in {}'.format(i, len(file_list), time.strftime("%H:%M:%S", time.gmtime(total_time))))

    def lemmatize(self):

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

        directory = f'./data/days_{self.year}/clean'
        processed_dir= f'./data/days_{self.year}/processed/processed_days'

        file_list = []
        done_list = []

        for entry in os.scandir(directory):
            if entry.path.endswith("_clean.txt"):
                file_list.append(entry)

        for entry in os.scandir(processed_dir):
            if entry.path.endswith("sentencesL.txt"):
                entry_name = str(os.path.basename(entry).strip('sentencesL.txt')[:-1])
                done_list.append(entry_name)

        for done_entry in done_list:
            done_path = done_entry+'.txt'
            for entry in file_list:
                if entry.path.endswith(done_path):
                    file_list.remove(entry)

        error = False
        for f in done_list:
            if f in file_list:
                error = True
                print(f, 'IN LIST! ERROR!')
        if error:
            quit()
        else:
            print("Lists OK!")

        print (len(file_list))

        if len(file_list) == 0:
            print('ALL DONE!')
        else:
            init_time = time.time()

            nlp = stanza.Pipeline(lang='it', processors='tokenize, mwt, pos, lemma', tokenize_batch_size=25, mwt_batch_size=25, pos_batch_size=250, lemma_batch_size=25, lemma_max_dec_len=25, logging_level='WARN', use_gpu=True)
            i=0
            for file in file_list:
                with open(file, 'r', encoding='utf-8') as in_file:

                    start_time = time.time()
                    file_name = str(os.path.basename(file)[:-4])
                    print('Processing file: {}'.format(file_name+'.txt'))
                    text = in_file.read()
                    doc = nlp(text)

                    with open('./data/days_{}/processed/processed_days/{}_sentencesL.txt'.format(self.year, file_name), 'w+', encoding = 'utf-8') as out_file:

                        for s in doc.sentences:
                            s_list = []
                            for w in s.words:
                                s_list.append(w.text)

                            print (" ".join(s_list), file=out_file)
                            print ("\n", file=out_file)

                i+=1
                elapsed_time = time.time() - start_time

                print ('Processed {} in {} - {}/{}'.format(file_name+'.txt', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)), i, len(file_list)))
                clear_output(wait=True)
                
            total_time = time.time() - init_time
            print ('Processed all {} of {} files in {}'.format(i, len(file_list), time.strftime("%H:%M:%S", time.gmtime(total_time))))

    def preprocess(self, user_stopwords):
        self.clean(user_stopwords)
        self.lemmatize()
        self.join()
        
    def join(self):
        directory = f'./data/days_{self.year}/processed'

        file_list =[]

        for entry in os.scandir(directory):
            if entry.path.endswith("{}_clean_sentencesL.txt".format(self.year)):
                file_list.append(entry)

        if os.path.isfile(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL.txt'):
            os.remove(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL.txt')

        i=0
        for file in file_list:

            with open(file, 'r', encoding='utf-8') as small_file:
                text = small_file.read().lower()
                with open(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL.txt', 'a+', encoding='utf-8') as big_file:
                    print (text, file=big_file)
                    print ('/n/n', file=big_file)
            i+=1
            print ('Processed {} of {} files'.format(i, len(file_list)))
   
    def freqs(self):

        freqs = {}

        with open(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL.txt', 'r') as in_file:
            text = in_file.readlines()

            for line in text:
                line = line.split()
                for word in line:
                    if word not in freqs.keys():
                        freqs[word] = 1
                    else:
                        freqs[word] += 1

        freqs = dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))
        df = pd.DataFrame(freqs.items(), columns=['Lemma', 'Count'])

        with open(f"./data/days_{self.year}/freqs_{self.year}.tsv", 'w+') as out_file:
            df.to_csv(out_file, sep='\t')

        return freqs
    
    def chunks(self, lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]
    
    def prepostprocess(self):
        with open(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL.txt', 'r') as in_file:
            text = in_file.readlines()
    
        if os.path.isfile(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL_prepost.txt'):
            os.remove(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL_prepost.txt')
            
            with open(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL_prepost.txt', 'a+') as out_file:
                for line in text:
                    for chunk in self.chunks(line.split(), 25):
                        print(' '.join(chunk), file=out_file)

    def filter_rare(self, threshold):
        
        freqs = self.freqs()
        
        #unique_file = open('./data/parole.txt', 'r')
        #unique = unique_file.readlines()

        rare = [key for key,value in freqs.items() if value <= threshold]
                
        print(f"Filtering {len(rare)} words with freq lower than {threshold}")
        
        with open(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL_prepost.txt', 'r') as in_file:
            text = in_file.readlines()
            
            if os.path.isfile(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL_post.txt'):
                os.remove(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL_post.txt')
                
            with open(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL_post.txt', 'a+') as out_file:
                i = 0
                for line in text:
                    start = time.time()
                    print(f"Filtering line {i} out of {len(text)}")
                    for word in line.split():
                        if (word in rare): # and (word not in unique): # Too long
                            line = re.sub(word, '', line)
                    print(line, file=out_file)
                    end = time.time()
                    print(time.strftime("%H:%M:%S", time.gmtime(end-start)))
                    if i%10 == 0:
                        clear_output(wait=True)
                        print(f"Filtering {len(rare)} words with freq lower than {threshold}")
                    i+=1
                    
    def postprocess(self, rare_threshold):
        self.filter_rare(rare_threshold)   
    

In [13]:
import spacy
import os

def lemmatize(in_file):
    load_model = spacy.load('it_core_news_lg', disable=['parser', 'ner'])
    
    file_name = os.path.basename(in_file)
    
    with open(in_file, 'r') as in_file, open(f'{file_name}.lemmas', 'w+') as out_file:
        text = in_file.read()
        doc = load_model(text)
        lemmas = ' '.join([token.lemma_ for token in doc])
        print(lemmas, file=out_file)

In [14]:
lemmatize('./test/raw.txt')

In [69]:
import random

In [84]:
class LemmaTester:
    def __init__(self, year, n_examples, n_iterations):
        self.year = year
        self.n_examples = n_examples
        self.n_iterations = n_iterations
        
    def load_text(self):
        with open(f'./data/days_{self.year}/processed/days_{self.year}_sentencesL.txt', 'r') as in_file:
            text = in_file.read()
            text = text.split()
            
            return text
    def collect_examples(self, text):
        indexes = random.sample(range(len(text)), self.n_examples)
        examples = [text[index] for index in indexes]
        
        return examples
        
    def compare(self):
        with open('./data/parole.txt', 'r') as dictionary:
            dictionary = dictionary.readlines()
            unique = [word.strip('\n') for word in dictionary]
            text = self.load_text()
            examples = self.collect_examples(text)
            results = []
            errors = []
            for example in examples:
                if example in unique:
                    results.append(1)
                else:
                    results.append(0)
                    with open('./data/lemma_errors.txt', 'a+') as errors:
                        print(example, file=errors)
            perc = (sum(results)/self.n_examples)*100
        
        return perc
  
    def run(self):
                
        seeds = random.sample(range(9999), self.n_iterations)
        iterations = []
        
        for seed in seeds:
            random.seed(seed)
            iterations.append(self.compare())
        
        average = (sum(iterations)/len(iterations))
        
        print (f"Values for {self.n_iterations}:")
        print(iterations)
        print (f"Average value: {average}")
        
       

In [86]:
LemmaTester(2019, 100, 10).run()

Values for 10:
[93.0, 88.0, 90.0, 91.0, 86.0, 89.0, 84.0, 88.0, 95.0, 87.0]
Average value: 89.1


In [6]:
years=['test']
user_stopwords = ['bla', 'peró', 'cosí', 'bon', 'ehm', 'bhe', 'uh', 'aaaah', 'aaah', 'ahhhh', 'noooo', 'ahahahaha',
                  'hahahah', 'mmh', 'mhh', 'ahahha', 'mmmh', 'ah', 'ehh', 'eheh', 'ohi', 'ehe']


for year in years:
    #Process(year).prepostprocess()
    Process(year).postprocess(5)


Filtering 880 words with freq lower than 5


FileNotFoundError: [Errno 2] No such file or directory: './data/days_test/processed/days_test_sentencesL_prepost.txt'

In [82]:
df = pd.read_csv('.data/days_2019/freqs_2019.txt', sep='\t')
print(df.head())

sns.lineplot(data=df, x='Unnamed: 0', y='Count')


   Unnamed: 0   Lemma   Count
0           0  essere  450312
1           1      il  249948
2           2    fare  233447
3           3   avere  221198
4           4  potere  130037


TypeError: unsupported operand type(s) for /: 'str' and 'int'