In [79]:
import logging
import time
import os
import shutil
import re
from nltk.corpus import stopwords
import stanza
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [49]:
def clean(year, user_stopwords):   
    
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    directory = f'./data/days_{year}/raw'

    file_list =[]

    for entry in os.scandir(directory):
        if entry.path.endswith("_{}.txt".format(year)):
            file_list.append(entry)
    
    if os.path.isdir(f'./data/days_{year}/clean'):
        shutil.rmtree(f'./data/days_{year}/clean')
        os.mkdir(f'./data/days_{year}/clean')

    init_time = time.time()
    i=0
    for file in file_list:
        start_time = time.time()

        file_name = str(os.path.basename(file)).strip('.txt')
        print('Processing file: {}'.format(str(os.path.basename(file))))
        with open(file,'r', encoding = 'utf-8') as in_file:
            text = in_file.readlines()
            for line in text:
                line = line.lower() #lowercasing
                line = re.sub(r'http/S+', ' URL', line) #Removes URLs
                line = re.sub(r'[!"”$%&()*+,-.//:;<=>?@/[/]^_`{|}~…»•😀❤️😀🤔🤣😭😅🙄😉—]', '', line)   #Removes [] and other special chars
                line = re.sub(r'#x200B', '', line)
                z=1
                
                line=line.split()
                for word in line:
                    if word in stopwords.words('italian') or word in user_stopwords:
                        line.remove(word)
                    if len(word)>24:
                        line.insert(line.index(word),'LONG')
                        line.remove(word)
                    if z%3000==0: #provides buffer for RAM usage
                        text.insert(z, '/n/n')
                    z+=1   

                line = ' '.join(line)
                with open('./data/days_{}/clean/{}.txt'.format(year, file_name), 'a+', encoding='utf-8') as out_file:
                    print (line, file=out_file)

        i+=1
        elapsed_time = time.time() - start_time
        print ('Processed {} in {} - {}/{}'.format(file_name+'.txt', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)), i, len(file_list)))

    total_time = time.time() - init_time
    print ('Processed all {} of {} files in {}'.format(i, len(file_list), time.strftime("%H:%M:%S", time.gmtime(total_time))))

def preprocess(year):

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    directory = f'./data/days_{year}/clean'
    processed_dir= f'./data/days_{year}/processed/processed_days'

    file_list = []
    done_list = []

    for entry in os.scandir(directory):
        if entry.path.endswith("_clean.txt"):
            file_list.append(entry)

    for entry in os.scandir(processed_dir):
        if entry.path.endswith("sentencesL.txt"):
            entry_name = str(os.path.basename(entry).strip('sentencesL.txt')[:-1])
            done_list.append(entry_name)

    for done_entry in done_list:
        done_path = done_entry+'.txt'
        for entry in file_list:
            if entry.path.endswith(done_path):
                file_list.remove(entry)

    error = False
    for f in done_list:
        if f in file_list:
            error = True
            print(f, 'IN LIST! ERROR!')
    if error:
        quit()
    else:
        print("Lists OK!")

    print (len(file_list))

    if len(file_list) == 0:
        print('ALL DONE!')
    else:
        init_time = time.time()

        nlp = stanza.Pipeline(lang='it', processors='tokenize, mwt, pos, lemma', tokenize_batch_size=25, mwt_batch_size=25, pos_batch_size=250, lemma_batch_size=25, lemma_max_dec_len=25, logging_level='WARN', use_gpu=True)
        i=0
        for file in file_list:
            with open(file, 'r', encoding='utf-8') as in_file:

                start_time = time.time()
                file_name = str(os.path.basename(file)[:-4])
                print('Processing file: {}'.format(file_name+'.txt'))
                text = in_file.read()
                doc = nlp(text)

                with open('./data/days_{}/processed/processed_days/{}_sentencesL.txt'.format(year, file_name), 'w+', encoding = 'utf-8') as out_file:

                    for s in doc.sentences:
                        s_list = []
                        for w in s.words:
                            s_list.append(w.text)

                        print (" ".join(s_list), file=out_file) 

            i+=1
            elapsed_time = time.time() - start_time

            print ('Processed {} in {} - {}/{}'.format(file_name+'.txt', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)), i, len(file_list)))
            
        total_time = time.time() - init_time
        print ('Processed all {} of {} files in {}'.format(i, len(file_list), time.strftime("%H:%M:%S", time.gmtime(total_time))))

def join(year):
    directory = f'./data/days_{year}/processed'

    file_list =[]

    for entry in os.scandir(directory):
        if entry.path.endswith("{}_clean_sentencesL.txt".format(year)):
            file_list.append(entry)
            
    if os.path.isfile(f'./data/days_{year}/processed/days_{year}_sentencesL.txt'):
        os.remove(f'./data/days_{year}/processed/days_{year}_sentencesL.txt')
        
    i=0
    for file in file_list:

        with open(file, 'r', encoding='utf-8') as small_file:
            text = small_file.read().lower()
            with open(f'./data/days_{year}/processed/days_{year}_sentencesL.txt', 'a+', encoding='utf-8') as big_file:
                print (text, file=big_file)
                print ('/n/n', file=big_file)
        i+=1
        print ('Processed {} of {} files'.format(i, len(file_list)))
   

In [70]:
year=2020
user_stopwords = ['bla', 'peró', 'cosí', 'bon', 'ehm', 'bhe', 'uh', 'aaaah', 'aaah', 'ahhhh', 'noooo', 'ahahahaha',
                  'hahahah', 'mmh', 'mhh', 'ahahha', 'mmmh', 'ah', 'ehh', 'eheh', 'ohi', 'ehe']

#clean(year, user_stopwords)
#preprocess(year)
join(year)


Processed 1 of 306 files
Processed 2 of 306 files
Processed 3 of 306 files
Processed 4 of 306 files
Processed 5 of 306 files
Processed 6 of 306 files
Processed 7 of 306 files
Processed 8 of 306 files
Processed 9 of 306 files
Processed 10 of 306 files
Processed 11 of 306 files
Processed 12 of 306 files
Processed 13 of 306 files
Processed 14 of 306 files
Processed 15 of 306 files
Processed 16 of 306 files
Processed 17 of 306 files
Processed 18 of 306 files
Processed 19 of 306 files
Processed 20 of 306 files
Processed 21 of 306 files
Processed 22 of 306 files
Processed 23 of 306 files
Processed 24 of 306 files
Processed 25 of 306 files
Processed 26 of 306 files
Processed 27 of 306 files
Processed 28 of 306 files
Processed 29 of 306 files
Processed 30 of 306 files
Processed 31 of 306 files
Processed 32 of 306 files
Processed 33 of 306 files
Processed 34 of 306 files
Processed 35 of 306 files
Processed 36 of 306 files
Processed 37 of 306 files
Processed 38 of 306 files
Processed 39 of 306 f

In [87]:
def freqs(year):
    
    freqs = {}
    
    with open(f'./data/days_{year}/processed/days_{year}_sentencesL.txt', 'r') as in_file:
        text = in_file.readlines()
        
        for line in text:
            line = line.split()
            for word in line:
                if word not in freqs.keys():
                    freqs[word] = 1
                else:
                    freqs[word] += 1
    
    freqs = dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))
    df = pd.DataFrame(freqs.items(), columns=['Lemma', 'Count'])
    
    with open(f"./data/days_{year}/freqs_{year}.tsv", 'w+') as out_file:
        df.to_csv(out_file, sep='\t')
    
    return freqs

def filter_rare(freqs, threshold):
    rare = []
    for key,value in freqs.items():
        if value <= threshold:
            rare.append(key)
    print(len(rare))
    print(rare[:50])
    return rare

In [88]:
rare1 = filter_rare(freqs(2020), 1)


224069
['40/50k', 'accontentatico', 'dovente', 'noordbrabant', 'orettacol', 'necessario50', 'trasferimere', 'trusttechnologies', 'iva1281', 'phoning', 'logicista', 'ragazzzino', 'mhanon', 'galeramo', 'salvando', 'quantomidispiacetigiuro', 'avvicinereste', 'tramita', 'ororificenza', 'coivolgere', 'cz80010', 'poutpourro', 'magaloff', 'grammophon/', 'costringile', 'dicoquesta', 'merdaviglioso', '1ml€', 'lisittare', '5yeare', 'respinto', 'leggertelere', 'aululario', 'comunicazionequasi', 'maieppure', 'ossessivare', 'regigigas', 'tralquattroelcinque', 'akkusativ', 'dativ', 'vaselinaa', 'inccool8', 'epigrafo', 'eeeehforso', 'fontecedro', 'disgustorama', 'nove9', 'medieliceo', 'ribattevo', 'aywa']


In [82]:
df = pd.read_csv('.data/days_2019/freqs_2019.txt', sep='\t')
print(df.head())

sns.lineplot(data=df, x='Unnamed: 0', y='Count')


   Unnamed: 0   Lemma   Count
0           0  essere  450312
1           1      il  249948
2           2    fare  233447
3           3   avere  221198
4           4  potere  130037


TypeError: unsupported operand type(s) for /: 'str' and 'int'