In [1]:
## FINAL VERSION: WRITES ALL RESULTS TO A SINGLE FILE, ALREADY SORTED BY DATE.
## YOU CAN INTERRUPT THE SCRIPT AND RUN IT AGAIN; IT WILL RESUME FROM WHERE IT STOPPED.

import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from openpyxl import Workbook, load_workbook
from pdfminer.high_level import extract_text
import pyphen
from openpyxl.utils import get_column_letter

# Download the NLTK resources used by the tokenizers and the stop-word filter.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

# The Loughran-McDonald dictionary is read as a CSV straight from this URL.
url = 'https://drive.google.com/u/0/uc?id=17CmUZM9hGUdGYjCXcjQLyybjTrcjrhik&export=download'
loughran_mcdonald_dict = pd.read_csv(url)

# Columns of the Loughran-McDonald dictionary that the script reads:
# the word itself plus one column per category that gets counted.
word_column = 'Word'
category_columns = [
    'Negative',
    'Positive',
    'Uncertainty',
    'Litigious',
    'Constraining',
    'Strong_Modal',
    'Weak_Modal',
    'Syllables',
]

# Pyphen hyphenator (English) used to split words when counting syllables.
dic = pyphen.Pyphen(lang='en')

# Sentiment analysis: receives the raw text (tokenized inside the function)
# and the dictionary DataFrame loaded from the URL.

def sentiment_analysis(text, dictionary):
    """Count, per category, the tokens of *text* flagged in *dictionary*.

    The text is tokenized with ``word_tokenize`` and English stop words are
    dropped (case-insensitively). Each remaining token is upper-cased and
    looked up in the ``word_column`` column of *dictionary*; for every
    column in ``category_columns`` whose value on the matching row is
    greater than zero, that category's counter is incremented. When a word
    appears on several rows, the first row is used.

    Args:
        text: Raw text to analyse.
        dictionary: pandas DataFrame containing ``word_column`` and every
            column listed in ``category_columns``.

    Returns:
        dict mapping each name in ``category_columns`` to an int count.
    """
    # Build the stop-word set once per call: set membership is O(1), whereas
    # re-reading the stop-word list for every token is needlessly slow.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in word_tokenize(text) if word.lower() not in stop_words]

    # Index the dictionary once (word -> category values of its first row)
    # instead of scanning the whole DataFrame for every token and category.
    lookup = {}
    selected = dictionary[[word_column] + category_columns]
    for entry, *values in selected.itertuples(index=False, name=None):
        lookup.setdefault(entry, values)  # setdefault keeps the first row

    results = {category: 0 for category in category_columns}
    for word in tokens:
        values = lookup.get(word.upper())
        if values is None:
            continue
        for category, value in zip(category_columns, values):
            if value > 0:
                results[category] += 1
    return results

# Syllable and word counting

def count_syllables(words):
    """Return the total number of hyphenation pieces across *words*.

    Each word is hyphenated with the module-level ``dic`` hyphenator and
    split on ``'-'``; the number of resulting pieces is used as that word's
    syllable count.
    """
    total = 0
    for token in words:
        pieces = dic.inserted(token).split('-')
        total += len(pieces)
    return total

def count_words_sents_syllables(text):
    """Return ``(word_count, sentence_count, syllable_count)`` for *text*.

    Sentences come from ``sent_tokenize``. Words come from ``word_tokenize``
    restricted to tokens that contain at least one alphanumeric character:
    ``word_tokenize`` emits punctuation marks as separate tokens, and
    counting those as words (and as one syllable each) would distort the
    readability figures derived from these counts.

    Args:
        text: Raw text to measure.

    Returns:
        Tuple of three ints: number of words, number of sentences, and the
        syllable total computed by ``count_syllables`` over the words.
    """
    sentences = sent_tokenize(text)
    # Keep only real words; drop pure-punctuation tokens such as ',' or '.'.
    words = [
        token for token in word_tokenize(text)
        if any(char.isalnum() for char in token)
    ]
    syllables = count_syllables(words)
    return len(words), len(sentences), syllables

def calculate_fki(word_count, sent_count, syllable_count):
    """Compute the FKI readability score from the three counts.

    The score is ``0.39 * words/sentences + 11.8 * syllables/words - 15.59``.

    Args:
        word_count: Number of words.
        sent_count: Number of sentences.
        syllable_count: Number of syllables.

    Returns:
        The score as a float, or ``0`` when either division raises
        ``ZeroDivisionError`` (no sentences or no words).
    """
    try:
        words_per_sentence = word_count / sent_count
        syllables_per_word = syllable_count / word_count
    except ZeroDivisionError:
        return 0
    return 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

# Directories: PDFs are read from <cwd>/PAISES_OCDE and the Excel file is
# written to the current working directory.

base_directory = os.getcwd()
countries_directory = os.path.join(base_directory, 'PAISES_OCDE')

excel_directory = base_directory
excel_file = 'LMSI_OCDE_AGRUPADO.xlsx'
excel_path = os.path.join(excel_directory, excel_file)

# Load the existing workbook, or create a new one with a header row.
# Only the load itself is inside the try, so nothing else can be mistaken
# for a missing file.
try:
    wb = load_workbook(excel_path)
except FileNotFoundError:
    wb = Workbook()
    ws = wb.active
    ws.append(['Country', 'Filename'] + category_columns + ['Word Count', 'Sentence Count', 'Syllable Count', 'LMSI', 'FKI'])
else:
    ws = wb.active

# (country, filename) pairs already present in the sheet (columns A and B,
# skipping the header row). Stored as a set because it is consulted once per
# candidate PDF, and set membership is O(1).
processed_entries = set(ws.iter_rows(min_row=2, max_col=2, values_only=True))



# The workbook is saved to a sibling temporary file and then swapped into
# place, so interrupting the script during a save cannot leave a truncated
# Excel file that would break the next (resumed) run. The temporary name
# keeps the original extension.
_excel_stem, _excel_ext = os.path.splitext(excel_path)
tmp_excel_path = f"{_excel_stem}.tmp{_excel_ext}"

for root, dirs, files in os.walk(countries_directory):
    # Sorting in place makes os.walk visit sub-directories in a
    # deterministic (alphabetical) order instead of the OS listing order.
    dirs.sort()
    country_name = os.path.basename(root)
    # Sort the files before processing; skip anything that is not a PDF
    # (extension compared case-insensitively) or is already in the sheet.
    pending = sorted(
        f for f in files
        if f.lower().endswith('.pdf') and (country_name, f) not in processed_entries
    )
    for filename in pending:
        file_path = os.path.join(root, filename)
        text = extract_text(file_path)
        results = sentiment_analysis(text, loughran_mcdonald_dict)
        word_count, sent_count, syllable_count = count_words_sents_syllables(text)
        fki_value = calculate_fki(word_count, sent_count, syllable_count)
        # LMSI = positive / (positive + negative); 0 when neither occurs.
        polar_total = results['Positive'] + results['Negative']
        lmsi = results['Positive'] / polar_total if polar_total > 0 else 0
        ws.append([country_name, filename] + [results[category] for category in category_columns] + [word_count, sent_count, syllable_count, lmsi, fki_value])
        # Save after every PDF so an interrupted run can resume; os.replace
        # swaps the finished file in as a single step.
        wb.save(tmp_excel_path)
        os.replace(tmp_excel_path, excel_path)
        print(f"Results for {country_name}/{filename} saved to: {excel_path}")



KeyboardInterrupt

