# Extracción de datos de los discursos

Primero, usaremos nltk para extraer todas las palabras mencionadas en todos los discursos

In [None]:
import os
import json

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction import DictVectorizer

import numpy as np

In [None]:
# Shared NLP helpers used by the extraction cells below.

# Converts a list of per-document {word: count} dicts into a count matrix.
dict_vectorizer = DictVectorizer()

# Word tokenizer: each token is a maximal run of `\w` characters.
tokenizer = RegexpTokenizer(r'\w+')

# Pre-trained Punkt model used to split Spanish paragraphs into sentences.
spanish_sentence_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')

# Spanish stopwords excluded from the word counts.
STOPWORDS = stopwords.words('spanish')

Iteramos sobre los archivos .txt generados con parse_pdf.ipynb, identificamos cada párrafo y usamos `spanish_sentence_tokenizer` para separarlo en oraciones. Después usamos `tokenizer` para dividir cada oración en palabras.

In [None]:
# Parse every speech transcript. For each file we store its metadata and
# sentences in `messages` and its word frequencies in `all_counts`; both
# lists follow the sorted file order, so index i refers to the same speech.
folder_path = '/opt/projects/mensaje_presidencial/textos/txt'
files = sorted(os.listdir(folder_path))

messages = []
all_counts = []

# Build the stopword lookup once: membership tests on a set are O(1), while
# testing against the original sequence rescans it for every single token.
stopword_set = set(STOPWORDS)

for file in files:
    # Skip hidden entries (names starting with a dot).
    if file.startswith('.'):
        continue

    # The part of the file name before the first '.' must be
    # "<year>_<president>"; any other shape raises ValueError here.
    year, president = file.split('.')[0].split('_')
    message = {'year': year, 'president': president}

    txt_path = os.path.join(folder_path, file)
    # Use an explicit encoding so accented characters are decoded the same way
    # on every platform instead of depending on the locale default.
    # NOTE(review): assumes the transcripts are UTF-8, matching the encoding
    # used when this notebook writes its JSON output - confirm.
    with open(txt_path, 'r', encoding='utf-8') as fopen:
        text = fopen.read()

    # Paragraphs are separated by a blank line in the transcripts.
    paragraphs = text.split("\n\n")

    all_sentences = []
    counts = {}

    for paragraph in paragraphs:
        sentences = spanish_sentence_tokenizer.tokenize(paragraph)
        all_sentences.extend(sentences)
        for sentence in sentences:
            words = tokenizer.tokenize(sentence)
            for word in words:
                word = word.lower()
                # Ignore stopwords and purely numeric tokens.
                if word in stopword_set or word.isnumeric():
                    continue
                counts[word] = counts.get(word, 0) + 1

    all_counts.append(counts)
    message['sentences'] = all_sentences
    messages.append(message)

Usamos `dict_vectorizer` para calcular cuántas veces una palabra es mencionada en cada documento. Después filtramos sólo aquéllas que aparezcan más de 10 veces en total. Este umbral es arbitrario, pero nos ayuda a deshacernos de palabras que no son importantes para el análisis.

In [None]:
# Build the document-term matrix from the per-speech word-count dicts:
# one row per entry of `all_counts`, one column per distinct word.
count_matrix = dict_vectorizer.fit_transform(all_counts)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available and fall back on older releases.
if hasattr(dict_vectorizer, 'get_feature_names_out'):
    feature_names = dict_vectorizer.get_feature_names_out()
else:
    feature_names = dict_vectorizer.get_feature_names()

# Total occurrences of each word across all speeches. np.asarray(...).ravel()
# yields a flat array whether the sparse sum is returned as an np.matrix or
# as a plain ndarray (`.getA()` only exists on np.matrix).
all_cnts = np.asarray(count_matrix.sum(axis=0)).ravel()

# Keep only the words mentioned more than 10 times overall.
greater_than = all_cnts > 10
filtered_cnts = count_matrix[:, greater_than]
filtered_features = np.array(feature_names)[greater_than]
filtered_cnts.shape

Ahora generamos un diccionario del tipo `{'palabra': [cnt1, cnt2, cnt3, ...]}` para que sea más fácil ocuparlo cuando hagamos la visualización

In [None]:
# Map every retained word to its list of per-speech counts (row order of
# `filtered_cnts`), i.e. {'word': [cnt1, cnt2, cnt3, ...]}.
#
# The sparse matrix is converted to a dense array once instead of slicing and
# densifying one column per word. `.tolist()` produces plain Python numbers
# and `str()` plain Python strings, so the JSON serialization done later does
# not depend on how NumPy scalar types happen to be handled.
dense_cnts = filtered_cnts.toarray()
words_obj = {
    str(feature): dense_cnts[:, idx_feature].tolist()
    for idx_feature, feature in enumerate(filtered_features)
}

Definimos los períodos presidenciales

In [None]:
# Presidential terms covered by the speeches. Each row is expanded into a
# dict with the keys listed in `_PRESIDENT_FIELDS`, in that order;
# `year_start` and `year_end` are both inclusive.
_PRESIDENT_FIELDS = ('name', 'surname', 'year_start', 'year_end', 'img')
_PRESIDENT_ROWS = (
    ('Patricio', 'Aylwin', 1990, 1993, './img/Aylwin.jpg'),
    ('Eduardo', 'Frei', 1994, 1999, './img/Frei.jpeg'),
    ('Ricardo', 'Lagos', 2000, 2005, './img/Lagos.jpeg'),
    ('Michelle', 'Bachelet', 2006, 2009, './img/Bachelet1.jpg'),
    ('Sebastián', 'Piñera', 2010, 2013, './img/Pinera1.jpg'),
    ('Michelle', 'Bachelet', 2014, 2017, './img/Bachelet2.jpg'),
    ('Sebastián', 'Piñera', 2018, 2018, './img/Pinera2.jpg'),
)
presidents = [dict(zip(_PRESIDENT_FIELDS, row)) for row in _PRESIDENT_ROWS]

Juntamos toda esta información en una sola variable y la guardamos en la carpeta `data` que será ocupada luego

In [None]:
# Bundle everything into a single JSON-serializable dict: the covered years
# (1990 through 2018 inclusive), the presidential terms, and the per-word
# counts.
obj = {
    'years': list(range(1990, 2019)),
    'presidents': presidents,
    'terms': words_obj,
}

folder_path = '/opt/projects/mensaje_presidencial/data'
file_path = os.path.join(folder_path, 'data.json')
# The `with` block closes the file on exit, so no explicit close() is needed.
# json.dump writes directly to the handle and produces the same text as
# writing the result of json.dumps(obj).
with open(file_path, 'w', encoding='utf-8') as f:
    json.dump(obj, f)