In [1]:
from collections import Counter
from pathlib import Path
import json
import os

import numpy as np
import pandas as pd
import plotly.offline as pyo
import plotly.graph_objs as go
import spacy

import helpers as hp

In [2]:
# Load the spaCy pipeline registered under the name 'en_md'; it is later passed
# to score_doc() as the `lang` callable.
# NOTE(review): 'en_md' is not a stock spaCy package name (those look like
# 'en_core_web_md'); presumably a local shortcut link -- confirm it exists in
# the target environment.
nlp = spacy.load('en_md')

In [3]:
def fss(tokens, pos, neg):
    """
    Compute the Financial Stability Sentiment (FSS) index of a document.

    The index is ``(negative - positive) / total``, where ``negative`` and
    ``positive`` are the number of tokens found in each word list and
    ``total`` is the number of tokens in the document. Larger values mean
    that negative words outweigh positive ones.

    Parameters
    ----------
    tokens: list or iterable
        Hashable tokens of the document (consumed once).
    pos: set or iterable
        Positive words.
    neg: set or iterable
        Negative words.

    Returns
    -------
    float
        The FSS score, or ``numpy.nan`` when the document has no tokens.
    """
    fd = Counter(tokens)
    total = sum(fd.values())

    # An empty document has no defined score; checking up front replaces the
    # former ZeroDivisionError handler (its generic fallback branch referenced
    # an un-imported `logging` module).
    if total == 0:
        return np.nan

    # Materialize the word lists as sets: membership tests become O(1) and
    # one-shot iterables are no longer exhausted by the first lookup.
    if not isinstance(pos, (set, frozenset)):
        pos = frozenset(pos)
    if not isinstance(neg, (set, frozenset)):
        neg = frozenset(neg)

    emopos = sum(c for w, c in fd.items() if w in pos)
    emoneg = sum(c for w, c in fd.items() if w in neg)

    return (emoneg - emopos) / total

In [4]:
def score_doc(fpath, pos, neg, lang, other=None):
    """
    Score the document stored at `fpath` with the Financial Stability
    Sentiment index.

    The file is read with ``hp.read_text``, parsed by calling `lang` on the
    text, split with ``hp.doc_sentences`` and the tokens of every sentence
    are pooled before being handed to ``fss``.

    Parameters
    ----------
    fpath: str or Path
        Location of the document.
    pos: list or set or iterable
        Positive words.
    neg: list or set or iterable
        Negative words.
    lang: spacy.lang
        Callable that turns the raw text into a parsed document.
    other: dict, optional
        Forwarded untouched to ``hp.doc_sentences`` (per the original notes:
        stopwords, postags, entities, stemmer).

    Returns
    -------
    float
    """
    parsed = lang(hp.read_text(fpath))

    # Flatten the per-sentence token groups into one list for the document.
    pooled = [tok for sent in hp.doc_sentences(parsed, other) for tok in sent]

    return fss(pooled, pos, neg)

In [5]:
# Input locations: the reports folder and its 'corpus' subfolder.
dir_docs = '/Users/tombito/Dropbox/datasets/banrep/fsr/reports/'
dir_corpus = os.path.join(dir_docs, 'corpus')

# Results go under ./isref/<last component of dir_docs>; create it if needed.
dir_output = str(Path('isref') / Path(dir_docs).name)
Path(dir_output).mkdir(parents=True, exist_ok=True)

In [6]:
# Load the FSS word lists from a JSON file with 'positive' and 'negative' keys.
wdlist = '/Users/tombito/Dropbox/datasets/wordlists/fss.json'
with open(wdlist, encoding='utf-8') as f:
    # Decoding is already done by open(); json.load() stopped accepting an
    # `encoding` keyword in Python 3.9 and raises TypeError if it is passed.
    diction = json.load(f)

# .get() yields None when a key is absent from the file.
positive = diction.get('positive')
negative = diction.get('negative')

In [7]:
# Stopwords: the 'english' entry of the spreadsheet, read from column 'word'.
pathstops = '/Users/tombito/Dropbox/datasets/wordlists/stopwords/stopwords.xlsx'
stops = hp.load_stopwords(pathstops, 'english', col='word')

# Optional part-of-speech filter, currently disabled:
# tags = ['NOUN', 'VERB', 'ADJ', 'ADV', 'ADP', 'AUX', 'DET', 'PRON']

# Entity labels bundled into the options below.
# NOTE(review): stock English spaCy pipelines label people as 'PERSON', not
# 'PER' -- confirm this matches the labels produced by the loaded model.
ents = ['PER', 'ORG']

# Options passed as `other` when scoring documents.
extra = {'stopwords': stops, 'entities': ents}
# Other optional keys: postags=tags, stemmer=SnowballStemmer('spanish')
# (the latter after `from nltk.stem import SnowballStemmer`).

In [8]:
# Score every file of the corpus; one record per document, keyed by file stem.
scores = []
for fpath in hp.ordered_filepaths(dir_corpus):
    scores.append({
        'score': score_doc(fpath, positive, negative, nlp, extra),
        'doc': fpath.stem,
    })

In [9]:
# Tabulate the per-document scores and save them as CSV without the index.
isref = pd.DataFrame(scores)
csv_path = os.path.join(dir_output, 'isref.csv')
isref.to_csv(csv_path, index=False, encoding='utf-8')

In [11]:
# Plot the ISREF series and write it to an HTML file with plotly offline.
# Document names are parsed as dates in YYYY-MM-DD form for the x axis.
fechas = pd.to_datetime(isref['doc'], format='%Y-%m-%d')

# Settings shared by both axes; each axis adds its own title below.
axis = {
    'showline': True,
    'zeroline': True,
    'showgrid': True,
    'gridcolor': '#ffffff',
    'automargin': True,
}

trace = go.Scatter(
    x=fechas,
    y=isref['score'],
    line={'width': 2, 'color': '#b04553'},
    marker={'size': 8, 'color': '#b04553'},
    name='ISREF',
)

layout = {
    'title': 'Sentimiento de Reportes de Estabilidad Financiera',
    'width': 800,
    'height': 600,
    'xaxis': dict(axis, title='Fecha'),
    'yaxis': dict(axis, title='ISREF', hoverformat='.3f'),
    'showlegend': False,
    'autosize': True,
    'plot_bgcolor': 'rgba(228, 222, 249, 0.65)',
}

fig = {'data': [trace], 'layout': layout}
filename = os.path.join(dir_output, 'isref.html')
cohfile = pyo.plot(fig, show_link=False, filename=filename)