In [1]:
from collections import Counter
from pathlib import Path
import json
import os

import csv
import numpy as np
import pandas as pd
import spacy

import helpers as hp

In [2]:
# Load the spaCy pipeline; the resulting `nlp` object is passed to the
# helpers' sentence and document iterators in later cells.
# NOTE(review): 'en_md' looks like a shortcut link / local alias rather than a
# published package name (e.g. 'en_core_web_md') — confirm it resolves here.
nlp = spacy.load('en_md')

In [3]:
def fss(tokens, pos, neg):
    """
    Compute the Financial Stability Sentiment (FSS) index of one document.

    The score is ``(n_neg - n_pos) / n_total``, where ``n_neg`` and ``n_pos``
    are the numbers of token occurrences found in ``neg`` and ``pos``, and
    ``n_total`` is the number of tokens counted. Larger values therefore
    indicate a more negative tone.

    Parameters
    ----------
    tokens : list or iterable
        Hashable tokens of the document; repeated tokens count every time.
    pos : set
        Positive terms. Only membership (``in``) is used, so any container
        works, but a set keeps each lookup O(1).
    neg : set
        Negative terms; same requirements as ``pos``.

    Returns
    -------
    float
        The FSS score, or ``np.nan`` when the document has no tokens (or if
        the division fails for any other reason).
    """
    # Imported here so the catch-all branch below can actually log; the name
    # was previously undefined and that branch raised NameError instead.
    import logging

    fd = Counter(tokens)

    emopos = sum(c for w, c in fd.items() if w in pos)
    emoneg = sum(c for w, c in fd.items() if w in neg)
    total = sum(fd.values())

    # Sign convention: negative minus positive.
    emodiff = emoneg - emopos

    try:
        score = emodiff / total
    except ZeroDivisionError:
        # Empty document: the index is undefined.
        score = np.nan
    except Exception as e:
        # Best-effort: keep the batch running and record what went wrong.
        score = np.nan
        logging.info('ERROR inesperado calculando FSS: {}'.format(e))

    return score

In [4]:
# Input locations (machine-specific absolute paths).
# NOTE(review): consider deriving these from a configurable base directory so
# the notebook runs on other machines.
dir_corpus = '/Users/tombito/Downloads/estabilidad/reportes/en/corpus/'
pathstops = '/Users/tombito/Dropbox/datasets/wordlists/stopwords/stopwords.xlsx'
wdlist = '/Users/tombito/Dropbox/datasets/wordlists/fss.json'

# Output folder, relative to the working directory; created if missing.
dir_output = 'isref'
os.makedirs(dir_output, exist_ok=True)

In [5]:
# Load the FSS lexicon: a JSON object holding the 'positive' and 'negative'
# term lists that are later passed to fss().
with open(wdlist, encoding='utf-8') as f:
    # The text is already decoded by open(); json.load() no longer accepts an
    # `encoding` keyword (removed in Python 3.9, where it raises TypeError).
    diction = json.load(f)

# Index directly so a missing key fails here with a KeyError instead of
# producing None and an obscure TypeError later inside fss().
positive = diction['positive']
negative = diction['negative']

In [6]:
# Preprocessing options, bundled into `extra` and handed to the helpers'
# sentence and document iterators.
stops = hp.load_stopwords(pathstops, 'english', col='word')
tags = [
    'NOUN', 'VERB', 'ADJ', 'ADV',
    'ADP', 'AUX', 'DET', 'PRON',
]
# NOTE(review): spaCy's English pipelines usually label people as 'PERSON',
# not 'PER' — confirm how helpers uses these entity labels.
ents = ['PER', 'ORG']

extra = {'stopwords': stops, 'postags': tags, 'entities': ents}
# Optional extra key: stemmer=SnowballStemmer('spanish'),
# after importing it with `from nltk.stem import SnowballStemmer`.

In [7]:
# Fit the n-gram model on the sentences that the helpers iterate from the
# corpus directory, using the spaCy pipeline and the options in `extra`.
# NOTE(review): presumably a phrase/collocation model — confirm in
# helpers.model_ngrams.
ngramas = hp.model_ngrams(hp.iter_sentences(dir_corpus, nlp, extra))

In [8]:
# Document names for the corpus directory; paired positionally with the
# scores when the results table is built.
# NOTE(review): assumes hp.get_docnames and hp.iter_documents traverse the
# corpus in the same order — confirm in helpers.
docnames = hp.get_docnames(dir_corpus)

In [9]:
# One FSS score per document, in the order the helper yields the documents.
scores = list(map(
    lambda doc_tokens: fss(doc_tokens, positive, negative),
    hp.iter_documents(ngramas, dir_corpus, nlp, extra),
))

In [10]:
# Results table: one row per document with its FSS score, saved as CSV in the
# output folder.
isref = pd.DataFrame(dict(doc=docnames, score=scores))
isref.to_csv(Path(dir_output, 'isref.csv'), encoding='utf-8', index=False)

In [11]:
# Display the results table (one row per document).
isref

Unnamed: 0,doc,score
0,2002-12-01,-0.003219
1,2003-07-01,-0.008579
