# Sentiment Lexicons

In [1]:
%config InlineBackend.figure_format='retina'
import logging
from ekorpkit import eKonf

# Configure the root logger so that only WARNING and above are emitted.
logging.basicConfig(level=logging.WARNING)
# Print the installed ekorpkit version so the recorded outputs can be tied to it.
print(eKonf.__version__)

0.1.31+16.g2b3b245.dirty


## Instantiating a lexicon class

In [3]:
# Compose the configuration for the `mpko_lex` sentiment lexicon.
config_group='model/sentiment/lexicon=mpko_lex'
cfg = eKonf.compose(config_group=config_group)
# NOTE(review): the key is spelled `ngram_distiance_tolerance` ("distiance").
# Confirm it matches the key defined in the library's config before renaming it;
# a differently spelled key here would not override the intended setting.
cfg.analyze.ngram_distiance_tolerance = 1
# Build the lexicon object from the composed config and print its description.
lexicon = eKonf.instantiate(cfg)
print(lexicon)

Lexicon
----------
name: mpko_tone_lex
fullname: Korean Monetary Policy Tone Lexicon - Lexical Approach
source: https://github.com/entelecheia/eKoNLPy
lang: ko
lexicon_features: ['label', 'polarity', 'intensity', 'pos_score', 'neg_score']
lexicon_path: /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/mpko/mp_polarity_lexicon_lex.parquet
word_key: word
ignore_pos: False



In [5]:
# Example sentence. NOTE(review): `text` is not used in this cell; only the
# hand-written `tokens` list below is analyzed.
text = "투기를 억제하기 위해 금리를 인상해야 한다."
# Mix of single tokens and ';'-joined n-gram tokens, written without POS tags.
tokens = ['투기', '억제', '금리', '인상', '인상', '투기;억제', '금리;인상', '견조', '호황', '힘들']

# NOTE(review): the recorded output of this cell is an empty dict. The lexicon
# was printed with `ignore_pos: False` and its entries carry POS tags
# (e.g. `금리/nn;인상/nn`), so these untagged tokens presumably never match.
# Confirm, then either pass POS-tagged tokens or set `cfg.ignore_pos = True`
# before instantiating the lexicon.
features = lexicon.analyze(tokens, features=['label', 'polarity'])
print(features)

{}


In [8]:
# Display the lexicon's underlying table (judging by the recorded output, a
# DataFrame with word, n-gram length, label, polarity and score columns).
# NOTE(review): `_lexicon` is a private attribute (leading underscore) and the
# whole table is displayed; consider a public accessor and `.head()` if available.
lexicon._lexicon

Unnamed: 0,word,_word,_ngram_len,label,polarity,intensity,pos_score,neg_score
11420,완화/nn;유지/nn;완화/nn;점차/ma;줄/vv,"(완화/nn, 유지/nn, 완화/nn, 점차/ma, 줄/vv)",5,1,0.003044,1.106251,13.640993,12.330827
21777,변동성/nn;확대/nn;가능성/nn;배제/nn;없/va,"(변동성/nn, 확대/nn, 가능성/nn, 배제/nn, 없/va)",5,-1,-0.346022,2.167654,11.193045,24.262653
3144,기대/nn;인플레이션/nn;억제/nn;금리/nn;인상/nn,"(기대/nn, 인플레이션/nn, 억제/nn, 금리/nn, 인상/nn)",5,1,0.180585,1.548528,17.188444,11.099858
3145,물가/nn;안정/nn;대책/nn;금리/nn;인상/nn,"(물가/nn, 안정/nn, 대책/nn, 금리/nn, 인상/nn)",5,1,0.180549,1.548422,17.312386,11.180667
16761,경제/nn;성장/nn;둔화/nn;우려/nn;높/va,"(경제/nn, 성장/nn, 둔화/nn, 우려/nn, 높/va)",5,-1,-0.064909,1.241776,14.222117,17.660677
...,...,...,...,...,...,...,...,...
18199,부실/nn,"(부실/nn,)",1,-1,-0.099467,1.325330,9.532942,12.634295
18336,마이너스/nn,"(마이너스/nn,)",1,-1,-0.103087,1.334443,10.736146,14.326770
19352,지탱/nn,"(지탱/nn,)",1,-1,-0.137586,1.425028,13.638922,19.435847
19420,우려/nn,"(우려/nn,)",1,-1,-0.140655,1.433427,13.797743,19.778053


In [6]:
# Compose the configuration for the `kosac_polarity` lexicon.
config_group='model/sentiment/lexicon=kosac_polarity'
cfg = eKonf.compose(config_group=config_group)
# Override `ignore_pos`; presumably this lets the untagged tokens below match
# lexicon entries regardless of POS tag (the recorded output does contain matches).
cfg.ignore_pos = True
# NOTE(review): key spelled `ngram_distiance_tolerance` — confirm against the
# library's config schema.
cfg.analyze.ngram_distiance_tolerance = 1
# Rebinds `lexicon`: from here on it refers to the kosac lexicon, not mpko_lex.
lexicon = eKonf.instantiate(cfg)
# Same token list as in the earlier mpko_lex cell, repeated here verbatim.
tokens = ['투기', '억제', '금리', '인상', '인상', '투기;억제', '금리;인상', '견조', '호황', '힘들']

# Last expression of the cell, so the result is shown as the cell output.
lexicon.analyze(tokens, features=['max.value', 'max.prop'])

Loading data from /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/kosac/polarity.parquet


{'힘들': {'max.value': 'NEG', 'max.prop': 0.75, 'count': 1},
 '인상': {'max.value': 'POS', 'max.prop': 0.666666667, 'count': 2}}

In [7]:
# NOTE(review): repeats the import from the first cell; harmless, but imports
# are easier to audit when kept in a single cell at the top of the notebook.
from ekorpkit import eKonf

# Compose the configuration for the `lm` lexicon.
config_group='model/sentiment/lexicon=lm'
cfg = eKonf.compose(config_group=config_group)
# eKonf.pprint(cfg)
# Rebinds `lexicon` again: it now refers to the LM lexicon.
lexicon = eKonf.instantiate(cfg)
# print(lexicon)

Loading data from /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet


In [8]:
# Mixed-case English tokens, with "Good" repeated three times. In the recorded
# output the keys are lower-cased and `count` for 'good' is 3.
tokens = ["Bad", "Fraud", "Good","Good","Good", "Sound", "uncertain", "beat", "wrong"]
# NOTE(review): matched features come back as 2009 rather than 0/1 — presumably
# a year code carried over from the source dictionary, not a score; confirm
# before using these values numerically.
lexicon.analyze(tokens, features=['Positive', 'Negative', 'Uncertainty'])

{'bad': {'Positive': 0, 'Negative': 2009, 'Uncertainty': 0, 'count': 1},
 'beat': {'Positive': 0, 'Negative': 0, 'Uncertainty': 0, 'count': 1},
 'fraud': {'Positive': 0, 'Negative': 2009, 'Uncertainty': 0, 'count': 1},
 'good': {'Positive': 2009, 'Negative': 0, 'Uncertainty': 0, 'count': 3},
 'sound': {'Positive': 0, 'Negative': 0, 'Uncertainty': 0, 'count': 1},
 'uncertain': {'Positive': 0, 'Negative': 0, 'Uncertainty': 2009, 'count': 1},
 'wrong': {'Positive': 0, 'Negative': 2009, 'Uncertainty': 0, 'count': 1}}

In [1]:
# The execution count restarts at 1 here, so this cell appears to have been run
# on a fresh kernel; the import is therefore needed for the cell to run alone.
from ekorpkit import eKonf

# Compose the full `lm` sentiment model config (not just the lexicon).
config_group='model/sentiment=lm'
cfg = eKonf.compose(config_group=config_group)
# Turn on lemmatization in the NLTK tokenizer section of the config.
cfg.preprocessor.tokenizer.nltk.lemmatize = True
# eKonf.pprint(cfg)
lmsa = eKonf.instantiate(cfg)

# Pre-tokenized input: `predict` is called with a list of tokens here, while
# the following cell calls it with a raw string.
tokens = ["Fraud", "Good","Good","Good", "Sound", "uncertain", "beat", "wrong"]

# Last expression of the cell; the recorded output is a dict with `polarity`,
# `subjectivity` and `label` keys.
lmsa.predict(tokens)

Loading data from /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet


{'polarity': 0.199999960000008,
 'subjectivity': 0.6249999218750099,
 'label': 'neutral'}

In [2]:
# NOTE(review): stdlib import placed mid-notebook; consider moving it to the
# import cell at the top.
from pprint import pprint
# First example: a raw sentence, tokenized with the model's own tokenizer.
text = "Beyond the improved voice capabilities, customers now have a streamlined way to comply with recalls and other traceability requirements, providing them with a competitive advantage."
tokens = lmsa.tokenize(text)
print(tokens)
# Per-token lexicon features for the tokenized sentence.
pprint(lmsa.analyze(tokens, features=['Negative', 'Positive']))
# Prediction for the raw sentence string.
print(lmsa.predict(text))
# Second example: `text` is rebound to a different sentence before predicting again.
text = "Operating loss amounted to EUR 0.7 mn compared to a profit of EUR 0.8 mn in the second quarter of 2005."
print(lmsa.predict(text))


['Beyond', 'the', 'improved', 'voice', 'capability', ',', 'customer', 'now', 'have', 'a', 'streamlined', 'way', 'to', 'comply', 'with', 'recall', 'and', 'other', 'traceability', 'requirement', ',', 'provide', 'them', 'with', 'a', 'competitive', 'advantage', '.']
{'advantage': {'Negative': 0, 'Positive': 2009, 'count': 1},
 'and': {'Negative': 0, 'Positive': 0, 'count': 1},
 'beyond': {'Negative': 0, 'Positive': 0, 'count': 1},
 'capability': {'Negative': 0, 'Positive': 0, 'count': 1},
 'competitive': {'Negative': 0, 'Positive': 0, 'count': 1},
 'comply': {'Negative': 0, 'Positive': 0, 'count': 1},
 'customer': {'Negative': 0, 'Positive': 0, 'count': 1},
 'have': {'Negative': 0, 'Positive': 0, 'count': 1},
 'improved': {'Negative': 0, 'Positive': 2009, 'count': 1},
 'now': {'Negative': 0, 'Positive': 0, 'count': 1},
 'other': {'Negative': 0, 'Positive': 0, 'count': 1},
 'provide': {'Negative': 0, 'Positive': 0, 'count': 1},
 'recall': {'Negative': 2009, 'Positive': 0, 'count': 1},
 'req

In [16]:
# Compose and instantiate the `hiv4` lexicon; `lexicon` is rebound once more.
config_group='model/sentiment/lexicon=hiv4'
cfg = eKonf.compose(config_group=config_group)
lexicon = eKonf.instantiate(cfg)

tokens = ["Bad", "Fraud", "Good", "Sound", "uncertain", "beat", "wrong", 'ability']
# NOTE(review): the recorded output contains only 4 of these 8 tokens
# ('ability', 'bad', 'fraud', 'uncertain'); 'good', 'sound', 'beat' and 'wrong'
# are absent. Confirm whether that is expected for this lexicon.
features = lexicon.analyze(tokens, features=['Positiv', 'Negativ'])
# Bare name as the last expression so the dict is shown as the cell output.
features

Loading data from /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/HIV-4.parquet


{'ability': {'Positiv': 'Positiv', 'Negativ': None, 'count': 1},
 'bad': {'Positiv': None, 'Negativ': 'Negativ', 'count': 1},
 'fraud': {'Positiv': None, 'Negativ': 'Negativ', 'count': 1},
 'uncertain': {'Positiv': None, 'Negativ': None, 'count': 1}}