In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

#### Word Synset 및 Sentiwordnet SentiSysnet 클래스

In [4]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
from nltk.corpus import wordnet

In [12]:
term = 'fly'
synsets = wordnet.synsets(term)

In [13]:
type(synsets), len(synsets)

(list, 20)

In [14]:
for synset in synsets:
  print(f'##### name: {synset.name()} #####')
  print('POS: ', synset.lexname())
  print('정의: ', synset.definition())
  print('표제어: ', synset.lemma_names())

##### name: fly.n.01 #####
POS:  noun.animal
정의:  two-winged insects characterized by active flight
표제어:  ['fly']
##### name: tent-fly.n.01 #####
POS:  noun.artifact
정의:  flap consisting of a piece of canvas that can be drawn back to provide entrance to a tent
표제어:  ['tent-fly', 'rainfly', 'fly_sheet', 'fly', 'tent_flap']
##### name: fly.n.03 #####
POS:  noun.artifact
정의:  an opening in a garment that is closed by a zipper or by buttons concealed under a fold of cloth
표제어:  ['fly', 'fly_front']
##### name: fly.n.04 #####
POS:  noun.act
정의:  (baseball) a hit that flies up in the air
표제어:  ['fly', 'fly_ball']
##### name: fly.n.05 #####
POS:  noun.artifact
정의:  fisherman's lure consisting of a fishhook decorated to look like an insect
표제어:  ['fly']
##### name: fly.v.01 #####
POS:  verb.motion
정의:  travel through the air; be airborne
표제어:  ['fly', 'wing']
##### name: fly.v.02 #####
POS:  verb.motion
정의:  move quickly or suddenly
표제어:  ['fly']
##### name: fly.v.03 #####
POS:  verb.motion
정의

- 어휘 간의 유사도

In [16]:
# 단어, 품사를 모를 경우 : synsets()로 확인
for synset in wordnet.synsets('tiger'):
  print(synset.name(), synset.definition())

tiger.n.01 a fierce or audacious person
tiger.n.02 large feline of forests in most of Asia having a tawny coat with black stripes; endangered


In [19]:
# 단어, 품사를 아는 경우 : synset()로 확인

tiger = wordnet.synset('tiger.n.02')
tree = wordnet.synset('tree.n.01')
lion = wordnet.synset('lion.n.01')
cat = wordnet.synset('cat.n.01')
dog = wordnet.synset('dog.n.01')

In [20]:
# 단어간의 유사도
tiger.path_similarity(lion), tiger.path_similarity(dog), tiger.path_similarity(tree)

(0.3333333333333333, 0.16666666666666666, 0.07142857142857142)

In [21]:
# 5개 단어간의 유사도
similarities=[]
entities = [tree, lion, tiger, cat, dog]
for entity in entities:
  similarity = [entity.path_similarity(another) for another in entities]
  similarities.append(similarity)

In [23]:
df = pd.DataFrame(similarities, columns=["tree", "lion", "tiger", "cat", "dog"], index=["tree", "lion", "tiger", "cat", "dog"])
df

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.071429,0.071429,0.076923,0.125
lion,0.071429,1.0,0.333333,0.25,0.166667
tiger,0.071429,0.333333,1.0,0.25,0.166667
cat,0.076923,0.25,0.25,1.0,0.2
dog,0.125,0.166667,0.166667,0.2,1.0


- SentiSynset 클래스

In [24]:
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [26]:
from nltk.corpus import sentiwordnet
senti_synsets = list(sentiwordnet.senti_synsets('slow'))
senti_synsets

[SentiSynset('decelerate.v.01'),
 SentiSynset('slow.v.02'),
 SentiSynset('slow.v.03'),
 SentiSynset('slow.a.01'),
 SentiSynset('slow.a.02'),
 SentiSynset('dense.s.04'),
 SentiSynset('slow.a.04'),
 SentiSynset('boring.s.01'),
 SentiSynset('dull.s.08'),
 SentiSynset('slowly.r.01'),
 SentiSynset('behind.r.03')]

In [30]:
# father 단어의 긍정/부정/객관성 지수
father = sentiwordnet.senti_synset('father.n.01')
father.pos_score(), father.neg_score(), father.obj_score()

(0.0, 0.0, 1.0)

In [31]:
mother = sentiwordnet.senti_synset('mother.n.01')
mother.pos_score(), mother.neg_score(), mother.obj_score()

(0.0, 0.0, 1.0)

In [35]:
love = sentiwordnet.senti_synset('love.v.01')
love.pos_score(), love.neg_score(), love.obj_score()

(0.5, 0.0, 0.5)

In [34]:
fabulous = sentiwordnet.senti_synset('fabulous.a.01')
fabulous.pos_score(), fabulous.neg_score(), fabulous.obj_score()

(0.875, 0.125, 0.0)

In [36]:
wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB

('n', 'a', 'r', 'v')

- 감성지수 계산

In [37]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [38]:
from nltk import word_tokenize, pos_tag
sentence = "It's good to see you again."
word_list = word_tokenize(sentence)
word_list

['It', "'s", 'good', 'to', 'see', 'you', 'again', '.']

In [39]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [40]:
pos_tag(word_list)

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('good', 'JJ'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('you', 'PRP'),
 ('again', 'RB'),
 ('.', '.')]

In [41]:
def penn_to_wordnet(tag):
  if tag.startswith('J'):
    return wordnet.ADJ
  if tag.startswith('N'):
    return wordnet.NOUN
  if tag.startswith('R'):
    return wordnet.ADV
  if tag.startswith('V'):
    return wordnet.VERB  

In [42]:
for word, pos in pos_tag(word_list):
  print(word, penn_to_wordnet(pos))

It None
's v
good a
to None
see v
you None
again r
. None


- Sentence로부터 감성 지수를 계산하는 과정

In [43]:
word_list = [word for word in word_tokenize(sentence) if len(word)> 2]
word_list

['good', 'see', 'you', 'again']

In [44]:
for word, pos in pos_tag(word_list):
  wn_tag=penn_to_wordnet(pos)
  if wn_tag:                          # None이 아닌 n, a, r, v
    synsets = list(sentiwordnet.senti_synsets(word, wn_tag)) #단어, 품사를 synsets에 저장
    synset = synsets[0]
    print(synset)

<good.a.01: PosScore=0.75 NegScore=0.0>
<see.n.01: PosScore=0.0 NegScore=0.0>
<again.r.01: PosScore=0.0 NegScore=0.0>


In [46]:
sentiment = 0
for word, pos in pos_tag(word_list):
  wn_tag=penn_to_wordnet(pos)
  if wn_tag:                          # None이 아닌 n, a, r, v
    synsets = list(sentiwordnet.senti_synsets(word, wn_tag)) #단어, 품사를 synsets에 저장
    synset = synsets[0]
    sentiment += synset.pos_score() - synset.neg_score()

sentiment #문장 전체가 긍정인지, 부정인지 계산

0.75

In [47]:
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [48]:
# 표제어 추출까지 고려
sentiment = 0
for word, pos in pos_tag(word_list):
  wn_tag=penn_to_wordnet(pos)
  if wn_tag:
    lemma = lemmatizer.lemmatize(word, wn_tag)                          
    synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
    synset = synsets[0]
    sentiment += synset.pos_score() - synset.neg_score()

sentiment #문장 전체가 긍정인지, 부정인지 계산

0.75

- Document에서 감성지수를 계산하는 과정 및 함수

In [51]:
from nltk import sent_tokenize
document = '''
This is a movie made purely to satisfy the fans and there should be no doubt about that. No Way Home, in my opinion, is even better than Homecoming and Far From Home, and pretty much one of the best MCU movies of all time.
It's a simple story, but the execution is fantastic."
Even the smallest of surprises have a huge impact, and I could feel that in the theatre as I joined several other Spider-Man fans cheer out for both heroes and villains.
The action sequences were brilliant; seeing them in 3D is totally worth the price of admission.
Every actor delivered a believable, realistic performance, and especially our lead actor Tom Holland.
The visual effects too were top notch and the editing was stupendous.
Two and a half hours flew by real quick while watching this popcorn action entertainer.
It won't be fair to reveal anything, so here I conclude my review, and recommend you to check out this new world of Spidey-ness on the big screen and in 3D.
And once you've seen it, please don't spoil it for others, just like you won't want it spoiled for yourself.
'''

In [53]:
sentiment = 0.0
for sentence in sent_tokenize(document):
  word_list = [word for word in word_tokenize(sentence) if len(word)> 2]
  for word, pos in pos_tag(word_list):
    wn_tag=penn_to_wordnet(pos)
    if wn_tag:
      lemma = lemmatizer.lemmatize(word, wn_tag)                          
      synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
      if not synsets:
        print(word)
        continue
      synset = synsets[0]
      sentiment += synset.pos_score() - synset.neg_score()

print('긍정' if sentiment >= 0 else '부정')

Homecoming
From
MCU
Spider-Man
lead
popcorn
n't
anything
Spidey-ness
've
n't
others
n't
긍정


In [60]:
def swn_polarity(text):
  lemmatizer=WordNetLemmatizer()
  sentiment = 0.0
  for sentence in sent_tokenize(text):
    word_list = [word for word in word_tokenize(sentence) if len(word)> 2]
    for word, pos in pos_tag(word_list):
      wn_tag=penn_to_wordnet(pos)
      if wn_tag:
        lemma = lemmatizer.lemmatize(word, wn_tag)                          
        synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
        if not synsets:
          #print(word)
          continue
        synset = synsets[0]
        sentiment += synset.pos_score() - synset.neg_score()

  return 1 if sentiment >= 0 else 0

- IMDB 영화평 감성분석

In [61]:
df = pd.read_csv('labeledTrainData.tsv', sep='\t', quoting=3)   # 3: QUOTE_NONE
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [62]:
# <br /> 태그는 공백으로 변환
df.review = df.review.str.replace('<br />',' ')

In [63]:
# 구둣점, 숫자 제거 - 영문자가 아닌 글자는 공백으로 변환
df.review = df.review.str.replace('[^A-Za-z]', ' ').str.strip()

In [64]:
df = df.head(10000)
%time df['pred']=df.review.apply(lambda x: swn_polarity(x))

CPU times: user 3min 14s, sys: 1.18 s, total: 3min 15s
Wall time: 3min 17s


In [65]:
# 정확도 계산
from sklearn.metrics import accuracy_score
accuracy_score(df.sentiment, df.pred)

0.6309

- VADER Lexicon을 이용한 감성 분석

In [67]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [68]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

senti_analyzer=SentimentIntensityAnalyzer()
senti_analyzer.polarity_scores(df.review[0])

{'compound': -0.7943, 'neg': 0.13, 'neu': 0.743, 'pos': 0.127}

In [69]:
def vader_polarity(document, threshold=0.1):
  score = senti_analyzer.polarity_scores(document)
  return 1 if score['compound'] >= threshold else 0

In [70]:
%time df['vader'] = df.review.apply(lambda x:vader_polarity(x, 0.1))

CPU times: user 30.7 s, sys: 291 ms, total: 31 s
Wall time: 31.1 s


In [71]:
accuracy_score(df.sentiment, df.vader)

0.6997