# 비지도 학습 감성분석 - Lexicon 기반

In [1]:
import numpy as np
import pandas as pd
from google.colab import files
up = files.upload()

Saving labeledTrainData.tsv to labeledTrainData.tsv


In [3]:
import warnings
warnings.filterwarnings("ignore")

### Wordnet Synset 및 Sentiwordnet SentiSynset 클래스

In [41]:
import nltk
nltk.download("wordnet")
nltk.download('sentiwordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [7]:
from nltk.corpus import wordnet

In [26]:
term = "elite"
synsets = wordnet.synsets(term)

In [27]:
for synset in synsets:
  print(f"##### name: {synset.name()}")
  print(f"Pos: {synset.lexname()}")
  print(f"정의: {synset.definition()}")
  print(f"표제어: {synset.lemma_names()}")

##### name: elite.n.01
Pos: noun.group
정의: a group or class of persons enjoying superior intellectual or social or economic status
표제어: ['elite', 'elite_group']
##### name: elect.s.01
Pos: adj.all
정의: selected as the best
표제어: ['elect', 'elite']


- 어휘간의 유사도

In [30]:
# 단어, 품사를 모를 겨웅에는 synsets()으로 알아냄
for synset in wordnet.synsets("lion"):
  print(synset.name(), synset.definition())

lion.n.01 large gregarious predatory feline of Africa and India having a tawny coat with a shaggy mane in the male
lion.n.02 a celebrity who is lionized (much sought after)
leo.n.01 (astrology) a person who is born while the sun is in Leo
leo.n.03 the fifth sign of the zodiac; the sun is in this sign from about July 23 to August 22


In [35]:
# 단어, 품사를 아는 경우에는 synset()
tiger = wordnet.synset("tiger.n.02")
tree = wordnet.synset("tree.n.01")
cat = wordnet.synset("cat.n.01")
lion = wordnet.synset("lion.n.01")
dog = wordnet.synset("dog.n.01")

nltk.corpus.reader.wordnet.Synset

In [34]:
# 단어간의 유사도
tiger.path_similarity(lion),tiger.path_similarity(dog),tiger.path_similarity(tree),tiger.path_similarity(cat)

(0.3333333333333333, 0.16666666666666666, 0.07142857142857142, 0.25)

In [37]:
# 5개 단어간의 유사도
similarities = []
entities = [tree, lion, tiger, cat, dog]
for entity in entities:
  similarity = [entity.path_similarity(another) for another in entities] 
  similarities.append(similarity)
similarities

[[1.0, 0.07142857142857142, 0.07142857142857142, 0.07692307692307693, 0.125],
 [0.07142857142857142, 1.0, 0.3333333333333333, 0.25, 0.16666666666666666],
 [0.07142857142857142, 0.3333333333333333, 1.0, 0.25, 0.16666666666666666],
 [0.07692307692307693, 0.25, 0.25, 1.0, 0.2],
 [0.125, 0.16666666666666666, 0.16666666666666666, 0.2, 1.0]]

In [39]:
df = pd.DataFrame(similarities, columns = ['tree', 'lion', 'tiger', 'cat', 'dog'],
             index = ['tree', 'lion', 'tiger', 'cat', 'dog'])
df

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.071429,0.071429,0.076923,0.125
lion,0.071429,1.0,0.333333,0.25,0.166667
tiger,0.071429,0.333333,1.0,0.25,0.166667
cat,0.076923,0.25,0.25,1.0,0.2
dog,0.125,0.166667,0.166667,0.2,1.0


- SentiSynset 클래스


In [42]:
from nltk.corpus import sentiwordnet
senti_synsets = list(sentiwordnet.senti_synsets("slow"))

In [43]:
senti_synsets

[SentiSynset('decelerate.v.01'),
 SentiSynset('slow.v.02'),
 SentiSynset('slow.v.03'),
 SentiSynset('slow.a.01'),
 SentiSynset('slow.a.02'),
 SentiSynset('dense.s.04'),
 SentiSynset('slow.a.04'),
 SentiSynset('boring.s.01'),
 SentiSynset('dull.s.08'),
 SentiSynset('slowly.r.01'),
 SentiSynset('behind.r.03')]

In [46]:
# father 단어의 긍정/부정/객관성 지수
father = sentiwordnet.senti_synset("father.n.01")
father.pos_score() , father.neg_score(), father.obj_score()

(0.0, 0.0, 1.0)

In [48]:
# mother 단어의 긍정/부정/객관성 지수
mother = sentiwordnet.senti_synset("mother.n.01")
mother.pos_score() , mother.neg_score(), mother.obj_score()

(0.0, 0.0, 1.0)

('n', 'a', 'r', 'v')

- 감성지수

In [60]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [61]:
from nltk import word_tokenize, pos_tag
sentence = "It's good to see you again"
word_list = word_tokenize(sentence)
word_list

['It', "'s", 'good', 'to', 'see', 'you', 'again']

In [63]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [69]:
pos_tag(word_list)

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('good', 'JJ'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('you', 'PRP'),
 ('again', 'RB')]

In [72]:
def penn_to_wordnet(pos):
  if pos.startswith("J"):
    return wordnet.ADJ
  if pos.startswith("V"):
    return wordnet.VERB
  if pos.startswith("R"):
    return wordnet.ADV
  if pos.startswith("N"):
    return wordnet.NOUN
       

In [73]:
for word, pos in pos_tag(word_list):
  print(word, penn_to_wordnet(pos))

It None
's v
good a
to None
see v
you None
again r


- Sentence로부터 Senti_Synset 객체를 만드는 과정

In [74]:
sentence = "It's good to see you again"
word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
word_list

['good', 'see', 'you', 'again']

In [82]:
for word, pos in pos_tag(word_list):
  wn_tag = penn_to_wordnet(pos)
  if wn_tag: # None이 아닌 "n", "a", "r", "v"
    synsets = list(sentiwordnet.senti_synsets(word,wn_tag))
    synset = synsets[0]
    print(synset)

<good.a.01: PosScore=0.75 NegScore=0.0>
<see.n.01: PosScore=0.0 NegScore=0.0>
<again.r.01: PosScore=0.0 NegScore=0.0>


In [83]:
sentiment = 0
for word, pos in pos_tag(word_list):
  wn_tag = penn_to_wordnet(pos)
  if wn_tag: # None이 아닌 "n", "a", "r", "v"
    synsets = list(sentiwordnet.senti_synsets(word,wn_tag))
    synset = synsets[0]
    sentiment += synset.pos_score() - synset.neg_score()
sentiment

0.75

In [84]:
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [85]:
# 표제어 추출까지 고려
sentiment = 0
for word, pos in pos_tag(word_list):
  wn_tag = penn_to_wordnet(pos)
  if wn_tag: # None이 아닌 "n", "a", "r", "v"
    lemma = lemmatizer.lemmatize(word, wn_tag)
    synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
    synset = synsets[0]
    sentiment += synset.pos_score() - synset.neg_score()
sentiment

0.75

- 도큐먼트에서 감성지수를 계산사는 과정 및 함수

In [88]:
from nltk import sent_tokenize
document = """
          It is now past 1 PM and I just finished watching Francis Ford Coppola's "The Godfather".
          I should probably go to bed. It's late and tomorrow I have to wake up a bit early. But not early enough to postpone writing these lines.
          Now that I have seen it three times, the opportunity of sharing my thoughts and refreshed insights are too much of a good offer to sit on. So, bear with me.
          This film works so well because it takes place in an underworld in which we are so embedded that we do not even observe it. 
          Coppola puts us straight in the smack-dab center of what is, admittedly, a society made by criminals for criminals.
          It is also the reason why it's so welcoming. We are surrounded by its inhabitants--cold-blooded murderers, men who see crime like a 9 to 5 job masquerading as honorable men. 
          And I do mean men. From the outside, we would only witness the horrifying, disturbing manifestations of their well-thought out actions.
          But it goes even deeper than that. It all revolves around the Corleone family led by Don Vito Corleone (Marlon Brando).
          He is the most honest of these men, sitting right on the edge. But for people like him, who do not fully embrace this world, it's not easy. He avoids conflict until it is absolutely necessary.
          He is a man defined by moral principles. There is a scene at the beginning, in which, during his daughter's wedding day, one of his associates,
          Luca Brasi (Lenny Montana) practices his speech that he is going to give to the Don when he meets him. The scene with these two is funny and almost adorable.
          I could not help but sympathize both of them only to realize that I am feeling warmth for two mobsters. Not to even mention that Lenny Montana was an actual mob hit-man and that he was actually nervous as he said that line.
          The more I watched the more I realized just how incredibly complex and ruthless this society is and how it has the power to corrupt anyone to come in contact with it.
          The best example is Corleone's youngest son, Michael (Al Pacino).
          He returns home for his sister's wedding as a war hero dressed the part with his long-time girlfriend,
          Kay Adams (Diane Keaton). At first, he avoids this underworld, but necessity, first-hand exposure and just its sheer devilish appealing nature draws him in.
          As we get further in the film, the change is shocking and every outsider who ever got close to him is tainted in one way or another.
          If they survive it, they are drawn in as well as we are as viewers.

          Inside, Coppola exposes the family to us fully, with a bold personal approach and we witness every discussion, every methodically calculated choice.
          Crime is done simply because it is the nature of their business, and we are put on a chair alongside them, so we easily relate. For us, they are the good guys,
          the rival families are the bad guys. This is the greatest feat this film managed to pull off--set apart good guys and bad guys in a world filled with bad guys.

          This is a film of unmatched subtlety. No other movie sustains itself as good. No other film is done with such precision, attention and completeness.
          There are many layers which I probably missed and maybe will never notice.
          But I felt them. What director Francis Ford Coppola and his partner in crime (poor choice of words, sorry)
          Mario Puzo did is nothing short of a timeless piece of reference cinema whose influence is not based on reinventing the wheel, but rather perfecting it to the absolute maximum.
          Most masterpieces are remembered for their historical contributions. "Citizen Kane" brought the biggest step-up to the art form,
           the same things did "Gone With the Wind" or "2001: A Space Odyssey". "The Godfather" is one of the few films that will be remembered simply because they are that good and I cannot possibly imagine a greater achievement.
"""

In [92]:
sentiment = 0
for sentence in sent_tokenize(document):
  word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
  for word, pos in pos_tag(word_list):
    wn_tag = penn_to_wordnet(pos)
    if wn_tag: # None이 아닌 "n", "a", "r", "v"
      lemma = lemmatizer.lemmatize(word, wn_tag)
      synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
      if not synsets:
        continue
      synset = synsets[0]
      sentiment += synset.pos_score() - synset.neg_score()
sentiment
print("긍정" if sentiment >= 0 else "부정")

긍정


In [94]:
def swn_polarity(text):
  lemmatizer = WordNetLemmatizer()
  sentiment = 0
  for sentence in sent_tokenize(text):
    word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
    for word, pos in pos_tag(word_list):
      wn_tag = penn_to_wordnet(pos)
      if wn_tag: # None이 아닌 "n", "a", "r", "v"
        lemma = lemmatizer.lemmatize(word, wn_tag)
        synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
        if not synsets:
          continue
        synset = synsets[0]
        sentiment += synset.pos_score() - synset.neg_score()
  return 1 if sentiment > 0 else 0

In [98]:
swn_polarity(document)

1

- IMDB 영화평 감성분석

In [96]:
df = pd.read_csv("labeledTrainData.tsv", sep = "\t")
df.head(3)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...


In [97]:
# <br /> 태그는 공백으로 변환
df.review = df.review.str.replace("<br />", " ")# 구둣점, 숫자 제거 - 영문자가 아닌 글자는 공백으로 변환
# 데이터프레임의 str메소드는 정규표현식을 지원함
df.review = df.review.str.replace("[^A-Za-z]", " ").str.strip()
df.review[0][:1000]

'With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for  

In [99]:
df = df.sample(10000).reset_index(drop = True)
df.shape

(10000, 3)

In [101]:
%time df["pred"] = df.review.apply(lambda x : swn_polarity(x))

CPU times: user 3min 36s, sys: 1.56 s, total: 3min 38s
Wall time: 3min 41s


In [102]:
df.head()

Unnamed: 0,id,sentiment,review,pred
0,3660_1,0,I saw this little magnum opus for the first ti...,0
1,2533_7,1,Horror Gods Boris Karloff and Bela Lugosi shou...,1
2,3536_1,0,I don t think I ve ever felt this let down by ...,1
3,179_3,0,Return to Cabin by the Lake just was lacki...,1
4,10803_3,0,OK so after watching this invigorating movie a...,1


In [103]:
# 정확도 계산
from sklearn.metrics import accuracy_score
accuracy_score(df.sentiment, df.pred)

0.6239

VADERLexicon 을 이용한 감성 분석

In [105]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [106]:
from nltk.sentiment import SentimentIntensityAnalyzer
senti_analyzer = SentimentIntensityAnalyzer()
senti_analyzer.polarity_scores(df.review[0])

{'compound': 0.9029, 'neg': 0.091, 'neu': 0.806, 'pos': 0.103}

In [107]:
def vader_polarity(document, threshold = 0.1):
  score = senti_analyzer.polarity_scores(document)
  return 1 if score["compound"] > threshold else 0


In [108]:
%time df["vader"] = df.review.apply(lambda x : vader_polarity(x, 0.1))

CPU times: user 32.1 s, sys: 629 ms, total: 32.7 s
Wall time: 33 s


In [110]:
accuracy_score(df.sentiment, df["vader"])

0.6918