In [9]:
!pip install nltk



In [14]:
import os
import nltk
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize

# 1. nltk_data 폴더를 현재 작업 폴더에 생성
os.makedirs('nltk_data/tokenizers', exist_ok=True)

# 2. punkt 다운로드 (현재 폴더 안)
nltk.download('punkt', download_dir='nltk_data')

# 3. PunktSentenceTokenizer를 직접 로드
punkt_path = os.path.join(os.getcwd(), 'nltk_data', 'tokenizers', 'punkt', 'english.pickle')
tokenizer = PunktSentenceTokenizer(punkt_path)

# 4. 예제 텍스트
text_sample = '''The Matrix is everywhere its all around us, here even in this room.
You can see it out your window or on your television.
You feel it when you go to work, or go to church or pay your taxes.'''

# 5. 문장 단위 토큰화
sentences = tokenizer.tokenize(text_sample)

# 6. 결과 확인
print(type(sentences), len(sentences))
print(sentences)


<class 'list'> 3
['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


[nltk_data] Downloading package punkt to nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
import os
import nltk
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

# 1️⃣ nltk_data/tokenizers/punkt 폴더 만들기
os.makedirs('nltk_data/tokenizers', exist_ok=True)

# 2️⃣ punkt 다운로드 (현재 폴더 안)
nltk.download('punkt', download_dir='nltk_data')

# 3️⃣ PunktSentenceTokenizer 직접 로드
punkt_path = os.path.join(os.getcwd(), 'nltk_data', 'tokenizers', 'punkt', 'english.pickle')
sent_tokenizer = PunktSentenceTokenizer(punkt_path)

# 4️⃣ TreebankWordTokenizer 인스턴스 생성
word_tokenizer = TreebankWordTokenizer()

# 5️⃣ 예제 텍스트
text_sample = '''The Matrix is everywhere its all around us, here even in this room.
You can see it out your window or on your television.
You feel it when you go to work, or go to church or pay your taxes.'''

# 6️⃣ 문장 단위 토큰화
sentences = sent_tokenizer.tokenize(text_sample)
print("문장 단위 토큰화:")
print(type(sentences), len(sentences))
print(sentences)
print("-" * 50)

# 7️⃣ 단어 단위 토큰화 (문장별)
words_per_sentence = [word_tokenizer.tokenize(sent) for sent in sentences]
print("단어 단위 토큰화 (문장별):")
print(type(words_per_sentence), len(words_per_sentence))
print(words_per_sentence)
print("-" * 50)

# 8️⃣ 단어 단위 토큰화 (전체)
all_words = [word for sent in words_per_sentence for word in sent]
print("단어 단위 토큰화 (전체):")
print(type(all_words), len(all_words))
print(all_words)


문장 단위 토큰화:
<class 'list'> 3
['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']
--------------------------------------------------
단어 단위 토큰화 (문장별):
<class 'list'> 3
[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]
--------------------------------------------------
단어 단위 토큰화 (전체):
<class 'list'> 45
['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.', 'You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.', 'You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',

[nltk_data] Downloading package punkt to nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
import os
import nltk
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

# 1️⃣ nltk_data/tokenizers/punkt 폴더 만들기
os.makedirs('nltk_data/tokenizers', exist_ok=True)

# 2️⃣ punkt 다운로드 (현재 폴더 안)
nltk.download('punkt', download_dir='nltk_data')

# 3️⃣ tokenizer 인스턴스 생성
punkt_path = os.path.join(os.getcwd(), 'nltk_data', 'tokenizers', 'punkt', 'english.pickle')
sent_tokenizer = PunktSentenceTokenizer(punkt_path)
word_tokenizer = TreebankWordTokenizer()

# 4️⃣ 문장별 단어 토큰화 함수
def tokenize_text(text):
    # 문장별 분리
    sentences = sent_tokenizer.tokenize(text)
    # 문장별 단어 토큰화
    word_tokens = [word_tokenizer.tokenize(sentence) for sentence in sentences]
    return word_tokens

# 5️⃣ 예제 텍스트
text_sample = '''The Matrix is everywhere its all around us, here even in this room.
You can see it out your window or on your television.
You feel it when you go to work, or go to church or pay your taxes.'''

# 6️⃣ 함수 사용
word_tokens = tokenize_text(text_sample)

# 7️⃣ 결과 확인
print(type(word_tokens), len(word_tokens))
print(word_tokens)


<class 'list'> 3
[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


[nltk_data] Downloading package punkt to nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [26]:
print('영어 stop words 개수:', len(nltk.corpus.stopwords.words('english'))) 
print(nltk.corpus.stopwords.words('english')[ :20])

영어 stop words 개수: 198
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been']


In [28]:
import nltk
stopwords = nltk.corpus.stopwords.words('english') 
all_tokens = []
# 위 예제에서 3개의 문장별로 얻은 word_tokens list에 대해 스톱 워드를 제거하는 반복문 
for sentence in word_tokens:
    filtered_words=[]
    # 개별 문장별로 토큰화된 문장 list에 대해 스톱 워드를 제거하는 반복문
    for word in sentence:
        # 소문자로 모두 변환합니다.
        word = word.lower()
        # 토큰화된 개별 단어가 스톱 워드의 단어에 포함되지 않으면 word_tokens에 추가 
        if word not in stopwords:
            filtered_words.append(word)
    all_tokens.append(filtered_words)
print(all_tokens)

[['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.'], ['see', 'window', 'television', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]


In [31]:
from nltk.stem import LancasterStemmer 
stemmer = LancasterStemmer()

print(stemmer.stem('working'), stemmer.stem('works'), stemmer.stem('worked')) 
print(stemmer.stem('amusing'), stemmer.stem('amuses'), stemmer.stem('amused')) 
print(stemmer.stem('happier'), stemmer.stem('happiest')) 
print(stemmer.stem('fancier'), stemmer.stem('fanciest'))

work work work
amus amus amus
happy happiest
fant fanciest


In [34]:
from nltk.stem import WordNetLemmatizer 
import nltk 
nltk.download('wordnet')
lemma = WordNetLemmatizer()
print(lemma.lemmatize('amusing', 'v'), lemma.lemmatize('amuses', 'v'), lemma.lemmatize('amused', 'v')) 
print(lemma.lemmatize('happier', 'a'), lemma.lemmatize('happiest', 'a'))
print(lemma.lemmatize('fancier', 'a'), lemma.lemmatize('fanciest', 'a'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...


amuse amuse amuse
happy happy
fancy fancy


In [2]:
import numpy as np 
dense = np.array( [ [3, 0, 1 ], [0, 2, 0] ] ) 

In [3]:
from scipy import sparse
# 0이 아닌 데이터 추출 
data = np.array([3, 1, 2])
 # 행 위치와 열 위치를 각각 배열로 생성 
row_pos = np.array([0, 0, 1 ]) 
col_pos = np.array([0, 2, 1 ])
 # sparse 패키지의 coojnatrix를 이용해 COO 형식으로 희소 행렬 생성 
sparse_coo = sparse.coo_matrix((data, (row_pos, col_pos)))

In [4]:
sparse_coo.toarray()

array([[3, 0, 1],
       [0, 2, 0]])

In [11]:
from scipy import sparse
dense2 =np.array([[0, 0, 1, 0, 0, 5],
 [1, 4, 0, 3, 2, 5],
 [0, 6, 0, 3, 0, 0],
 [2, 0, 0, 0, 0, 0],
 [0, 0, 0, 7, 0, 8],
 [1, 0, 0, 0, 0, 0]])
# 0이 아닌 데이터 추출
data2 = np.array([1, 5, 1, 4, 3, 2, 5, 6, 3, 2, 7, 8, 1 ])
# 행 위치와 열 위치를 각각 array로 생성
row_pos = np.array([0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 5])
col_pos = np.array([2, 5, 0, 1, 3, 4, 5, 1, 3, 0, 3, 5, 0])
# COO 형식으로 변환
sparse_coo = sparse.coo_matrix((data2, (row_pos, col_pos)))
# 행 위치 배열의 고유한 값의 시작 위치 인덱스를 배열로 생성 
row_pos_ind = np.array([0, 2, 7, 9, 10, 12, 13])
# CSR 형식으로 변환
sparse_csr = sparse.csr_matrix((data2, col_pos, row_pos_ind))
print('COO 변환된 데이터가 제대로 되었는지 다시 Dense로 출력 확인') 
print(sparse_coo.toarray())
print('CSR 변환된 데이터가 제대로 되었는지 다시 Dense로 출력 확인') 
print(sparse_csr.toarray())

COO 변환된 데이터가 제대로 되었는지 다시 Dense로 출력 확인
[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]
CSR 변환된 데이터가 제대로 되었는지 다시 Dense로 출력 확인
[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]


In [13]:
dense3 = np.array([[0, 0, 1, 0, 0, 5],
 [1, 4, 0, 3, 2, 5],
 [0, 6, 0, 3, 0, 0],
 [2, 0, 0, 0, 0, 0],
 [0, 0, 0, 7, 0, 8], 
 [1, 0, 0, 0, 0, 0]])
coo = sparse.coo_matrix(dense3) 
csr = sparse.csr_matrix(dense3)

In [15]:
import pandas as pd
review_df = pd.read_csv('./labeledTrainData.tsv', header=0, sep="\t", quoting=3)
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [16]:
print(review_df['review'][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [17]:
import re
# <br> html 태그는 replace 함수로 공백으로 변환
review_df['review'] = review_df['review'].str.replace('<br />', '' )
# 파이썬의 정규 표현식 모듈인 re를 이용해 영어 문자열이 아닌 문자는 모두 공백으로 변환
review_df['review'] = review_df['review'].apply( lambda x : re.sub("[^a-zA—Z]", " ", x) )

In [18]:
from sklearn.model_selection import train_test_split
class_df = review_df['sentiment']
feature_df = review_df.drop(['id', 'sentiment'], axis=1, inplace=False)

In [20]:
X_train, X_test, y_train, y_test= train_test_split(feature_df, class_df, test_size=0.3, 
                                                   random_state=156)
X_train.shape, X_test.shape

((17500, 1), (7500, 1))

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
# 스톱 워드는 English, ngram은 (1, 2)로 설정해 CountVectorization 수행.
# LogisticRegression의 C는 10으로 설정.
pipeline = Pipeline([
    ('cnt_vect', CountVectorizer(stop_words='english', ngram_range=(1, 2) )),
    ('lr_clf', LogisticRegression(solver='liblinear', C=10))])
# Pipeline 객체를 이용해 fit(), predictO로 학습/예측 수행. predict_j)roba()는 roc_auc 때문에 수행. 
pipeline.fit(X_train['review'], y_train) 
pred = pipeline.predict(X_test['review']) 
pred_probs = pipeline.predict_proba(X_test['review'])[ :, 1 ]
print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy_score(y_test, pred), 
roc_auc_score(y_test, pred_probs)))

예측 정확도는 0.8821, ROC-AUC는 0.9483


In [23]:
# 스톱 워드는 english, filtering, ngram은 (1, 2)로 설정해 TF-IDF 벡터화 수행.
# LogisticRegression의 C는 10으로 설정.
pipeline = Pipeline([
 ('tfidf_vect',TfidfVectorizer(stop_words='english', ngram_range=(1, 2) )),
 ('lr_clf', LogisticRegression(solver='liblinear', C=10))])
pipeline.fit(X_train['review'], y_train) 
pred = pipeline.predict(X_test['review']) 
pred_probs = pipeline.predict_proba(X_test['review'])[:, 1 ]
print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy_score(y_test, pred), 
                                                 roc_auc_score(y_test, pred_probs)))

예측 정확도는 0.8901, ROC-AUC는 0.9578


In [24]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptr

True

In [25]:
from nltk.corpus import wordnet as wn
term = 'present'
# 'present'라는 단어로 wordnet의 synsets 생성. 
synsets = wn.synsets(term) 
print('synsets() 반환 type :', type(synsets)) 
print('synsets() 반환 값 개수:', len(synsets)) 
print('synsets() 반환 값 :', synsets)

synsets() 반환 type : <class 'list'>
synsets() 반환 값 개수: 18
synsets() 반환 값 : [Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]


In [28]:
for synset in synsets :
    print('##### Synset name :', synset.name(), '#####')
    print('POS :', synset.lexname())
    print('Definition:',synset.definition()) 
    print('Lemmas:', synset.lemma_names())

##### Synset name : present.n.01 #####
POS : noun.time
Definition: the period of time that is happening now; any continuous stretch of time including the moment of speech
Lemmas: ['present', 'nowadays']
##### Synset name : present.n.02 #####
POS : noun.possession
Definition: something presented as a gift
Lemmas: ['present']
##### Synset name : present.n.03 #####
POS : noun.communication
Definition: a verb tense that expresses actions or states at the time of speaking
Lemmas: ['present', 'present_tense']
##### Synset name : show.v.01 #####
POS : verb.perception
Definition: give an exhibition of to an interested audience
Lemmas: ['show', 'demo', 'exhibit', 'present', 'demonstrate']
##### Synset name : present.v.02 #####
POS : verb.communication
Definition: bring forward and present to the mind
Lemmas: ['present', 'represent', 'lay_out']
##### Synset name : stage.v.01 #####
POS : verb.creation
Definition: perform (a play), especially on a stage
Lemmas: ['stage', 'present', 'represent']
##

In [29]:
 # synset 객체를 단어별로 생성합니다. 
tree = wn.synset('tree.n.01') 
lion = wn.synset('lion.n.01') 
tiger = wn.synset('tiger.n.02') 
cat = wn.synset('cat.n.01') 
dog = wn.synset('dog.n.01')
entities = [tree, lion, tiger, cat, dog] 
similarities = []
entity_names = [entity.name().split('.')[0] for entity in entities]
# 단어별 synset을 반복하면서 다른 단어의 synset과 유사도를 측정합니다. 
for entity in entities:
    similarity = [round(entity.path_similarity(compared_entity), 2)
            for compared_entity in entities]
    similarities.append(similarity)
# 개별 단어별 synset과 다른 단어의 synset과의 유사도를 DataFrame 형태로 저장합니다. 
similarity_df = pd.DataFrame(similarities, columns=entity_names, index=entity_names) 
similarity_df

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.07,0.07,0.08,0.12
lion,0.07,1.0,0.33,0.25,0.17
tiger,0.07,0.33,1.0,0.25,0.17
cat,0.08,0.25,0.25,1.0,0.2
dog,0.12,0.17,0.17,0.2,1.0


In [30]:
import nltk
from nltk.corpus import sentiwordnet as swn
senti_synsets = list(swn.senti_synsets('slow'))
print('senti_synsets() 반환 type :', type(senti_synsets))
print('senti_synsets() 반환 값 개수:', len(senti_synsets))
print('senti_synsets() 반환 값 :', senti_synsets)

senti_synsets() 반환 type : <class 'list'>
senti_synsets() 반환 값 개수: 11
senti_synsets() 반환 값 : [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.05'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]


In [32]:
import nltk
from nltk.corpus import sentiwordnet as swn
father = swn.senti_synset('father.n.01') 
print('father 긍정감성 지수:', father.pos_score())
print ('fat her 부정감성 지수:', father. neg_score ())
print('father 객관성 지수:', father.obj_score())
print('\n')
fabulous = swn.senti_synset('fabulous.a.01')
print('fabulous 긍정감성 지수:', fabulous.pos_score()) 
print( 'fabulous 부정감성 지수:', fabulous.neg_score())

father 긍정감성 지수: 0.0
fat her 부정감성 지수: 0.0
father 객관성 지수: 1.0


fabulous 긍정감성 지수: 0.875
fabulous 부정감성 지수: 0.125


In [33]:
from nltk.corpus import wordnet as wn
# 간단한 NTLK PennTreebank Tag를 기반으로 WordNet 기반의 품사 Tag로 변환
def penn_to_wn(tag):
    if tag.startswith('3'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'): 
        return wn.VERB

In [40]:
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import sentiwordnet as swn 
from nltk.corpus import wordnet as wn
from nltk import sent_tokenize, word_tokenize, pos_tag

def penn_to_wn(tag):
    if tag.startswith('N'):
        return wn.NOUN
    if tag.startswith('J'):
        return wn.ADJ
    if tag.startswith('R'):
        return wn.ADV
    return None

def swn_polarity(text):
    sentiment = 0.0
    tokens_count = 0
    lemmatizer = WordNetLemmatizer()

    for raw_sentence in sent_tokenize(text):
        tagged_sentence = pos_tag(word_tokenize(raw_sentence))

        for word, tag in tagged_sentence:
            wn_tag = penn_to_wn(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue

            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue

            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue

            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())

            sentiment += (swn_synset.pos_score() - swn_synset.neg_score())
            tokens_count += 1

    if tokens_count == 0:
        return 0

    return 1 if sentiment >= 0 else 0


In [37]:
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import sentiwordnet as swn 
from nltk import sent_tokenize, word_tokenize, pos_tag

def swn_polarity(text):
    # 감성 지수 초기화 
    sentiment = 0.0 
    tokens_count = 0
    lemmatizer = WordNetLemmatizer()
    raw_sentences = sent_tokenize(text)
    # 분해된 문장별로 단어 토큰 -> 품사 태깅 후에 SentiSynset 생성 -> 감성 지수 합산 
    for raw_sentence in raw_sentences:
    # NTLK 기반의 품사 태깅 문장 추출
        tagged_sentence = pos_tag(word_tokenize(raw_sentence))
    for word, tag in tagged_sentence:
        # WordNet 기반 품사 태깅과 어근 추출 
        wn_tag = penn_to_wn(tag)
        if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
            continue
        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        if not lemma:
            continue

            #  어근을 추출한 단어와 WordNet 기반 품사 태깅을 입력해 Synset 객체를 생성. 
            synsets = wn.synsets(lemmam, pos=wn_tag) 
            if  not synsets:
                continue
            # sentiwordnet의 감성 단어 분석으로 감성 synset 추출
            # 모든 단어에 대해 긍정 감성 지수는 +로 부정 감성 지수는 -로 합산해 감성 지수 계산.
            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())
            sentiment += (swn_synset.pos_score() - swn_synset.neg_score()) 
            tokens_count = 1
    if not tokens_count: 
        return 0
    # 총 score가 0 이상일 경우 긍정(Positive) 1, 그렇지 않을 경우 부정(Negative) 0 반환 
    if sentiment >= 0 : 
        return 1
    return 0

In [42]:
review_df['preds'] = review_df['review'].apply( lambda x : swn_polarity(x) ) 
y_target = review_df['sentiment'].values 
preds = review_df['preds'].values

In [45]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score 
import numpy as np
print(confusion_matrix(y_target, preds))
print("정확도:", np.round(accuracy_score(y_target, preds), 4))
print("정밀도:", np.round(precision_score(y_target, preds), 4))
print("재현율:", np.round(recall_score(y_target, preds), 4))


[[7357 5143]
 [3368 9132]]
정확도: 0.6596
정밀도: 0.6397
재현율: 0.7306


In [46]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
senti_analyzer = SentimentIntensityAnalyzer()
senti_scores = senti_analyzer.polarity_scores(review_df['review'][0]) 
print(senti_scores)

{'neg': 0.113, 'neu': 0.758, 'pos': 0.128, 'compound': 0.3042}


In [47]:
def vader_polarity(review, threshold=0.1):
    analyzer = SentimentIntensityAnalyzer() 
    scores = analyzer.polarity_scores(review)
    # compound 값에 기반해 threshold 입력값보다 크면 1, 그렇지 않으면 0을 반환 
    agg_score = scores['compound']
    final_sentiment = 1 if agg_score >= threshold else 0
    return final_sentiment

# apply lambda 식을 이용해 레코드별로 vader_polarity( )를 수행하고 결과를 'vader_preds'에 저장 
review_df['vader_preds'] = review_df['review'].apply( lambda x : vader_polarity(x, 0.1) ) 
y_target = review_df['sentiment'].values
vader_preds = review_df['vader_preds'].values

print(confusion_matrix(y_target, vader_preds))
print("정확도:", np.round(accuracy_score(y_target, vader_preds),4))

[[ 6704  5796]
 [ 1882 10618]]
정확도: 0.6929
