In [3]:
# 第8章: 機械学習

# 70. データの入手・整形
import random

with open('rt-polarity.neg', encoding='UTF-8', errors='ignore') as input_file:
    negatives = ['-1 ' + line for line in input_file]

with open('rt-polarity.pos', encoding='UTF-8', errors='ignore') as input_file:
    positives = ['+1 ' + line for line in input_file]

sentiments = negatives + positives
random.shuffle(sentiments)

with open('sentiment.txt', mode='w') as output_file:
    for sentiment in sentiments: 
        output_file.write(sentiment)

In [2]:
# 71. ストップワード

from nltk.corpus import stopwords

stop_words = stopwords.words('english')

def is_stopword(word):
    return word in stop_words

print(is_stopword('they'))
print(is_stopword('great'))

True
False


In [6]:
# 72. 素性抽出

from sklearn.feature_extraction.text import CountVectorizer
from stemming.porter2 import stem

corpus = [sentiment[3:].replace('\n', '') for sentiment in sentiments]
label = [int(sentiment[0:2]) for sentiment in sentiments]

def preprocess(doc):
    return (stem(word) for word in doc.split() if not is_stopword(word))

vectorizer = CountVectorizer(min_df=1, stop_words='english', analyzer= preprocess)

X = vectorizer.fit_transform(corpus)
print(X)

  (0, 14)	2
  (0, 43)	4
  (0, 7776)	1
  (0, 23)	3
  (0, 10299)	1
  (0, 12166)	1
  (0, 6219)	1
  (0, 10316)	1
  (0, 2119)	1
  (0, 13367)	1
  (0, 5964)	1
  (0, 10486)	1
  (0, 2570)	1
  (0, 13909)	1
  (0, 5452)	1
  (0, 2040)	1
  (0, 1667)	1
  (0, 13605)	1
  (0, 7504)	1
  (0, 6066)	1
  (1, 43)	1
  (1, 8407)	1
  (1, 1641)	1
  (1, 11470)	1
  (1, 2793)	1
  :	:
  (10660, 13739)	1
  (10660, 10024)	1
  (10660, 7477)	1
  (10660, 8514)	1
  (10660, 2018)	1
  (10660, 12648)	1
  (10660, 25)	1
  (10660, 14507)	1
  (10660, 2354)	1
  (10660, 6091)	1
  (10660, 1557)	1
  (10660, 8058)	1
  (10660, 1978)	1
  (10660, 2423)	1
  (10660, 6350)	1
  (10660, 2957)	1
  (10660, 3088)	1
  (10660, 10533)	1
  (10660, 13139)	1
  (10660, 3910)	1
  (10661, 43)	1
  (10661, 13376)	1
  (10661, 9707)	1
  (10661, 4302)	1
  (10661, 3601)	1


In [10]:
# 73. 学習

from sklearn.linear_model import LogisticRegression

y = label

logreg = LogisticRegression(C=1e5)
print(logreg.fit(X,y))

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)


In [12]:
# 74. 予測

# 負例
print(
    logreg.predict(
        vectorizer.transform(
            ["' . . . mafia , rap stars and hood rats butt their ugly heads in a regurgitation of cinematic violence that gives brutal birth to an unlikely , but likable , hero . '"]
        )
    )
)

# 正例
print(
    logreg.predict(
        vectorizer.transform(
            ['the metaphors are provocative , but too often , the viewer is left puzzled by the mechanics of the delivery . ']
        )
    )
)

[1]
[-1]


In [35]:
# 75. 素性の重み

import numpy as np

sorted_indices = np.array(logreg.coef_)[0].argsort()

# 最高のトップ10
print('highest 10 : ' + ', '.join([key for key, value in vectorizer.vocabulary_.items() if (value in sorted_indices[-10:]) ]))

# 最低のトップ10
print('lowest 10 : ' + ', '.join([key for key, value in vectorizer.vocabulary_.items() if (value in sorted_indices[:10]) ]))


highest 10 : teamwork, tenth, todd, boost, tape, award-worthi, cozi, liber, nightclub, [jackie]
lowest 10 : choppi, prettiest, makhmalbaf, har, flaccid, disclosur, metaphys, text, expend, limp


In [42]:
"{}%".format(logreg.score(vectorizer.transform(["I love everyone in terms of teamwork"]), [1]) * 100)

'100.0%'

In [49]:
print(len([elm for elm in y if elm == 1]))

print(len([elm for elm in y if elm == -1]))

print(len(y))

5331
5331
10662
