This notebook contains sample code with Japanese comments.

# 3.3 Titanicの先へ行く③！　テキストデータに触れてみよう

In [1]:
import pandas as pd

In [2]:
# Toy corpus: three short English sentences, one per row, held in a single
# 'text' column. The vectorizer and Word2vec cells below read from this frame.
df = pd.Series(
    [
        'I like kaggle very much',
        'I do not like kaggle',
        'I do really love machine learning',
    ],
    name='text',
).to_frame()
df

Unnamed: 0,text
0,I like kaggle very much
1,I do not like kaggle
2,I do really love machine learning


# Bag of Words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer


# The token pattern matches runs of one or more word characters, so one-letter
# words such as "I" are kept as tokens (see 'i' in the printed vocabulary).
vectorizer = CountVectorizer(
    token_pattern=u'(?u)\\b\\w+\\b',
)

# Learn the vocabulary and build the sentence-by-token count matrix in one call.
bag = vectorizer.fit_transform(raw_documents=df['text'])

# The result is sparse; convert to a dense array only for display.
bag.toarray()

array([[0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1],
       [1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0],
       [1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0]])

In [5]:
# Token -> column-index mapping learned by the vectorizer; the indices identify
# the columns of the count matrix.
vocabulary = vectorizer.vocabulary_
print(vocabulary)

{'i': 1, 'like': 4, 'kaggle': 2, 'very': 10, 'much': 7, 'do': 0, 'not': 8, 'really': 9, 'love': 5, 'machine': 6, 'learning': 3}


# TF-IDF

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


# Stage 1: raw term counts, using the same one-or-more-word-characters token
# pattern as the Bag of Words cell.
vectorizer = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
tf = vectorizer.fit_transform(df['text'])

# Stage 2: re-weight the counts into TF-IDF scores with the transformer's
# default settings.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(tf)

# Densify the sparse result for printing.
print(tfidf.toarray())

[[0.         0.31544415 0.40619178 0.         0.40619178 0.
  0.         0.53409337 0.         0.         0.53409337]
 [0.43306685 0.33631504 0.43306685 0.         0.43306685 0.
  0.         0.         0.56943086 0.         0.        ]
 [0.34261996 0.26607496 0.         0.45050407 0.         0.45050407
  0.45050407 0.         0.         0.45050407 0.        ]]


In [7]:
# Token -> column-index mapping of the refitted vectorizer; the indices identify
# the columns of the TF-IDF matrix.
vocabulary = vectorizer.vocabulary_
print(vocabulary)

{'i': 1, 'like': 4, 'kaggle': 2, 'very': 10, 'much': 7, 'do': 0, 'not': 8, 'really': 9, 'love': 5, 'machine': 6, 'learning': 3}


# Word2vec

In [9]:
from gensim.models import word2vec


# Tokenize by plain whitespace splitting. Unlike the CountVectorizer cells,
# nothing is lowercased here, so "I" stays capitalized in the vocabulary.
sentences = [d.split() for d in df['text']]

# min_count=1 keeps every word, which is needed because each word appears only
# a handful of times in this three-sentence corpus.
# A fixed seed alone does not make training reproducible: gensim documents that
# a single worker thread (workers=1) is also required to remove thread-ordering
# jitter. (Across interpreter launches, PYTHONHASHSEED must be fixed as well.)
model = word2vec.Word2Vec(
    sentences,
    vector_size=10,
    min_count=1,
    window=2,
    seed=7,
    workers=1,
)

In [10]:
# Look up the learned embedding for the token 'like' (length = vector_size).
model.wv.get_vector('like')

array([ 0.01650858,  0.01069946,  0.00188946,  0.09910005,  0.06153275,
        0.05853238,  0.04005488,  0.02443584, -0.03179482,  0.09779203],
      dtype=float32)

In [11]:
# Rank the other vocabulary words by similarity to 'like'. With such a tiny
# corpus the scores are not meaningful; this only demonstrates the API.
model.wv.most_similar(positive=['like'])

[('I', 0.42540043592453003),
 ('machine', 0.36355969309806824),
 ('not', 0.311229407787323),
 ('kaggle', -0.004140517208725214),
 ('much', -0.11530755460262299),
 ('do', -0.1529018133878708),
 ('love', -0.25542783737182617),
 ('really', -0.4161785840988159),
 ('learning', -0.44330504536628723),
 ('very', -0.44338396191596985)]

In [12]:
# Whitespace-tokenize the first sentence (row label 0 of the 'text' column).
df.loc[0, 'text'].split()

['I', 'like', 'kaggle', 'very', 'much']

In [13]:
import numpy as np


# Stack the embedding of every token in the first sentence into one 2-D array:
# one row per token, one column per embedding dimension.
first_sentence_tokens = df.loc[0, 'text'].split()
wordvec = np.array([model.wv[token] for token in first_sentence_tokens])
wordvec

array([[ 0.08898099,  0.02501909,  0.03683598,  0.07944275,  0.01565849,
         0.05513714,  0.0667302 , -0.05495857, -0.08889369, -0.03996675],
       [ 0.01650858,  0.01069946,  0.00188946,  0.09910005,  0.06153275,
         0.05853238,  0.04005488,  0.02443584, -0.03179482,  0.09779203],
       [ 0.06329302, -0.03939352, -0.03167932, -0.04431488,  0.04389417,
        -0.04902608,  0.09809195, -0.01098474, -0.00437022,  0.00090965],
       [ 0.03720424, -0.02774719,  0.02864924,  0.01963681, -0.07835456,
        -0.08814968,  0.03203132, -0.02247364,  0.01966591, -0.03539274],
       [-0.09157717,  0.04835419, -0.00529734, -0.08170088, -0.05110302,
         0.00822875,  0.04535742,  0.00155444,  0.02258943,  0.07426786]],
      dtype=float32)

In [14]:
# Mean pooling: average over the token axis to get a single sentence vector.
wordvec.mean(axis=0)

array([ 0.02288193,  0.00338641,  0.0060796 ,  0.01443277, -0.00167443,
       -0.0030555 ,  0.05645315, -0.01248533, -0.01656068,  0.01952201],
      dtype=float32)

In [15]:
# Max pooling: take the per-dimension maximum over the token axis.
wordvec.max(axis=0)

array([0.08898099, 0.04835419, 0.03683598, 0.09910005, 0.06153275,
       0.05853238, 0.09809195, 0.02443584, 0.02258943, 0.09779203],
      dtype=float32)

In [17]:
from gensim.models import word2vec


# Train a larger model on a text8-format corpus file read from disk.
# NOTE: this rebinds `sentences` and `model`, replacing the toy-corpus objects
# created earlier in the notebook.
corpus_path = './../input/ja.text8'
sentences = word2vec.Text8Corpus(corpus_path)
model = word2vec.Word2Vec(sentences=sentences, vector_size=200)

# Nearest neighbours of the word '経済' in the trained embedding space.
model.wv.most_similar(positive=['経済'])

[('財政', 0.7146179676055908),
 ('政策', 0.6889548301696777),
 ('社会', 0.6625900268554688),
 ('産業', 0.6559704542160034),
 ('金融', 0.6396461129188538),
 ('政治', 0.636995255947113),
 ('対外', 0.6332561373710632),
 ('農業', 0.626223623752594),
 ('格差', 0.6227728128433228),
 ('資本', 0.5943987369537354)]