In [1]:
import numpy as np
import numpy.random as random
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame

# 可視化モジュール
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set()
%matplotlib inline

# 小数第３まで表示
%precision 3

'%.3f'

In [32]:
achievements = pd.read_csv('data/assessment_data_with_names.csv')
# achievements.head(10)

In [3]:
achievements.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495034 entries, 0 to 495033
Data columns (total 3 columns):
condition    495034 non-null int64
item_desc    495034 non-null object
price        479387 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 11.3+ MB


In [4]:
data = np.array(achievements[['condition', 'item_desc']])
target = np.array(achievements['price'])
feature_names = ['condition', 'item_desc']
target_names = ["新品・未開封", "中古美品", "中古品・使用感あり", "目立つ傷がある", "故障品", "不明"]

In [5]:
from sklearn.datasets.base import Bunch
from sklearn.model_selection import  train_test_split

achievement_datasets = Bunch(
    data=data,
    target=target,
    feature_names=feature_names,
    target_names=target_names,
)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    achievement_datasets.data,
    achievement_datasets.target,
    random_state =100
)

In [7]:
# 実験済み：重いのでロードしない
# import gensim
# model = gensim.models.KeyedVectors.load_word2vec_format('../../mecab/data/pre_trained/model.vec', binary=False)

In [8]:
# model.most_similar(positive=['日本', 'パリ'], negative=['東京'])

In [35]:
import MeCab

mecab = MeCab.Tagger ('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
mecab.parse('')#文字列がGCされる闇対策

'EOS\n'

In [36]:
def str_diff(orig, sub):
    return orig.replace(sub, '')

In [37]:
def try_parse(text):
    node = mecab.parseToNode(text)
    words = node.next.surface

    while node.next:
        prev_words = words
        words = node.next.surface
        feature = node.feature.split(",")

        print( "{:10.10s}|{:10s}|{:10.10s}|{:10.10s}".format(str_diff(prev_words, words), feature[0], feature[6], feature[7]) )

        node = node.next

In [38]:
text = "国境の長いトンネルを抜けると雪国であった。夜の底が白くなった。信号所に汽車が止まった。 \
\n　向側の座席から娘が立って来て、島村の前のガラス窓を落した。雪の冷気が流れこんだ。 \
\n 娘は窓いっぱいに乗り出して、遠くへ呼ぶように、 \
\n 「駅長さあん、駅長さあん」 \
\n 明りをさげてゆっくり雪を踏んで来た男は、襟巻で鼻の上まで包み、耳に帽子の毛皮を垂れていた。 \
\n もうそんな寒さかと島村は外を眺めると、鉄道の官舎らしいバラックが山裾に寒々と散らばっているだけで、雪の色はそこまで行かぬうちに闇に呑まれていた。"

try_parse(text)

          |BOS/EOS   |*         |*         
国境        |名詞        |国境        |コッキョウ     
の         |助詞        |の         |ノ         
長い        |形容詞       |長い        |ナガイ       
トンネル      |名詞        |トンネル      |トンネル      
を         |助詞        |を         |ヲ         
抜ける       |動詞        |抜ける       |ヌケル       
と         |助詞        |と         |ト         
雪国        |名詞        |雪国        |ユキグニ      
で         |助動詞       |だ         |デ         
あっ        |助動詞       |ある        |アッ        
た         |助動詞       |た         |タ         
。         |記号        |。         |。         
夜         |名詞        |夜         |ヨル        
の         |助詞        |の         |ノ         
底         |名詞        |底         |ソコ        
が         |助詞        |が         |ガ         
白く        |形容詞       |白い        |シロク       
なっ        |動詞        |なる        |ナッ        
た         |助動詞       |た         |タ         
。         |記号        |。         |。         
信号所       |名詞        |信号所       |シンゴウショ    
に         |助詞        |に         

In [42]:
class WordDividor:
    INDEX_CATEGORY = 0
    INDEX_ROOT_FORM = 6
    TARGET_CATEGORIES = ["名詞", " 動詞",  "形容詞"]

    def __init__(self, dictionary="mecabrc"):
        self.dictionary = dictionary
        self.tagger = MeCab.Tagger(self.dictionary)

    def extract_words(self, text):
        if not text:
            return []

        words = []

        node = self.tagger.parseToNode(text)
        while node:
            features = node.feature.split(',')

            if features[self.INDEX_CATEGORY] in self.TARGET_CATEGORIES:
                if features[self.INDEX_ROOT_FORM] != "*":
                    # prefer root form
                    words.append(features[self.INDEX_ROOT_FORM])
#                 else:
#                     words.append(node.surface) # do nothing

            node = node.next

        return words

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

wd = WordDividor()
cv = CountVectorizer(analyzer=wd.extract_words)

text = ["雪。また雪。深い雪のラッセルにあえぎながら長く伸びた尾根を超える", "もちはもちや"]

counts = cv.fit_transform(text)
print(cv.vocabulary_)
print(counts)

{'雪': 5, '深い': 3, 'ラッセル': 1, '長い': 4, '尾根': 2, 'もち': 0}
  (0, 2)	1
  (0, 4)	1
  (0, 1)	1
  (0, 3)	1
  (0, 5)	3
  (1, 0)	2


In [98]:
from sklearn.feature_extraction.text import CountVectorizer

wd = WordDividor()
cv = CountVectorizer(analyzer=wd.extract_words)

text = achievements.item_desc
counts = cv.fit_transform(text)
# print(cv.vocabulary_)
# print(counts)

In [96]:
cv.vocabulary_['腕時計']

4713

In [81]:
repr(counts)

"<495034x5472 sparse matrix of type '<class 'numpy.int64'>'\n\twith 1498996 stored elements in Compressed Sparse Row format>"

In [107]:
data = counts.toarray()
target = np.array(achievements['price'])
feature_names = ['condition', 'item_desc']

In [108]:
from sklearn.datasets.base import Bunch
from sklearn.model_selection import  train_test_split

achievement_datasets = Bunch(
    data = data,
    target=target,
)

X_train, X_test, y_train, y_test = train_test_split(
    achievement_datasets.data,
    achievement_datasets.target,
    random_state =100
)

In [109]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()
model.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
print("Test score: {:.2f}".format(model.score(X_test, y_test)))