# 準備

googledriveのマウント

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


MeCabのインストール

In [2]:
!pip install mecab-python3
!pip install unidic-lite

Collecting mecab-python3
  Downloading mecab_python3-1.0.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (488 kB)
[K     |████████████████████████████████| 488 kB 5.2 MB/s 
[?25hInstalling collected packages: mecab-python3
Successfully installed mecab-python3-1.0.4
Collecting unidic-lite
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
[K     |████████████████████████████████| 47.4 MB 2.2 MB/s 
[?25hBuilding wheels for collected packages: unidic-lite
  Building wheel for unidic-lite (setup.py) ... [?25l[?25hdone
  Created wheel for unidic-lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658836 sha256=2ffb253145c207a831f1367ef871d13f8d36473ad77597b023ada83d0a636586
  Stored in directory: /root/.cache/pip/wheels/de/69/b1/112140b599f2b13f609d485a99e357ba68df194d2079c5b1a2
Successfully built unidic-lite
Installing collected packages: unidic-lite
Successfully installed unidic-lite-1.0.8


tarファイルの展開

In [3]:
import tarfile
tar = tarfile.open('/content/drive/MyDrive/ldcc-20140209.tar.gz')
tar.extractall('/content/drive/MyDrive/natural_language_processing')
tar.close()

KeyboardInterrupt: ignored

# 文書の下処理

全ての形態素を読み込む

In [None]:
import os
import MeCab
path = "/content/drive/MyDrive/natural_language_processing/text/"

In [None]:
categories1 = []
article_list1 = []
labels1 = []

#カテゴリ名の取得
for dir_path1 in os.listdir(path):
  #text配下に指定のディレクトリが存在する場合、categoriesにリストを追加
  if os.path.isdir(os.path.join(path, dir_path1)):
    categories1.append(dir_path1)

#カテゴリごとの記事の取得
for category1 in categories1:
  #カテゴリー毎にディレクトリ配下にあるファイル一覧をarticlesに格納
    articles1 = os.listdir(path+category1)

    for article1 in articles1:
      #記事の一覧からそれぞれの記事をtextにリスト化
        with open(path+ category1+ "/"+article1, encoding="utf-8") as f:
            next(f)
            next(f)
            text1 = f.read()

            #text1をarticle_listに追加
            article_list1.append(text1)
            #categoryのインデックスをlabelsに追加
            labels1.append(categories1.index(category1))

動詞、名詞、形容詞を抽出して読み込む

In [None]:
categories2 = []
article_list2 = []
labels2 = []
select_conditions = ['動詞', '名詞','形容詞']

# 分かち書きオブジェクト
tagger = MeCab.Tagger('')
tagger.parse('')

#カテゴリ名の取得
for dir_path2 in os.listdir(path):
  if os.path.isdir(os.path.join(path, dir_path2)):
    categories2.append(dir_path2)

#カテゴリごとの記事の取得
for category2 in categories2:
    articles2 = os.listdir(path+category2)

    for article2 in articles2:
        with open(path+ category2+ "/"+article2, encoding="utf-8") as f:
            next(f)
            next(f)
            text2 = f.read()

            #最初のnodeを取得
            node = tagger.parseToNode(text2)
            morphemes = []
            while node:
              #形態素をmorphomeに格納
              morpheme = node.surface
              #文字の特徴を','で区切った先頭要素（品詞）をposに格納
              pos = node.feature.split(',')[0]

              #品詞がselect_conditionsに含まれるノードのみmorphomesに形態素のリストを追加する
              if pos in select_conditions:
                  morphemes.append(morpheme)

              #次のノードに移る
              node = node.next

            #morphomesの要素を半角スペースで分割
            text_result = ' '.join(morphemes)

            #text_resultをarticle_listに追加
            article_list2.append(text_result)
            #categoryのインデックスをlabelsに追加
            labels2.append(categories2.index(category2))

テストデータとの分割

In [None]:
from sklearn.model_selection import train_test_split

traindata1, testdata1, train_labels1, test_labels1 = train_test_split(article_list1, labels1, test_size=0.25, random_state=42)
traindata2, testdata2, train_labels2, test_labels2 = train_test_split(article_list2, labels2, test_size=0.25, random_state=42)

# 文章のベクトル化

Bag_of_wordsでのベクトル分割

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#'(?u)\\b\\w+\\b'=“単語の境界","1文字以上の単語構成文字","単語の境界”
vectorizer = CountVectorizer(token_pattern='(?u)\\b\\w+\\b')
train_dataB1 = vectorizer.fit_transform(traindata1)
test_dataB1 = vectorizer.transform(testdata1)
train_dataB2 = vectorizer.fit_transform(traindata2)
test_dataB2 = vectorizer.transform(testdata2)

TF*IDFでのベクトル分割



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#'(?u)\\b\\w+\\b'=“単語の境界","1文字以上の単語構成文字","単語の境界”
vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b')
train_dataT1 = vectorizer.fit_transform(traindata1)
test_dataT1 = vectorizer.transform(testdata1)
train_dataT2 = vectorizer.fit_transform(traindata2)
test_dataT2 = vectorizer.transform(testdata2)

# 分類器の作成と検証

ランダムフォレスト

In [None]:
from sklearn.ensemble import RandomForestClassifier as rfc

score_rfc = []

def result(a, b, c, d):
  cl = rfc(n_estimators=10)
  cl.fit(a, b)
  score = cl.score(c, d)
  return score

score_rfc.append(result(train_dataB1, train_labels1, test_dataB1, test_labels1))
score_rfc.append(result(train_dataT1, train_labels1, test_dataT1, test_labels1))
score_rfc.append(result(train_dataB2, train_labels2, test_dataB2, test_labels2))
score_rfc.append(result(train_dataT2, train_labels2, test_dataT2, test_labels2))

score_rfc

[0.8557483731019523,
 0.8486984815618221,
 0.8432754880694143,
 0.8351409978308026]

ナイーブベイズ

In [None]:
from sklearn.naive_bayes import MultinomialNB as NB

score_NB = []

def result(a, b, c, d):
  cl_hard =  MultinomialNB()
  cl_hard.fit(a, b)
  score = cl_hard.score(c, d)
  return score

score_NB.append(result(train_dataB1, train_labels1, test_dataB1, test_labels1))
score_NB.append(result(train_dataT1, train_labels1, test_dataT1, test_labels1))
score_NB.append(result(train_dataB2, train_labels2, test_dataB2, test_labels2))
score_NB.append(result(train_dataT2, train_labels2, test_dataT2, test_labels2))

score_NB

[0.9040130151843818,
 0.8796095444685467,
 0.8937093275488069,
 0.8503253796095445]

ロジスティック回帰

In [None]:
from sklearn.linear_model import LogisticRegression as LR

score_LR = []

def result(a, b, c, d):
  cl = LR(multi_class='multinomial', solver='newton-cg')
  cl.fit(a, b)
  score = cl.score(c, d)
  return score

score_LR.append(result(train_dataB1, train_labels1, test_dataB1, test_labels1))
score_LR.append(result(train_dataT1, train_labels1, test_dataT1, test_labels1))
score_LR.append(result(train_dataB2, train_labels2, test_dataB2, test_labels2))
score_LR.append(result(train_dataT2, train_labels2, test_dataT2, test_labels2))

score_LR

[0.9148590021691974, 0.9018438177874186, 0.9408893709327549, 0.911062906724512]

SVM

In [None]:
from sklearn.svm import LinearSVC as svc

score_SVC = []

def result(a, b, c, d):
  cl_soft=svc(loss='hinge')
  cl_soft.fit(a, b)
  score = cl_soft.score(c, d)
  return score

score_SVC.append(result(train_dataB1, train_labels1, test_dataB1, test_labels1))
score_SVC.append(result(train_dataT1, train_labels1, test_dataT1, test_labels1))
score_SVC.append(result(train_dataB2, train_labels2, test_dataB2, test_labels2))
score_SVC.append(result(train_dataT2, train_labels2, test_dataT2, test_labels2))

score_SVC



[0.9224511930585684,
 0.9262472885032538,
 0.9419739696312365,
 0.9360086767895879]

In [None]:
print("～Bag_of_words～")
print("ランダムフォレスト    ：",end="")
print('{:.3g}'.format(score_rfc[0]))
print("ランダムフォレスト[名詞,動詞,形容詞]   ：",end="")
print('{:.3g}'.format(score_rfc[2]))
print("ナイーブベイズ    ：",end="")
print('{:.3g}'.format(score_NB[0]))
print("ナイーブベイズ[名詞,動詞,形容詞]   ：",end="")
print('{:.3g}'.format(score_NB[2]))
print("ロジスティック回帰    ：",end="")
print('{:.3g}'.format(score_LR[0]))
print("ロジスティック回帰[名詞,動詞,形容詞]   ：",end="")
print('{:.3g}'.format(score_LR[2]))
print("SVM    ：",end="")
print('{:.3g}'.format(score_SVC[0]))
print("SVM[名詞,動詞,形容詞]   ：",end="")
print('{:.3g}'.format(score_SVC[2]))

print("\n\n～TF*IDF～")
print("ランダムフォレスト    ：",end="")
print('{:.3g}'.format(score_rfc[1]))
print("ランダムフォレスト[名詞,動詞,形容詞]   ：",end="")
print('{:.3g}'.format(score_rfc[3]))
print("ナイーブベイズ    ：",end="")
print('{:.3g}'.format(score_NB[1]))
print("ナイーブベイズ[名詞,動詞,形容詞]   ：",end="")
print('{:.3g}'.format(score_NB[3]))
print("ロジスティック回帰    ：",end="")
print('{:.3g}'.format(score_LR[1]))
print("ロジスティック回帰[名詞,動詞,形容詞]   ：",end="")
print('{:.3g}'.format(score_LR[3]))
print("SVM    ：",end="")
print('{:.3g}'.format(score_SVC[1]))
print("SVM[名詞,動詞,形容詞]   ：",end="")
print('{:.3g}'.format(score_SVC[3]))

～Bag_of_words～
ランダムフォレスト    ：0.856
ランダムフォレスト[名詞,動詞,形容詞]   ：0.843
ナイーブベイズ    ：0.904
ナイーブベイズ[名詞,動詞,形容詞]   ：0.894
ロジスティック回帰    ：0.915
ロジスティック回帰[名詞,動詞,形容詞]   ：0.941
SVM    ：0.922
SVM[名詞,動詞,形容詞]   ：0.942


～TF*IDF～
ランダムフォレスト    ：0.849
ランダムフォレスト[名詞,動詞,形容詞]   ：0.835
ナイーブベイズ    ：0.88
ナイーブベイズ[名詞,動詞,形容詞]   ：0.85
ロジスティック回帰    ：0.902
ロジスティック回帰[名詞,動詞,形容詞]   ：0.911
SVM    ：0.926
SVM[名詞,動詞,形容詞]   ：0.936
