In [11]:
w2v_model = '/Users/iijima.s.ad/git/JASen/word2vec_100.txt'
article_folder_path = '/Users/iijima.s.ad/git/article-extractor/articles/fox/drones/*/'

In [3]:
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format(w2v_model, binary=False)

In [66]:
import en_core_web_sm

nlp = en_core_web_sm.load()

In [55]:
import numpy as np

def score_lg(lg, X):
    return [np.max(lg.predict_proba(X), axis=1), lg.predict(X)]

In [8]:
import string
import torch

def transform_w2v(text):
    table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    words = text.translate(table).split()  # 記号をスペースに置換後、スペースで分割してリスト化
    vec = [model[word] for word in words if word in model]  # 1語ずつベクトル化

    return torch.tensor(sum(vec) / len(vec))  # 平均ベクトルをTensor型に変換して出力

In [59]:
import glob
import os
import random

X, Y = [], []

article_folders = glob.glob(article_folder_path)
for folder in article_folders:
    if not os.path.exists(os.path.join(folder, 'prediction.txt')):
        continue

    with open(os.path.join(folder, 'article.txt'), 'r') as f:
        X.append(transform_w2v(f.read()))

    with open(os.path.join(folder, 'prediction.txt'), 'r') as f:
        predictions = [p.strip().split('\t') for p in f.readlines()[1:]]
        Y.append(int(len([p for p in predictions if p[0] == '0' and p[1] == '1']) / len(predictions) >= 0.05))

length = len(X)
test_key = random.sample(list(range(length)), int(length * 0.2))
test_X = torch.stack([X[i] for i in test_key])
test_Y = torch.tensor([Y[i] for i in test_key])
train_X = torch.stack([X[i] for i in set(range(length)) - set(test_key)])
train_Y = torch.tensor([Y[i] for i in set(range(length)) - set(test_key)])

In [60]:
from sklearn.linear_model import LogisticRegression

# モデルの学習
lg = LogisticRegression(random_state=123, max_iter=10000)
lg.fit(train_X, train_Y)

In [61]:
train_pred = score_lg(lg, train_X)
test_pred = score_lg(lg, test_X)

print(train_pred)

[array([0.77020736, 0.73567997, 0.74437401, 0.7479974 , 0.74099586,
       0.71531823, 0.7576728 , 0.74387929, 0.75730756, 0.7329194 ,
       0.75543198, 0.76630234, 0.73567124, 0.7527435 , 0.73402563,
       0.77677383, 0.76812368, 0.71751422, 0.77117533, 0.76929976,
       0.76980462, 0.74163611, 0.76972432, 0.75593681, 0.73675523,
       0.74976048, 0.74431104, 0.76654026, 0.7641925 , 0.75772799,
       0.7674642 , 0.74478835, 0.72869945, 0.75951563, 0.74311565,
       0.71735018, 0.74195501, 0.74984588, 0.72634943, 0.76613019,
       0.72810078, 0.75202339, 0.75835087, 0.75537422, 0.75728742,
       0.74973555, 0.76470747, 0.77229315, 0.75922943, 0.73332757,
       0.76578481, 0.77129403, 0.74425415, 0.73121873, 0.72085405,
       0.75270212, 0.7264185 , 0.76937888, 0.73886086, 0.72346371,
       0.77495164, 0.74799438, 0.7679873 , 0.76911906, 0.72919385,
       0.75106738, 0.77185465, 0.72275254, 0.74627057, 0.7487969 ,
       0.74061348, 0.75974647]), array([0, 0, 0, 0, 0, 0, 0, 

In [62]:
from sklearn.metrics import accuracy_score

train_accuracy = accuracy_score(train_Y, train_pred[1])
test_accuracy = accuracy_score(test_Y, test_pred[1])
print(f'正解率（学習データ）：{train_accuracy:.3f}')
print(f'正解率（評価データ）：{test_accuracy:.3f}')

正解率（学習データ）：0.750
正解率（評価データ）：0.647


In [65]:
print(test_Y)
print(test_pred)

tensor([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1])
[array([0.76515881, 0.74737575, 0.74771998, 0.73501843, 0.76051345,
       0.74548753, 0.73610818, 0.74629342, 0.7455148 , 0.71815383,
       0.70884724, 0.73451448, 0.75901439, 0.72837725, 0.7606393 ,
       0.76351621, 0.75639165]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])]
