# 第8章: ニューラルネット

# 70. 単語ベクトルの和による特徴量

In [1]:
import gensim
import numpy as np
import pandas as pd

# Read the three tab-separated splits; the files have no header row, so the
# column names are supplied explicitly.
train, valid, test = (
    pd.read_csv(split_path, sep='\t', names=['CATEGORY', 'TITLE'])
    for split_path in (
        './NewsAggregatorDataset/train.txt',
        './NewsAggregatorDataset/valid.txt',
        './NewsAggregatorDataset/test.txt',
    )
)

In [2]:
# Load the pretrained word2vec vectors from the gzip-compressed binary file.
# NOTE: the name `model` is rebound to a neural network in later cells, so
# every use of these embeddings has to run before that happens.
W2V_PATH = 'GoogleNews-vectors-negative300.bin.gz'
model = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

In [3]:
import string
import torch

# Translation table mapping every ASCII punctuation character to a space.
# Built once here instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def transform_w2v(text, w2v=None, dim=300):
    """Turn a text into the mean of its in-vocabulary word vectors.

    Punctuation is replaced by spaces, the text is split on whitespace, and
    the vectors of all tokens found in the embedding lookup are averaged.

    Args:
        text: Raw text (e.g. a headline).
        w2v: Embedding lookup supporting ``word in w2v`` and ``w2v[word]``.
            When omitted, the notebook-level ``model`` is used. That name is
            rebound to a neural network in later cells, so pass the lookup
            explicitly if this function is called after that point.
        dim: Length of the zero vector returned when no token of ``text`` is
            in the vocabulary.

    Returns:
        A 1-D ``torch.Tensor`` holding the mean vector, or ``torch.zeros(dim)``
        when there is nothing to average.
    """
    if w2v is None:
        w2v = model  # resolved at call time from the notebook namespace

    words = text.translate(_PUNCT_TO_SPACE).split()
    vec = [w2v[word] for word in words if word in w2v]

    # Without this guard an all-unknown or empty text divides by zero.
    if not vec:
        return torch.zeros(dim)

    return torch.tensor(sum(vec) / len(vec))

# Build one feature matrix per split: each title becomes one row.
X_train, X_valid, X_test = (
    torch.stack([transform_w2v(title) for title in frame['TITLE']])
    for frame in (train, valid, test)
)

print(X_train.size())
print(X_train)

torch.Size([10672, 300])
tensor([[ 0.0930,  0.1105, -0.0617,  ...,  0.0472,  0.1016,  0.1064],
        [ 0.0773,  0.1216, -0.0084,  ...,  0.0125,  0.0744,  0.0549],
        [ 0.0144, -0.0348,  0.0461,  ...,  0.0654,  0.0590, -0.0838],
        ...,
        [-0.0356,  0.1175,  0.1018,  ..., -0.0172,  0.0627,  0.0960],
        [ 0.0226,  0.0159, -0.0263,  ...,  0.0301,  0.1126, -0.1359],
        [-0.0124, -0.0050, -0.0422,  ..., -0.0068,  0.1063, -0.0074]])


In [4]:
# Build the label vectors: map each category code to its integer class id.
category_dict = {'b': 0, 't': 1, 'e': 2, 'm': 3}
y_train, y_valid, y_test = (
    torch.tensor(frame['CATEGORY'].map(lambda code: category_dict[code]).values)
    for frame in (train, valid, test)
)

print(y_train.size())
print(y_train)

torch.Size([10672])
tensor([2, 0, 0,  ..., 0, 0, 2])


In [5]:
# Persist every feature matrix and label vector to disk.
for _tensor, _path in (
    (X_train, 'X_train.pt'),
    (X_valid, 'X_valid.pt'),
    (X_test, 'X_test.pt'),
    (y_train, 'y_train.pt'),
    (y_valid, 'y_valid.pt'),
    (y_test, 'y_test.pt'),
):
    torch.save(_tensor, _path)

# 71. 単層ニューラルネットワークによる予測

In [6]:
from torch import nn

class SLPNet(nn.Module):
    """Single-layer network: one bias-free linear map from inputs to outputs."""

    def __init__(self, input_size, output_size):
        """Create the linear layer and draw its weights from N(0, 1)."""
        super().__init__()
        linear = nn.Linear(input_size, output_size, bias=False)
        nn.init.normal_(linear.weight, mean=0.0, std=1.0)
        self.fc = linear

    def forward(self, x):
        """Return the raw (pre-softmax) outputs of the linear layer for ``x``."""
        return self.fc(x)
    
# Instantiate the untrained single-layer network. This rebinds `model`,
# which until now referred to the word2vec embeddings.
model = SLPNet(input_size=300, output_size=4)
# Class probabilities for the first training example only.
y_hat_1 = model(X_train[:1]).softmax(dim=-1)
print(y_hat_1)

tensor([[0.0048, 0.0951, 0.8782, 0.0219]], grad_fn=<SoftmaxBackward0>)


In [7]:
# Class probabilities for the 2nd to 4th training examples, computed one row
# at a time (the batched computation over the first four rows follows in the
# next cell).
y_hat_2, y_hat_3, y_hat_4 = [
    torch.softmax(model(X_train[k:k + 1]), dim=-1) for k in range(1, 4)
]

In [8]:
# Class probabilities for the first four training examples in one batch.
# The module is called directly rather than through `.forward(...)`, so that
# nn.Module.__call__ (and any registered hooks) is used, as in the other cells.
Y_hat = torch.softmax(model(X_train[:4]), dim=-1)
print(Y_hat)

tensor([[0.0048, 0.0951, 0.8782, 0.0219],
        [0.4545, 0.0101, 0.5086, 0.0269],
        [0.8323, 0.0195, 0.0826, 0.0656],
        [0.0678, 0.0548, 0.7666, 0.1108]], grad_fn=<SoftmaxBackward0>)


# 72. 損失と勾配の計算

In [9]:
# Cross-entropy on raw (pre-softmax) outputs, averaged over the batch
# ('mean' is the default reduction, spelled out here for the reader).
criterion = nn.CrossEntropyLoss(reduction='mean')

In [10]:
# Loss and gradient for the first training example alone.
model.zero_grad()  # clear any previously accumulated gradients
l_1 = criterion(model(X_train[:1]), y_train[:1])  # criterion receives pre-softmax outputs
l_1.backward()  # populate model.fc.weight.grad
print(f'損失: {l_1:.4f}')
print(f'勾配:\n{model.fc.weight.grad}')

損失: 0.1299
勾配:
tensor([[ 0.0004,  0.0005, -0.0003,  ...,  0.0002,  0.0005,  0.0005],
        [ 0.0088,  0.0105, -0.0059,  ...,  0.0045,  0.0097,  0.0101],
        [-0.0113, -0.0135,  0.0075,  ..., -0.0057, -0.0124, -0.0130],
        [ 0.0020,  0.0024, -0.0014,  ...,  0.0010,  0.0022,  0.0023]])


In [11]:
# Loss and gradient for the first four training examples as one batch.
model.zero_grad()  # clear the gradient left over from the previous cell
l = criterion(model(X_train[:4]), y_train[:4])
l.backward()
print(f'損失: {l:.4f}')
print(f'勾配:\n{model.fc.weight.grad}')

損失: 0.3420
勾配:
tensor([[-0.0116, -0.0145, -0.0014,  ..., -0.0045, -0.0120, -0.0029],
        [ 0.0020,  0.0032, -0.0017,  ...,  0.0014,  0.0033,  0.0031],
        [ 0.0091,  0.0096,  0.0038,  ...,  0.0019,  0.0059, -0.0014],
        [ 0.0004,  0.0017, -0.0006,  ...,  0.0012,  0.0028,  0.0012]])


# 73. 確率的勾配降下法による学習

In [12]:
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    """Pairs a feature container ``X`` with a label container ``y`` by index."""

    def __init__(self, X, y):
        """Store the features and labels that make up the dataset."""
        self.X, self.y = X, y

    def __len__(self):
        """Number of examples, taken from the label container."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``[features, label]`` for position ``idx``."""
        features, label = self.X[idx], self.y[idx]
        return [features, label]

In [13]:
from torch.utils.data import DataLoader

# Wrap each split's features and labels in a Dataset.
dataset_train, dataset_valid, dataset_test = (
    NewsDataset(features, targets)
    for features, targets in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)

# Training is fed one shuffled example at a time; validation and test are each
# served as a single batch containing the whole split.
dataloader_train = DataLoader(dataset_train, shuffle=True, batch_size=1)
dataloader_valid = DataLoader(dataset_valid, shuffle=False, batch_size=len(dataset_valid))
dataloader_test = DataLoader(dataset_test, shuffle=False, batch_size=len(dataset_test))

In [14]:
# Model definition
model = SLPNet(300, 4)

# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

# Training
num_epochs = 10
for epoch in range(num_epochs):
    # Switch to training mode
    model.train()
    loss_train = 0.0
    for inputs, labels in dataloader_train:
        # Reset the gradients
        optimizer.zero_grad()

        # Forward pass + backpropagation + weight update
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss
        loss_train += loss.item()

    # Mean loss per batch. Divide by the number of batches, not by the last
    # zero-based batch index, which is one too small (and 0 for a single batch).
    loss_train = loss_train / len(dataloader_train)

    # Validation loss; dataloader_valid yields the whole split as one batch,
    # so its first batch is the full validation set.
    model.eval()
    with torch.no_grad():
        inputs, labels = next(iter(dataloader_valid))
        outputs = model(inputs)
        loss_valid = criterion(outputs, labels)

    # Log the epoch summary
    print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, loss_valid: {loss_valid:.4f}')

epoch: 1, loss_train: 0.4858, loss_valid: 0.3636
epoch: 2, loss_train: 0.3172, loss_valid: 0.3330
epoch: 3, loss_train: 0.2856, loss_valid: 0.3139
epoch: 4, loss_train: 0.2700, loss_valid: 0.3062
epoch: 5, loss_train: 0.2603, loss_valid: 0.3025
epoch: 6, loss_train: 0.2529, loss_valid: 0.3038
epoch: 7, loss_train: 0.2470, loss_valid: 0.2976
epoch: 8, loss_train: 0.2421, loss_valid: 0.3015
epoch: 9, loss_train: 0.2395, loss_valid: 0.2973
epoch: 10, loss_train: 0.2367, loss_valid: 0.2942


# 74. 正解率の計測

In [15]:
def calculate_accuracy(model, loader):
    """Return the fraction of examples in ``loader`` that ``model`` classifies correctly.

    The model is switched to evaluation mode and left in it. The predicted
    class of an example is the index of its largest output.

    Args:
        model: Callable module mapping a batch of inputs to per-class outputs.
        loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        Number of correct predictions divided by the number of examples seen.
    """
    model.eval()
    n_seen, n_hit = 0, 0
    with torch.no_grad():  # inference only, no gradient tracking needed
        for batch_x, batch_y in loader:
            predicted = model(batch_x).argmax(dim=-1)
            n_hit += (predicted == batch_y).sum().item()
            n_seen += len(batch_x)

    return n_hit / n_seen

In [16]:
# Accuracy of the trained model on the training split and on the test split.
acc_train, acc_test = (
    calculate_accuracy(model, loader) for loader in (dataloader_train, dataloader_test)
)
print(f'正解率（学習データ）：{acc_train:.3f}')
print(f'正解率（評価データ）：{acc_test:.3f}')

正解率（学習データ）：0.921
正解率（評価データ）：0.903
