In [1]:
import re
from collections import Counter
import pickle

In [2]:
# 単語の最小出現回数
min_freq = 2

train_path = '../repo8/data70/train.txt'#todo 学習用ファイルパス
word2id_path = 'data/train.pkl'#todo 出力先ファイルパス


# タイトルに含まれる単語をすべてリストword_listに格納
word_list = []
with open(train_path, encoding="utf8") as f:
    for line in f:
        if not line:
            continue
        title = line.split('\t')[1]#todo タイトルを取得
        for word in title.split():
            word_list.append(word)


In [3]:
word_count = Counter(word_list)#todo Counterを使ってword_listの各単語の出現回数をカウント



In [4]:
dic=word_count.items()

In [61]:
# 単語をIDに変換するためのdict．keyを単語，valueをIDにする．
word2id = {}

id = 1
for word, freq in sorted(dic, reverse=True):
    if freq<min_freq:#todo その単語の出現頻度がmin_freqよりも小さい場合:
        word2id[word] = 0#todo 単語のIDは0とする
    else:
        word2id[word] = id#todo 単語にIDを割り当てる
        id=id+1#todo IDをインクリメント
print(word2id)

# 表示
print("word_num", len(word2id))
# 保存
pickle.dump(word2id, open(word2id_path, "wb"))





word_num 23620


In [92]:
import pickle
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence, pad_sequence, pack_padded_sequence


word2id_path = "data/train.pkl"#todo 80.pyで保存したファイルパスを指定
word2id = pickle.load(open(word2id_path, "rb"))



In [142]:
# word_listの単語リストをIDリストに変換する関数
def get_ids(word_list):
    ids = []
    for word in word_list:
        if word in word2id.keys():#todo wordがword2idのキーに含まれる場合，
            ids.append(word2id[word])#todo そのIDをidsにappend
        else:
            ids.append(0)#todo wordがword2idのキーに含まれない場合，0をidsにappend
    return ids


def get_data(fname):
    label_list = [] # ラベル(カテゴリーのid)を格納するリスト
    ids_list = [] # タイトルに含まれる単語のID系列のtensorを格納するリスト

    category2num = {"b": 0, "t": 1, "e": 2, "m": 3}

    with open(fname, encoding="utf8") as f:
        for line in f:
            if not line:
                continue
            line = line.strip()
            title = line.split('\t')[1]#todo lineからタイトルを取得
            word_list = title.split(' ') #todo titleを単語に分割
            ids = get_ids(word_list)
            ids_tensor = torch.tensor(ids)#todo idsをtensorに変換
            ids_list.append(ids_tensor)#todo ids_tensorをids_listにappend

            category = line.split('\t')[0]#todo lineからカテゴリーを取得
            label = category2num[category]#todo category2numでカテゴリーをラベルに変換
            label_list.append(label)#todo labelをlabel_listにappend
          
    labels = torch.tensor(label_list)#todo label_listをtensorに変換
    return ids_list, labels


class ClassifierRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, target_size):
        super(ClassifierRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)#todo 単語埋め込み層
        self.rnn = nn.LSTM(embedding_dim, hidden_dim) #todo LSTM層
        self.linear = nn.Linear(hidden_dim, target_size) #todo 線形層
        self.softmax = nn.Softmax()
    
    def forward(self, x, x_len):  
        embed_x = self.embedding(x) #todo xを単語埋め込み
 
        packed_x = pack_padded_sequence(embed_x, x_len, batch_first=True, enforce_sorted=False) #todo embed_xとx_lenを用いてPackedSequence型のデータに変換
 
        _, (h, _) = self.rnn(packed_x) #todo LSTM層に入力
        output = self.linear(h) #todo 線形層に入力
 
        output = torch.squeeze(output, dim = 0) #todo squeeze()を使用し，outputの形状が[1, batch_size, target_size]となっているものを[batch_size, target_size]に変換
 
        #print(output.size()) #確認用(確認する場合はコメントアウトを解除)
        output = self.softmax(output)
        return output


class RnnDataset(torch.utils.data.Dataset):
    def __init__(self, data, label):
        self.length = [len(v) for v in data]#todo dataに含まれる各ID系列の長さのリスト
        self.label = label

        self.pad_data = pad_sequence(data, batch_first=True)#todo dataをpadding padsequenceを使う

    def __len__(self):
        return len(self.pad_data)

    def __getitem__(self, idx):
        return self.pad_data[idx], self.length[idx], self.label[idx]#todo idxに対応するpad_data, length, labelの3つを返り値とする


In [143]:
emb_dim = 300
hidden_dim = 50
target_size = 4
batch_size = 64

vocab_size = max(word2id.values()) + 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# データの準備
train_path = '../repo8/data70/train.txt'#todo 学習用ファイルパス
train_x, train_y = get_data(train_path)
valid_path = '../repo8/data70/valid.txt'#todo 評価用ファイルパス
valid_x, valid_y = get_data(valid_path)



dataset = RnnDataset(train_x, train_y)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = ClassifierRNN(vocab_size, emb_dim, hidden_dim, target_size)
model.to(device)

optimizer = optim.SGD(model.parameters(), lr=0.05)
loss_fn = nn.CrossEntropyLoss()


def train(model, train_loader, len_train):
    model.train()
    total_loss = 0
    correct_num = 0
    len_loader = len(train_loader)

    for data, data_len, target in train_loader:
        # GPUへ
        data = data.to(device)
        data_len = data_len.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        pred = model(data, data_len)#todo modelで予測
        loss = loss_fn(pred, target)#todo lossを計算
        loss.backward()
        optimizer.step()

        total_loss+= loss.item() #todo lossをtotal_lossに加算
        correct_num += (torch.argmax(pred, axis=1)==target).sum().item() #todo 予測の正解数をカウント
        #print(correct_num)
            
    batch_loss = total_loss/len_loader #バッチごとの平均ロス
    acc = correct_num/len_train#todo 正解率
    return batch_loss, acc


# validデータはそのままGPUへ
valid_x_len = torch.tensor([len(v) for v in valid_x])#todo valid_xの各要素の長さのリスト

valid_pad_x = pad_sequence(valid_x, batch_first=True)#todo valid_xをpadding

# GPUへ
valid_x_len = valid_x_len.to(device)
valid_pad_x = valid_pad_x.to(device)
valid_y = valid_y.to(device)



def evaluation(model, data, data_len, target):
    model.eval()
    with torch.no_grad():
        pred = model(data, data_len)#todo 予測
        loss = loss_fn(pred, target)#todo 損失を計算
        acc = (torch.argmax(pred, axis=1)==target).sum().item()/len(target)#todo 正解率を計算
    return loss.item(), acc


len_train = len(train_y)

for epoch in range(30):
    train_loss, train_acc = train(model, train_loader, len_train)
    valid_loss, valid_acc = evaluation(model, valid_pad_x, valid_x_len, valid_y)

    print(f"epoch: {epoch}")
    print(f"<train> Loss: {train_loss}\tAccuracy: {train_acc}")
    print(f"<valid> Loss: {valid_loss}\tAccuracy: {valid_acc}")



epoch: 0
<train> Loss: 1.3454175023975488	Accuracy: 0.3919032892887265
<valid> Loss: 1.297091007232666	Accuracy: 0.4362818590704648
epoch: 1
<train> Loss: 1.2829074959555071	Accuracy: 0.48402211601536876
<valid> Loss: 1.255295991897583	Accuracy: 0.5209895052473763
epoch: 2
<train> Loss: 1.2487917867249358	Accuracy: 0.5402492737325462
<valid> Loss: 1.2268624305725098	Accuracy: 0.545727136431784
epoch: 3
<train> Loss: 1.2240004318203042	Accuracy: 0.5462468372223784
<valid> Loss: 1.2067915201187134	Accuracy: 0.5554722638680659
epoch: 4
<train> Loss: 1.2062849334614005	Accuracy: 0.5496204666854091
<valid> Loss: 1.192435622215271	Accuracy: 0.5652173913043478
epoch: 5
<train> Loss: 1.1924559234858987	Accuracy: 0.5555243182457127
<valid> Loss: 1.1810437440872192	Accuracy: 0.5689655172413793
epoch: 6
<train> Loss: 1.1800860573431688	Accuracy: 0.563021272608003
<valid> Loss: 1.1702486276626587	Accuracy: 0.5847076461769115
epoch: 7
<train> Loss: 1.1676137604399355	Accuracy: 0.5799831318526848
<v

最終的な精度は0.78、0.76になった。trainとvalidの値が大きく離れていないことから過学習にならず学習できたことがわかる。