In [2]:
import jieba
import pandas as pd
import torch
import numpy as np
from torch import nn
from transformers import TrainingArguments, Trainer, TrainerCallback
from torch.utils.tensorboard import SummaryWriter

In [3]:
class Vocab:
    def __init__(self, vocab_file, stop_words_file=None):
        self.stop_words_file = self.load_stop_words(stop_words_file)
        self.idx2word, self.word2idx, self.words = self.load_vocab(vocab_file)
        self.word_size = len(self.words)
        self.vocab_size = len(self.idx2word)

    def load_vocab(self, vocab_file):
        idx2word = {}
        word2idx = {}

        words = []
        contents = pd.read_csv(vocab_file, encoding="GBK", header=None)

        for idx, row in contents.iterrows():
            line = row[0]
            if not self.stop_words_file:
                current_line_words = [
                    word for word in jieba.cut(line) if word not in self.stop_words_file
                ]
            else:
                current_line_words = list(jieba.cut(line))
            words.extend(current_line_words)

        for idx, word in enumerate(set(words)):
            idx2word[idx] = word
            word2idx[word] = idx
        return idx2word, word2idx, words

    def load_stop_words(self, stop_words_file):
        if stop_words_file is None:
            return set()
        else:
            with open(stop_words_file, "r") as f:
                return set(f.read().splitlines())

    def get_idx(self, word):
        return self.word2idx[word]

    def get_word(self, idx):
        return self.idx2word[idx]

In [4]:
vocab = Vocab("./数学原始数据.csv", "./stopwords.txt")

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 1.740 seconds.
DEBUG:jieba:Loading model cost 1.740 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [5]:
vocab.word_size, vocab.vocab_size

(152832, 5296)

In [6]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, ngram: int, vocab: Vocab):
        self.ngram = ngram
        self.vocab = vocab
        self.word_size = vocab.word_size
        self.vocab_size = vocab.vocab_size

    def __len__(self):
        return self.word_size - 2 * self.ngram - 1

    def __getitem__(self, idx):
        left_idx = idx
        right_idx = idx + 2 * self.ngram + 1
        words = self.vocab.words[left_idx:right_idx]
        current_word = words.pop(self.ngram)
        label = self.vocab.get_idx(current_word)

        another_word = [self.vocab.get_idx(word) for word in words]
        return {
            "inputs": torch.tensor(another_word, dtype=torch.long),
            "labels": torch.tensor(label, dtype=torch.long),
        }

In [7]:
data = MyDataset(2, vocab)

In [8]:
data_iter = torch.utils.data.DataLoader(data, batch_size=512, shuffle=True)

In [9]:
class Net(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.model = nn.Sequential(
            nn.Embedding(
                vocab_size,
                embedding_size,
            ),
            nn.Linear(
                embedding_size,
                vocab_size,
                bias=True,
            ),
        )

    def forward(self, inputs, labels=None):

        loss_fn = nn.CrossEntropyLoss()
        batch_size, ngram = inputs.shape
        # [batch_size, ngram] -> [batch_size * ngram]
        inputs = inputs.flatten()
        # [batch_size * ngram] -> [batch_size * ngram, vocab_size]
        inputs_logits = self.model(inputs)
        # [batch_size * ngram, vocab_size] -> [batch_size, ngram, vocab_size]
        inputs_logits = inputs_logits.reshape(batch_size, ngram, self.vocab_size)
        # [batch_size, ngram, vocab_size] -> [batch_size, vocab_size]
        inputs_logits = torch.mean(inputs_logits, dim=1)
        if labels is not None:
            # [batch_size, vocab_size] 和 [batch_size, vocab_size]
            loss = loss_fn(inputs_logits, labels)
            return {"logits": inputs_logits, "loss": loss}
        else:
            return {"logits": inputs_logits}

In [10]:
model = Net(vocab.vocab_size, 512)

In [11]:
class MyCallBacks(TrainerCallback):

    def on_train_begin(
        self, args, state, control, model, optimizer, lr_scheduler, **kwargs
    ):
        with SummaryWriter("./word2vec") as writer:
            inputs = torch.tensor([[1, 2, 3, 4]], dtype=torch.long)
            writer.add_graph(model, inputs, use_strict_trace=False)

        print("\nStarting training")
        print(f"\nUsing optimizer: {optimizer}")
        print(f"\nUsing lr_scheduler: {lr_scheduler}")

    def on_train_end(self, args, state, control, optimizer, **kwargs):
        print(f"\nlr: {optimizer.param_groups[0]['lr']}")

    def on_save(self, args, state, control, **kwargs):
        print("\nSaving model")

In [12]:
training_args = TrainingArguments(
    output_dir="./word2vec",
    num_train_epochs=3,
    logging_strategy="steps",
    logging_dir="./word2vec",
    save_strategy="epoch",
    use_cpu=False,
    save_total_limit=3,
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    optimizers=(torch.optim.SGD(model.parameters(), 0.05), None),
    callbacks=[MyCallBacks],
)

In [14]:
trainer.train()


Starting training


Step,Training Loss
500,8.4038
1000,8.0537
1500,7.8364
2000,7.5875
2500,7.4279
3000,7.2909
3500,7.2097
4000,6.9994
4500,6.879
5000,6.8202



Saving model

Saving model

Saving model

Saving model

Ending training


TrainOutput(global_step=57312, training_loss=5.714975894861951, metrics={'train_runtime': 171.6222, 'train_samples_per_second': 2671.456, 'train_steps_per_second': 333.943, 'total_flos': 0.0, 'train_loss': 5.714975894861951, 'epoch': 3.0})

In [15]:
torch.save(model.state_dict(), "./word2vec.pth")

In [16]:
model.eval()

Net(
  (model): Sequential(
    (0): Embedding(5296, 512)
    (1): Linear(in_features=512, out_features=5296, bias=True)
  )
)

In [17]:
def cos(a, b):
    return a.dot(b) / (a.norm() * b.norm())

In [18]:
encoder = model.model[0]

token = "算术"
embedding1 = encoder(torch.tensor([vocab.get_idx(token)], device="cuda"))

token2similarity = {}

In [19]:
for idx, word in vocab.idx2word.items():
    embedding2 = encoder(torch.tensor([idx], device="cuda"))
    cos_similarity = cos(embedding1.flatten(), embedding2.flatten()).item()
    token2similarity[word] = cos_similarity

sorted(token2similarity, key=token2similarity.get, reverse=True)[:10]

['算术', '点积', 'radix', '开立方', '出', '引圆', '180', '圆弧', '右上角', '以少']