In [1]:
import jieba
import pandas as pd

In [2]:
class Vocab:
    """Tokenised corpus with position <-> token lookup tables.

    The first column of a GBK-encoded CSV file is read row by row, each row
    is split into tokens, and the tokens of all rows are concatenated into
    one flat sequence (``self.vocab``).  Tokens are *not* de-duplicated, so
    the sequence keeps the original word order and an index is a position
    in the corpus.

    Attributes:
        tokenizer: Callable that splits one line into an iterable of tokens.
        stop_words: Set of tokens removed from the corpus (may be empty).
        stop_words_file: Legacy alias of ``stop_words``; despite its name it
            holds the loaded set, not a path.
        vocab: Flat list of tokens in corpus order.
        idx2word: Maps a position in ``vocab`` to the token at that position.
        word2idx: Maps a token to the *last* position at which it occurs.
        vocab_size: Number of positions, i.e. ``len(vocab)``.
    """

    def __init__(self, vocab_file, stop_words_file=None, tokenizer=None):
        """Load the stop words (if any) and then the corpus.

        Args:
            vocab_file: Path of the GBK-encoded CSV file holding the corpus;
                only its first column is used and no header row is expected.
            stop_words_file: Optional path of a text file with one stop word
                per line.  ``None`` disables stop-word filtering.
            tokenizer: Optional callable ``str -> iterable of str``.  When
                omitted, ``jieba.cut`` is used.
        """
        # Resolve the default lazily so that jieba is only touched when the
        # caller did not supply a tokenizer of their own.
        self.tokenizer = jieba.cut if tokenizer is None else tokenizer
        # The stop words must be loaded first: load_vocab filters with them.
        self.stop_words = self.load_stop_words(stop_words_file)
        self.stop_words_file = self.stop_words  # legacy attribute name
        self.idx2word, self.word2idx, self.vocab = self.load_vocab(vocab_file)
        self.vocab_size = len(self.idx2word)

    def load_vocab(self, vocab_file):
        """Tokenise the corpus file and build the lookup tables.

        Args:
            vocab_file: Path of the GBK-encoded, header-less CSV corpus.

        Returns:
            A tuple ``(idx2word, word2idx, vocab)`` as described on the class.
        """
        contents = pd.read_csv(vocab_file, encoding="GBK", header=None)

        vocab = []
        for line in contents.iloc[:, 0]:
            tokens = self.tokenizer(line)
            if self.stop_words:
                # Filter only when stop words were actually supplied.
                vocab.extend(
                    token for token in tokens if token not in self.stop_words
                )
            else:
                vocab.extend(tokens)

        idx2word = dict(enumerate(vocab))
        # For a repeated token the later position overwrites the earlier one.
        word2idx = {word: idx for idx, word in enumerate(vocab)}
        return idx2word, word2idx, vocab

    def load_stop_words(self, stop_words_file):
        """Return the set of stop words stored one per line in a file.

        Args:
            stop_words_file: Path of the stop-word file, or ``None``.

        Returns:
            A set of stop words; empty when ``stop_words_file`` is ``None``.
        """
        if stop_words_file is None:
            return set()
        # NOTE(review): the file is read with the platform default encoding;
        # confirm the stop-word file's encoding and pass it explicitly.
        with open(stop_words_file, "r") as f:
            return set(f.read().splitlines())

    def get_idx(self, word):
        """Return the last corpus position of ``word`` (KeyError if absent)."""
        return self.word2idx[word]

    def get_word(self, idx):
        """Return the token at corpus position ``idx`` (KeyError if absent)."""
        return self.idx2word[idx]

In [None]:
# Build the corpus vocabulary: first argument is the CSV corpus file, second
# is the stop-words file.
vocab = Vocab("./assets/数学原始数据.csv", "./assets/stopwords.txt")

In [None]:
import torch
import numpy as np
from torch import nn

In [5]:
class Net(nn.Module):
    """Two-layer linear network: vocabulary-sized input -> embedding -> logits.

    The layers are stored in ``self.model`` (an ``nn.ModuleDict``) under the
    keys ``"onehot2embedding"`` and ``"embedding2logits"``.
    """

    def __init__(self, vocab_size, embedding_size):
        """Create the two linear layers.

        Args:
            vocab_size: Size of the input and output feature dimension.
            embedding_size: Size of the hidden (embedding) dimension.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.model = nn.ModuleDict(
            {
                "onehot2embedding": nn.Linear(
                    vocab_size,
                    embedding_size,
                    bias=True,
                ),
                "embedding2logits": nn.Linear(
                    embedding_size,
                    vocab_size,
                    bias=True,
                ),
            }
        )

    def forward(self, x):
        """Map inputs with ``vocab_size`` features to ``vocab_size`` logits.

        Args:
            x: Tensor whose last dimension has size ``vocab_size``.

        Returns:
            Tensor of logits with the same leading dimensions as ``x`` and a
            last dimension of size ``vocab_size``.
        """
        # nn.ModuleDict is only a container and has no forward(), so the
        # layers have to be applied explicitly, in order.
        embedding = self.model["onehot2embedding"](x)
        logits = self.model["embedding2logits"](embedding)
        return logits

In [6]:
# Instantiate the network with the vocabulary size as input/output width and
# an embedding size of 100.
model = Net(vocab.vocab_size, 100)

In [None]:
# Bare expression: as the last line of a notebook cell this displays the
# model's repr.
model

In [57]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset yielding each corpus token together with its context tokens.

    Item ``idx`` is a dict with the token at position ``idx`` and the tokens
    found up to ``ngram - 1`` positions to its left and right (fewer near
    the ends of the corpus).
    """

    def __init__(self, vocab: "Vocab", ngram: int):
        """Store the vocabulary object and the window parameter.

        Args:
            vocab: Object exposing ``vocab`` (token list), ``vocab_size`` and
                ``get_word(idx)``.
            ngram: Window parameter; the context spans ``ngram - 1`` tokens
                on each side of the current token.
        """
        self.vocab_object = vocab
        self.vocab = vocab.vocab
        self.vocab_size = vocab.vocab_size
        self.ngram = ngram

    def __getitem__(self, idx):
        """Return the token at ``idx`` and its surrounding context tokens.

        Args:
            idx: Position of the current token in the corpus.

        Returns:
            ``{"current_word": str, "annother_word": list[str]}`` where the
            list holds the left context followed by the right context.
        """
        current_word = self.vocab_object.get_word(idx)
        # Apply the window offset *before* clamping to 0; clamping first and
        # then adding 1 would drop token 0 from the context near the start.
        left_idx = max(0, idx - self.ngram + 1)
        right_idx = min(self.vocab_size, idx + self.ngram)

        context_words = self.vocab[left_idx:idx] + self.vocab[idx + 1 : right_idx]

        # The (misspelled) key is part of the runtime interface and is kept.
        return {"current_word": current_word, "annother_word": context_words}

    def __len__(self):
        """Return the number of tokens in the corpus."""
        return len(self.vocab)