In [None]:
import jieba
import pandas as pd
import torch
import numpy as np
from torch import nn

In [2]:
class Vocab:
    """Word-level vocabulary built from the first column of a CSV corpus.

    Every row of the CSV is segmented with ``jieba.cut``. Tokens found in the
    stop-word set are dropped; the remaining tokens are kept in corpus order
    and every distinct token is given an integer index.

    Attributes:
        stop_words: Set of tokens removed during segmentation (empty when no
            stop-word file was given).
        stop_words_file: The same set as ``stop_words``. Despite its name it
            holds the loaded words, not a path; kept for backward
            compatibility.
        idx2word: Mapping from integer index to token.
        word2idx: Mapping from token to integer index.
        words: All kept tokens in corpus order, duplicates included.
        vocab_size: ``len(words)`` -- the total number of kept tokens, NOT the
            number of distinct tokens (which is ``len(word2idx)``).
    """

    def __init__(
        self,
        vocab_file,
        stop_words_file=None,
        vocab_encoding="GBK",
        stop_words_encoding=None,
    ):
        """Load the stop words (if any), then segment and index the corpus.

        Args:
            vocab_file: Path of the header-less CSV corpus.
            stop_words_file: Optional path of a text file with one stop word
                per line. ``None`` disables stop-word filtering.
            vocab_encoding: Text encoding of ``vocab_file``.
            stop_words_encoding: Text encoding of ``stop_words_file``;
                ``None`` uses the platform default of ``open``.
        """
        # Must be loaded first: load_vocab reads self.stop_words_file.
        self.stop_words = self.load_stop_words(stop_words_file, stop_words_encoding)
        self.stop_words_file = self.stop_words
        self.idx2word, self.word2idx, self.words = self.load_vocab(
            vocab_file, vocab_encoding
        )
        self.vocab_size = len(self.words)

    def load_vocab(self, vocab_file, encoding="GBK"):
        """Segment the corpus and build the index mappings.

        Args:
            vocab_file: Path of the header-less CSV; only column 0 is read.
            encoding: Text encoding passed to ``pandas.read_csv``.

        Returns:
            Tuple ``(idx2word, word2idx, words)``.
        """
        # dtype=str keeps purely numeric rows as text so jieba always
        # receives a string.
        contents = pd.read_csv(vocab_file, encoding=encoding, header=None, dtype=str)
        stop_words = self.stop_words_file

        words = []
        # dropna() skips rows whose first field is missing (NaN).
        for line in contents.iloc[:, 0].dropna():
            # With an empty stop-word set this filter keeps every token, so a
            # single code path covers both the filtered and unfiltered case.
            words.extend(
                token for token in jieba.cut(line) if token not in stop_words
            )

        # dict.fromkeys de-duplicates while keeping first-occurrence order, so
        # indices are reproducible across runs (iterating a set of strings is
        # not, because of string hash randomisation).
        word2idx = {word: idx for idx, word in enumerate(dict.fromkeys(words))}
        idx2word = {idx: word for word, idx in word2idx.items()}
        return idx2word, word2idx, words

    def load_stop_words(self, stop_words_file, encoding=None):
        """Read the stop-word file into a set, one entry per line.

        Args:
            stop_words_file: Path of the stop-word file, or ``None``.
            encoding: Text encoding passed to ``open``; ``None`` uses the
                platform default.

        Returns:
            Set of lines of the file, or an empty set when
            ``stop_words_file`` is ``None``.
        """
        if stop_words_file is None:
            return set()
        with open(stop_words_file, "r", encoding=encoding) as f:
            return set(f.read().splitlines())

    def get_idx(self, word):
        """Return the index of ``word``; raises ``KeyError`` if it is unknown."""
        return self.word2idx[word]

    def get_word(self, idx):
        """Return the token stored at ``idx``; raises ``KeyError`` if absent."""
        return self.idx2word[idx]

In [None]:
# Build the vocabulary from the raw corpus, filtering tokens with the stop-word list.
vocab = Vocab(
    vocab_file="./assets/数学原始数据.csv",
    stop_words_file="./assets/stopwords.txt",
)

In [33]:
class MyDataset(torch.utils.data.Dataset):
    """Sliding-window dataset over the token sequence ``vocab.words``.

    Item ``i`` is centred on ``vocab.words[i + ngram]`` and is returned with
    the ``ngram`` tokens on each side of it, so the first and last ``ngram``
    tokens of the corpus are never used as a centre word.
    """

    def __init__(self, vocab: "Vocab", ngram: int):
        """Store the vocabulary and the one-sided context width.

        Args:
            vocab: Object exposing the token list as ``vocab.words``.
            ngram: Number of context tokens taken on each side of the centre.
        """
        self.vocab = vocab
        self.ngram = ngram

    def __len__(self):
        """Return the number of centre positions that have a full window."""
        # Clamped at 0 so a corpus shorter than one full window yields an
        # empty dataset instead of a negative length.
        return max(0, len(self.vocab.words) - 2 * self.ngram)

    def __getitem__(self, idx):
        """Return the centre word and its context for window ``idx``.

        Args:
            idx: Window index; negative values count from the end.

        Returns:
            Dict with ``"current_word"`` (the centre token) and
            ``"another_words"`` (the ``ngram`` tokens before it followed by
            the ``ngram`` tokens after it).

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            # Normalise first so the context window below is computed from
            # the same position as the centre word.
            idx += size
        if not 0 <= idx < size:
            raise IndexError("MyDataset index out of range")

        words = self.vocab.words
        center = idx + self.ngram
        another_words = (
            words[center - self.ngram : center]
            + words[center + 1 : center + self.ngram + 1]
        )
        return {"current_word": words[center], "another_words": another_words}

In [34]:
# Windowed dataset: each item has 2 context tokens on either side of the centre word.
data = MyDataset(vocab=vocab, ngram=2)

In [6]:
class Net(nn.Module):
    """Two linear layers: ``vocab_size -> embedding_size -> vocab_size``.

    The layers are stored in an ``nn.ModuleDict`` under the keys
    ``"onehot2embedding"`` and ``"embedding2logits"`` and are applied in that
    order by ``forward``.
    """

    def __init__(self, vocab_size, embedding_size):
        """Create both linear layers.

        Args:
            vocab_size: Input width of the first layer and output width of
                the second.
            embedding_size: Width of the intermediate representation.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.model = nn.ModuleDict(
            {
                "onehot2embedding": nn.Linear(
                    vocab_size,
                    embedding_size,
                    bias=True,
                ),
                "embedding2logits": nn.Linear(
                    embedding_size,
                    vocab_size,
                    bias=True,
                ),
            }
        )

    def forward(self, x):
        """Project ``x`` to the embedding space and back to vocabulary logits.

        Args:
            x: Float tensor whose last dimension is ``vocab_size``
                (presumably one-hot encoded words, going by the layer
                name -- TODO confirm against the training loop).

        Returns:
            Tensor of unnormalised scores with last dimension ``vocab_size``.
        """
        # nn.ModuleDict is only a container and defines no forward(); calling
        # self.model(x) raises NotImplementedError, so the layers are applied
        # explicitly in order.
        embedding = self.model["onehot2embedding"](x)
        logits = self.model["embedding2logits"](embedding)
        return logits

In [7]:
# NOTE(review): vocab.vocab_size is len(vocab.words) (total token count), not the
# number of distinct tokens (len(vocab.word2idx)) -- confirm this is the intended width.
model = Net(vocab_size=vocab.vocab_size, embedding_size=100)

In [None]:
# Bare expression as the last line of the cell: the notebook displays the repr of `model`.
model