In [None]:
import pandas as pd
import re
import wandb
import jieba
from collections import Counter
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torchtext
from torch.utils.data import Dataset, DataLoader

# Pick the GPU only when CUDA is actually usable. `is_available` must be
# *called*: the bare function object is always truthy, which would select
# 'cuda' even on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def build_vocab(df, stopwords):
    """Build a torchtext vocabulary from the ``title`` column of ``df``.

    Every title is reduced to characters in the range U+4E00-U+9FFF,
    segmented with ``jieba.cut`` and filtered against ``stopwords``. The
    surviving tokens are counted and handed to ``torchtext.vocab.vocab``
    together with the ``'<unk>'`` special token.

    Args:
        df: Object whose ``df['title']`` is an iterable of title strings.
            Entries that are not ``str`` (e.g. NaN for empty CSV cells) are
            skipped.
        stopwords: Iterable of tokens to exclude from the vocabulary.

    Returns:
        The vocabulary produced by ``torchtext.vocab.vocab``. Unknown tokens
        resolve to the index of ``'<unk>'`` instead of raising.
    """
    # Set membership is O(1); the caller may pass a plain list.
    stopword_set = set(stopwords)
    counter = Counter()
    print('building vocab...')
    for title in tqdm(df['title']):
        if not isinstance(title, str):
            # Missing titles cannot be passed to re.sub; nothing to count.
            continue
        # Drop everything that is not in the U+4E00-U+9FFF range.
        title = re.sub(r'[^\u4e00-\u9fff]', '', title)
        tokens = [token for token in jieba.cut(title.strip()) if token not in stopword_set]
        counter.update(tokens)
    vocab = torchtext.vocab.vocab(counter, specials=['<unk>'])
    # Without a default index, looking up an out-of-vocabulary token raises.
    vocab.set_default_index(vocab['<unk>'])
    return vocab

In [None]:
# Load the table of news titles; later cells read its 'title' column.
df = pd.read_csv('../../datasets/THUCNews/title.csv')

# Read one stopword per line. The encoding is given explicitly so the result
# does not depend on the platform's default locale encoding.
# NOTE(review): assumes the stopword file is UTF-8 encoded - confirm.
with open('../stopwords/cn_stopwords.txt', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f]

vocab = build_vocab(df, stopwords)

In [None]:
class MyDataset(Dataset):
    """(centre word, neighbouring word) index pairs built from titles.

    Each title is reduced to characters in the range U+4E00-U+9FFF,
    segmented with ``jieba.cut`` and filtered against ``stopwords``. For
    every interior token position ``i`` two samples are produced:
    ``(tokens[i], tokens[i - 1])`` and ``(tokens[i], tokens[i + 1])``.
    The first and last token of a title are only ever used as labels, so
    titles with fewer than three tokens contribute no samples.

    Args:
        df: Object whose ``df['title']`` is an iterable of title strings.
            Entries that are not ``str`` are skipped.
        vocab: Mapping-like object; ``vocab[token]`` must return an int index.
        stopwords: Iterable of tokens to drop before pairing.

    Attributes:
        inputs: List of one-element lists holding the centre-word index.
        labels: List of one-element lists holding the neighbour index;
            always the same length as ``inputs``.
    """

    def __init__(self, df, vocab, stopwords):
        super().__init__()
        print('building dataset...')
        # Set membership is O(1); the caller may pass a plain list.
        stopword_set = set(stopwords)
        self.inputs = []
        self.labels = []
        for title in tqdm(df['title']):
            if not isinstance(title, str):
                # Missing titles cannot be passed to re.sub; skip them.
                continue
            title = re.sub(r'[^\u4e00-\u9fff]', '', title)
            tokens = [token for token in jieba.cut(title.strip()) if token not in stopword_set]
            if len(tokens) < 3:
                # No interior position exists, so there is nothing to emit.
                continue
            # Look every token up once instead of once per pair.
            ids = [vocab[token] for token in tokens]
            for i in range(1, len(ids) - 1):
                # Pair with the left neighbour.
                self.inputs.append([ids[i]])
                self.labels.append([ids[i - 1]])
                # Pair with the right neighbour. The neighbour belongs in
                # `labels`; putting it in `inputs` would misalign the lists.
                self.inputs.append([ids[i]])
                self.labels.append([ids[i + 1]])

    def __len__(self):
        """Return the number of (input, label) pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two one-element ``LongTensor``s."""
        return torch.LongTensor(self.inputs[idx]), torch.LongTensor(self.labels[idx])

In [None]:
# Hold out 30% of the rows for validation. A fixed random_state makes the
# split identical on every Restart & Run All, so runs are comparable.
df_train, df_valid = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
# Materialise the sample pairs for each split, sharing the same vocabulary
# and stopword list.
train_dataset = MyDataset(df=df_train, vocab=vocab, stopwords=stopwords)
valid_dataset = MyDataset(df=df_valid, vocab=vocab, stopwords=stopwords)

In [None]:
# Batch both splits; only the training loader shuffles. Incomplete final
# batches are dropped for both.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=1024,
    shuffle=True,
    drop_last=True,
)
valid_dataloader = DataLoader(
    dataset=valid_dataset,
    batch_size=1024,
    shuffle=False,
    drop_last=True,
)

In [None]:
# Sanity check: pull one (inputs, labels) batch from the training loader.
x, y = next(iter(train_dataloader))

In [None]:
# Display the shapes of the batch fetched above.
# NOTE(review): with one-element samples and batch_size=1024 both are
# presumably (1024, 1) - confirm against the rendered output.
x.shape, y.shape

In [None]:
class Word2Vec(nn.Module):
    """Embedding lookup followed by ReLU and a linear projection to vocab logits.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits.
        hidden_size: Width of each embedding vector.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        # Keep this construction order: parameter initialisation consumes
        # the RNG in the order the layers are created.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_size)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map integer indices ``x`` to logits; a trailing ``vocab_size`` axis is added."""
        hidden = self.relu(self.embedding(x))
        return self.fc(hidden)
        

In [None]:
# Instantiate the network on the target device, then create the loss and the
# optimizer over the (already moved) parameters.
model = Word2Vec(vocab_size=len(vocab), hidden_size=128).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
wandb.init(project='Word2Vec')

for epoch in range(50):
    # ---- training pass -------------------------------------------------
    model.train()
    total_train_loss = 0
    num_x = 0
    for x, y in tqdm(train_dataloader):
        x = x.to(device)
        y = y.to(device)
        batch_size = x.shape[0]
        num_x += batch_size
        output = model(x)
        # Flatten to (N, vocab) logits and (N,) targets. Unlike an in-place
        # squeeze_(), this keeps the batch axis when a batch has one sample.
        loss = criterion(output.reshape(-1, output.shape[-1]), y.reshape(-1))
        # `loss` is the batch mean; weight it by the batch size so dividing
        # by the sample count below gives a true per-sample average.
        total_train_loss += loss.item() * batch_size

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # max() guards against a loader that yields no batches.
    train_loss = total_train_loss / max(num_x, 1)

    # ---- validation pass -----------------------------------------------
    model.eval()
    total_valid_loss = 0
    num_x = 0
    # No gradients are needed for evaluation; skip building the graph.
    with torch.no_grad():
        for x, y in tqdm(valid_dataloader):
            x = x.to(device)
            y = y.to(device)
            batch_size = x.shape[0]
            num_x += batch_size
            output = model(x)
            loss = criterion(output.reshape(-1, output.shape[-1]), y.reshape(-1))
            total_valid_loss += loss.item() * batch_size
    valid_loss = total_valid_loss / max(num_x, 1)

    wandb.log({'train loss': train_loss, 'valid loss': valid_loss })