In [1]:
import json
import pickle

import torch
import numpy as np

from spacy.lang.en import English
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader
from torch import nn
from tqdm.notebook import tqdm

from utils.preprocessing import spacy_tokenize, dummy_fn
from utils.autoskill_torch import SkillDataset, collate_fn

In [2]:
# Load the label mapping and the raw dataset, then carve out a hold-out split.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the result
# does not depend on the platform's locale default used by a bare open().
with open('data/labels.json', 'r', encoding='utf-8') as f:
    labels_map = json.load(f)

with open('data/dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# 20% of the records go to the test split; the fixed random_state keeps the
# partition identical from run to run.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [3]:
# Number of examples in each split, displayed as (n_train, n_test).
tuple(len(split) for split in (train, test))

(8452, 2113)

In [4]:
# Tokenizer taken from a freshly constructed spaCy `English` pipeline.
tokenizer = English().tokenizer

# Load the pre-fitted TF-IDF vectorizer. The context manager closes the file
# handle as soon as unpickling finishes instead of leaving it to the garbage
# collector.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that were produced by this project.
# NOTE(review): the vectorizer's repr shows it references `dummy_fn`, so that
# function presumably has to be importable for unpickling to work - confirm.
with open("models/tfidf_3_08_300.pkl", 'rb') as f:
    tfidf = pickle.load(f)

# Show the vectorizer's configuration as the cell output.
tfidf

TfidfVectorizer(lowercase=False, max_df=0.8, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x0000020E13D974C0>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x0000020E13D974C0>)

In [5]:
# Wrap the training split in a SkillDataset; the label map is passed
# positionally, the tokenizer and TF-IDF model by keyword.
train_dataset = SkillDataset(
    train,
    labels_map,
    tokenizer=tokenizer,
    tfidf_model=tfidf,
)

# Displayed as the cell output: number of items the dataset exposes.
len(train_dataset)

8452

In [6]:
# Peek at one arbitrary training example. The item is fetched once and reused,
# so the dataset's __getitem__ is not invoked twice for the same index and the
# printed features and labels are guaranteed to come from the same lookup.
example = train_dataset[42]

print(example[0].shape, '\n')  # x_vec
print(example[1], '\n')  # y labels: ([midas_id, entity_id], midas_and_entity_id)

(1023,) 

([2, 10], 17) 



In [7]:
# Wrap the hold-out split in a SkillDataset, reusing the same label map,
# tokenizer and TF-IDF model objects.
test_dataset = SkillDataset(
    test,
    labels_map,
    tokenizer=tokenizer,
    tfidf_model=tfidf,
)

# Displayed as the cell output: number of items the dataset exposes.
len(test_dataset)

2113

In [8]:
# Mini-batch loaders over both splits; `collate_fn` assembles each batch.
# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(
    train_dataset, batch_size=32,
    shuffle=True, collate_fn=collate_fn)

# The hold-out loader is NOT shuffled: a fixed iteration order keeps test-time
# passes reproducible from run to run, and shuffling brings no benefit when
# the data is only being evaluated.
test_loader = DataLoader(
    test_dataset, batch_size=32,
    shuffle=False, collate_fn=collate_fn)

In [9]:
# Pull exactly one batch from the training loader for inspection.
# next(iter(...)) fails immediately (StopIteration) when the loader is empty,
# rather than leaving `batch` unbound or silently reusing a stale value from
# earlier kernel state, as a `for ...: break` loop would.
batch = next(iter(train_loader))

# Shapes of the three batch components, displayed as the cell output.
batch[0].shape, batch[1].shape, batch[2].shape

(torch.Size([32, 1023]), torch.Size([32, 2]), torch.Size([32]))

In [10]:
# Smoke test: iterate over the whole training loader once, advancing the bar
# by the first dimension of `x` for every batch. Using the bar as a context
# manager guarantees it is closed even if iteration raises part-way through.
with tqdm(total=len(train_loader.dataset), desc='Testing') as progress_bar:
    for x, y_m, y_s in train_loader:
        progress_bar.update(x.size(0))

Testing:   0%|          | 0/8452 [00:00<?, ?it/s]

In [11]:
# Smoke test: iterate over the whole hold-out loader once, advancing the bar
# by the first dimension of `x` for every batch. Using the bar as a context
# manager guarantees it is closed even if iteration raises part-way through.
with tqdm(total=len(test_loader.dataset), desc='Testing') as progress_bar:
    for x, y_m, y_s in test_loader:
        progress_bar.update(x.size(0))

Testing:   0%|          | 0/2113 [00:00<?, ?it/s]