# 0. Import stuff

In [2]:
import pandas as pd
import numpy as np

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch

# 1. Load data

In [1]:
# Colab-specific helper used to attach Google Drive to this runtime.
from google.colab import drive

# Mount Drive at /content/drive so that its files can be opened by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# Load the source table from a parquet file into a DataFrame.
# NOTE(review): the relative path presumably resolves under the Drive mount
# (i.e. the working directory is /content) — confirm.
df = pd.read_parquet('./drive/MyDrive/AAA/course/avito_cv2vac_with_ranks.pq')

# 2. Make "embeddings"

In [14]:
# Placeholder "embeddings": every row gets its own random vector of size 128.
for embed_column in ('vac_embed', 'res_embed'):
    # Create the column first so that .apply has one value per row to map over.
    df.loc[:, embed_column] = 1
    df.loc[:, embed_column] = df[embed_column].apply(lambda _: torch.rand((128)))

# 3. Make SiameseDataset and collate_fn

In [30]:
class SiameseDataset(Dataset):
    """
    Map-style dataset of (vacancy, resume) samples for Siamese Net training.

    Every row of the source dataframe is one sample; samples are addressed
    by their position in the dataframe.
    """

    def __init__(self, df: pd.DataFrame, vac_embed_column: str,
                 res_embed_column: str, res_des_column: str,
                 label_column: str, rank_column: str,
                 max_rank_column: str) -> None:
        """
        Create dataset for Siamese Net training.

        Parameters
        ----------
        df : pd.DataFrame
            the dataframe we create dataset from
        vac_embed_column : str
            name of the column of the vacancy embeddings
        res_embed_column : str
            name of the column of the resume embeddings
        res_des_column : str
            name of the column of the resume text description
        label_column : str
            name of the column of the labels
        rank_column : str
            name of the column of the resume rank
        max_rank_column : str
            name of the column of the max resumes rank for this vacancy

        Returns
        -------
        None
        """
        self.vac_embed_column = vac_embed_column
        self.res_embed_column = res_embed_column
        self.label_column = label_column
        self.rank_column = rank_column
        self.max_rank_column = max_rank_column

        # Keep only the columns that __getitem__ reads.
        self.df = df[[vac_embed_column, res_embed_column, label_column,
                      rank_column, max_rank_column]]

        # Number of distinct resume descriptions. It matches the number of
        # rows only if every resume is paired with exactly one vacancy (and
        # no description is missing), so it is kept for reference only and
        # __len__ does not rely on it.
        self.nunique_pairs = df[res_des_column].nunique()

    def __len__(self) -> int:
        """
        Return the number of samples (rows) in the dataset.

        The row count is used instead of the number of unique resume
        descriptions so that every row reachable through __getitem__ is
        covered, even when a description repeats or is missing.

        Returns
        -------
        int
            total amount of (vac_embed, res_embed) samples
        """
        return len(self.df)

    def __getitem__(self, idx: int) -> tuple:
        """
        Return training object: (vac_embed, res_embed, label, rank, max_rank).

        rank and max_rank are returned alongside the label so that the
        training code can weight the most appropriate samples more.

        Parameters
        ----------
        idx : int
            positional index of the sample we want to get

        Returns
        -------
        tuple
            (vac_embed, res_embed, label, rank, max_rank); the two embeddings
            are returned exactly as stored in the dataframe, the other three
            values are wrapped in torch.tensor

        Raises
        ------
        IndexError
            if idx is outside of the dataframe (raised by DataFrame.iloc)
        """
        row = self.df.iloc[idx]

        return (
            row[self.vac_embed_column],
            row[self.res_embed_column],
            torch.tensor(row[self.label_column]),
            torch.tensor(row[self.rank_column]),
            torch.tensor(row[self.max_rank_column]),
        )

In [31]:
def collate_fn(data):
    """
     Regroup a list of per-sample tuples into a dict of per-field tuples.

     Parameters
     ----------
       data: list of tuples (vac_embed, res_embed, label, rank, max_rank),
             one tuple per sample of the batch

     Returns
     -------
       dict
           keys 'vac_embed', 'res_embed', 'label', 'rank', 'max_rank';
           each value is a tuple with one entry per sample (nothing is stacked)
    """
    # Transpose: N tuples of 5 fields -> 5 tuples of N values.
    vacancies, resumes, labels, ranks, max_ranks = zip(*data)

    return dict(
        vac_embed=vacancies,
        res_embed=resumes,
        label=labels,
        rank=ranks,
        max_rank=max_ranks,
    )

# 4. Create dataset and dataloader instances

In [32]:
# Build the dataset, naming each dataframe column explicitly.
dataset = SiameseDataset(
    df=df,
    vac_embed_column='vac_embed',
    res_embed_column='res_embed',
    res_des_column='res_des',
    label_column='label',
    rank_column='rank',
    max_rank_column='max_rank',
)

In [35]:
# Shuffled batches of 128 samples, assembled into dicts by collate_fn.
dataloader = DataLoader(
    dataset,
    batch_size=128,
    shuffle=True,
    collate_fn=collate_fn,
)

# 5. Get a batch and inspect it

In [36]:
# Pull the first batch from a fresh iterator over the dataloader.
batch = next(iter(dataloader))

In [37]:
# the batch is a dictionary

type(batch)

dict

In [38]:
# its keys

batch.keys()

dict_keys(['vac_embed', 'res_embed', 'label', 'rank', 'max_rank'])

In [39]:
# each key holds a tuple of length batch_size

type(batch['vac_embed'])

tuple

In [40]:
# number of samples stored under one key
len(batch['vac_embed'])

128

In [41]:
# each element is already what its key says it is

type(batch['vac_embed'][0])

torch.Tensor

In [43]:
# dimensionality of the embedding

batch['vac_embed'][0].shape

torch.Size([128])