In [1]:
import sys
import os

# Make the project root (the parent of the current working directory)
# importable, so the later `from src...` imports can be resolved.
# The membership check keeps this cell idempotent: re-running it does not
# pile duplicate entries onto sys.path.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import pandas as pd
import pyarrow.parquet as pq
from pandas import DataFrame

In [3]:
from sklearn.pipeline import Pipeline
from src.parquet_zavod import parquet_zavod
from src.transformations import (
    attr_transformer,
    bert_64_transformer,
    categories_transformer,
)
from transformers import BertModel, BertTokenizer

# The tokenizer and the encoder must come from the same pretrained checkpoint,
# so the identifier is spelled out once and reused.
BERT_CHECKPOINT = "sergeyzh/rubert-tiny-turbo"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Names of the two columns handled by the pipeline.
CATEGORIES_COLUMN = "categories"
ATTRIBUTES_COLUMN = "characteristic_attributes_mapping"

# Steps run in the listed order: both `*_initial` steps come first, then the
# `*_embed` steps for the same columns, which share the model and tokenizer
# loaded above.
# NOTE(review): what each transformer does internally lives in
# src.transformations and is not visible here — confirm there.
pipeline = Pipeline(
    steps=[
        ("categories_initial", categories_transformer(CATEGORIES_COLUMN)),
        ("attr_initial", attr_transformer(ATTRIBUTES_COLUMN)),
        ("categories_embed", bert_64_transformer(model, tokenizer, CATEGORIES_COLUMN)),
        ("attr_embed", bert_64_transformer(model, tokenizer, ATTRIBUTES_COLUMN)),
    ]
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from pandas import DataFrame


def load_data(path, batch_size=64):
    """Read only the first record batch of a Parquet file into a DataFrame.

    The file is opened with ``pq.ParquetFile`` and just the leading batch is
    converted, so the whole file is not turned into a DataFrame.

    Args:
        path: Location of the Parquet file, passed to ``pq.ParquetFile``.
        batch_size: Maximum number of rows to request for the batch.
            Defaults to 64, the value that used to be hard-coded.

    Returns:
        A pandas DataFrame built from the first batch. If the file yields no
        batches at all, an empty DataFrame built from the file's Arrow schema
        is returned instead.
    """
    parquet_file = pq.ParquetFile(path)

    # A default of None separates "no batches" from a real batch and keeps a
    # bare StopIteration from escaping to the caller.
    first_batch = next(parquet_file.iter_batches(batch_size=batch_size), None)
    if first_batch is None:
        return parquet_file.schema_arrow.empty_table().to_pandas()

    df: DataFrame = first_batch.to_pandas()
    return df

# Load a small sample of the training set; the next cell uses it to infer
# the schema of the transformed data.
df = load_data(path="../data/train/merged_data.parquet")


In [5]:
# Run the sample through the pipeline, then let Arrow infer the schema of
# the transformed frame.
# NOTE(review): the schema is inferred from the sample rows only — confirm
# the sample is representative of every column's type.
sample_transformed = pipeline.transform(df)
schema = pa.Table.from_pandas(sample_transformed).schema

In [6]:
# Display the inferred Arrow schema so the column types can be checked
# before it is passed to the full conversion run.
schema

variantid: int64
categories: list<item: float>
  child 0, item: float
characteristic_attributes_mapping: list<item: float>
  child 0, item: float
main_pic_embeddings_resnet_v1: list<item: list<item: double>>
  child 0, item: list<item: double>
      child 0, item: double
pic_embeddings_resnet_v1: list<item: list<item: double>>
  child 0, item: list<item: double>
      child 0, item: double
name: string
description: string
name_bert_64: list<item: double>
  child 0, item: double
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1363

In [7]:
# Process the full training file with the pipeline's transform and the
# schema inferred above.
# NOTE(review): presumably the first path is read and the second is written,
# and 10_000 is a chunk/batch size — confirm against src.parquet_zavod.
source_path = "../data/train/merged_data.parquet"
target_path = "../data/train/merged_data_prikol.parquet"
parquet_zavod(source_path, target_path, schema, pipeline.transform, 10_000)

ZOVod started...


Processing: 100%|██████████| 2252569/2252569 [1:07:42<00:00, 554.45rows/s]
