In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import functools as ft
import os

import joblib as jbl
import pandas as pd
import polars as pl
from tqdm import tqdm

from mts_ml_cup import preprocessing as prep

# Подготовка маппингов

In [3]:
# Scan the raw competition partitions once and collect the values of every
# categorical variable. The helper returns seven collections, unpacked below
# in the order it yields them (presumably sets of unique values - confirm in
# mts_ml_cup.preprocessing.raw).
unique_cat_values = prep.raw.find_unique_cat_variables("../data/raw/competition_data_final_pqt/")
regions, cities, manufacturers, models, types, oss, parts_of_day = unique_cat_values

100%|███████████████████████████████████████████| 11/11 [02:22<00:00, 12.97s/it]


In [5]:
# Give every region a consecutive integer id (starting at 1) in sorted order
# and persist the lookup table.
region_ids = {region: idx for idx, region in enumerate(sorted(regions), start=1)}
jbl.dump(region_ids, "../data/mappings/regions.jbl")

['../data/mappings/regions.jbl']

In [6]:
# Give every city a consecutive integer id (starting at 1) in sorted order
# and persist the lookup table.
city_ids = {city: idx for idx, city in enumerate(sorted(cities), start=1)}
jbl.dump(city_ids, "../data/mappings/cities.jbl")

['../data/mappings/cities.jbl']

In [7]:
# Hand-written manufacturer name -> integer id table.
# Several names deliberately share one id (e.g. "Huawei" and
# "Huawei Device Company Limited" are both 12, "Blackview" and
# "Doke Communication (HK) Limited" are both 6), so different spellings of a
# vendor collapse into a single category.
# NOTE(review): the `manufacturers` collection extracted from the raw data is
# not consulted here - confirm that every raw manufacturer value has an entry
# in this table, and how convert_sessions treats names that do not.
manufacturers_map = {
    "Alcatel": 1,
    "Apple": 2,
    "Asus": 3,
    "Atlas LLC": 4,
    "BQ Devices Limited": 5,
    "Blackview": 6,
    "Doke Communication (HK) Limited": 6,
    "Doogee": 7,
    "Google Inc": 8,
    "HTC": 9,
    "Highscreen": 10,
    "Highscreen Limited": 10,
    "Honor Device Company Limited": 11,
    "Huawei": 12,
    "Huawei Device Company Limited": 12,
    "Itel Technology Limited": 13,
    "LG": 14,
    "LeEco": 15,
    "Lenovo": 16,
    "Meizu": 17,
    "Motorola": 18,
    "Motorola Mobility LLC, a Lenovo Company": 18,
    "Nokia": 19,
    "OnePlus": 20,
    "Oppo": 21,
    "Realme Chongqing Mobile Telecommunications Corp Ltd": 22,
    "Realme Mobile Telecommunications (Shenzhen) Co Ltd": 22,
    "Samsung": 23,
    "Sony": 24,
    "Sony Mobile Communications Inc.": 24,
    "Tecno": 25,
    "Umi Network Technology Co Limited": 26,
    "Vingroup Joint Stock Company": 27,
    "Vivo": 28,
    "Xiaomi": 29,
    "Yandex LLC": 30,
    "ZTE": 31,
}
# Persist the table so later cells / a fresh kernel can load it from disk.
jbl.dump(manufacturers_map, "../data/mappings/manufacturers.jbl")

['../data/mappings/manufacturers.jbl']

In [8]:
# Manufacturer name -> manufacturer id as a two-column frame, ready for joining.
mm = pd.Series(manufacturers_map, name="manufacturer_id").reset_index().rename(columns={"index": "manufacturer"})

# Entries of `models` are expected to look like "<manufacturer>_+_<model>".
# Split on the FIRST separator only: a model name that itself contains "_+_"
# would otherwise yield three fields and break the two-column frame below.
mmm = pd.DataFrame(
    [m.split("_+_", 1) for m in sorted(models)],
    columns=["manufacturer", "model"],
)

# Attach the manufacturer id to every model.
# - `on` is spelled out instead of relying on "all common columns".
# - `how="inner"` (the pandas default, now explicit) drops models whose
#   manufacturer has no entry in `manufacturers_map`.
# - `validate` guards against a manufacturer appearing twice on the right
#   side, which would silently duplicate model rows.
models_map = mmm.merge(mm, on="manufacturer", how="inner", validate="many_to_one")

In [9]:
# Build two composite keys per model: one prefixed with the manufacturer
# name, one prefixed with the (stringified) manufacturer id. Both use the
# same "_+_" separator.
key_separator = "_+_"
model_names = models_map["model"]
manufacturer_ids_as_text = models_map["manufacturer_id"].astype(str)

models_map["manufacturer_model"] = models_map["manufacturer"] + key_separator + model_names
models_map["manufacturer_id_model"] = manufacturer_ids_as_text + key_separator + model_names

In [10]:
# Compare the number of distinct name-based keys with the number of distinct
# id-based keys; equal counts mean that replacing manufacturer names by ids
# did not merge any two models into one key.
tuple(
    models_map[key_column].nunique()
    for key_column in ("manufacturer_model", "manufacturer_id_model")
)

(603, 603)

In [11]:
# Give every raw entry of `models` a consecutive integer id (starting at 1)
# in sorted order and persist the lookup table. The ids are keyed by the
# strings in `models` themselves; `models_map` is not used here.
model_ids = {model: idx for idx, model in enumerate(sorted(models), start=1)}
jbl.dump(model_ids, "../data/mappings/models.jbl")

['../data/mappings/models.jbl']

In [13]:
# Give every device type a consecutive integer id (starting at 1) in sorted
# order and persist the lookup table.
type_ids = {device_type: idx for idx, device_type in enumerate(sorted(types), start=1)}
jbl.dump(type_ids, "../data/mappings/types.jbl")

['../data/mappings/types.jbl']

In [5]:
# Operating system name -> integer id. "Apple iOS" and "iOS" are assigned
# the same id, so both spellings end up in one category.
os_map = {"Android": 1}
os_map.update(dict.fromkeys(("Apple iOS", "iOS"), 2))
jbl.dump(os_map, "../data/mappings/os.jbl")

['../data/mappings/os.jbl']

In [16]:
# Part-of-day name -> integer id, numbered from 1 in the order listed here
# (morning, day, evening, night) rather than alphabetically.
parts_of_day_map = {
    part_name: idx
    for idx, part_name in enumerate(("morning", "day", "evening", "night"), start=1)
}
jbl.dump(parts_of_day_map, "../data/mappings/parts_of_day.jbl")

['../data/mappings/parts_of_day.jbl']

# Подготовка облегченных партиций

In [3]:
# Bind the saved category -> id mappings once, so that every partition is
# converted with exactly the same lookup tables. The mappings are read from
# disk, which lets this cell run on a fresh kernel without the cells above.
convert_sessions_ = ft.partial(
    prep.raw.convert_sessions,
    regions_mapping=jbl.load("../data/mappings/regions.jbl"),
    cities_mapping=jbl.load("../data/mappings/cities.jbl"),
    manufacturers_mapping=jbl.load("../data/mappings/manufacturers.jbl"),
    models_mapping=jbl.load("../data/mappings/models.jbl"),
    types_mapping=jbl.load("../data/mappings/types.jbl"),
    os_mapping=jbl.load("../data/mappings/os.jbl"),
    parts_of_day_mapping=jbl.load("../data/mappings/parts_of_day.jbl"),
)

parts_path = "../data/raw/competition_data_final_pqt/"
processed_parts_path = "../data/processed/sessions"

# The writes below need the target directory to exist; create it up front so
# the cell also works on a fresh checkout instead of failing on the first write.
os.makedirs(processed_parts_path, exist_ok=True)

# Select the partitions before starting the progress bar so that it counts
# only real parquet files, and sort them because os.listdir returns entries
# in arbitrary order.
part_files = sorted(p for p in os.listdir(parts_path) if p.endswith(".parquet"))

for p in tqdm(part_files):
    # The partition index is taken from the second "-"-separated token of the
    # file name (names are expected to look like "part-<index>-...").
    i = int(p.split("-")[1])
    (
        convert_sessions_(
            pl.read_parquet(os.path.join(parts_path, p))
        )
        .write_parquet(f"{processed_parts_path}/part-{i}.parquet")
    )

100%|███████████████████████████████████████████| 11/11 [03:51<00:00, 21.08s/it]


In [4]:
%%time
df = pl.read_parquet("../data/processed/sessions/*")
df.estimated_size("gb")

CPU times: user 1min 15s, sys: 22.6 s, total: 1min 38s
Wall time: 23.1 s


13.44269884750247

In [7]:
# Store the combined sessions frame as one parquet file.
merged_sessions_path = "../data/processed/sessions.pq"
df.write_parquet(merged_sessions_path)

# Подготовка трейна и теста

In [8]:
# Read the raw public train file, run it through the project's train
# converter and store the result as parquet.
train_raw = pl.read_parquet("../data/raw/public_train.pqt")
train_processed = prep.raw.convert_train(train_raw)
train_processed.write_parquet("../data/processed/train.pq")

In [9]:
# Read the raw submission file, run it through the project's test converter
# and store the result as parquet.
test_raw = pl.read_parquet("../data/raw/submit_2.pqt")
test_processed = prep.raw.convert_test(test_raw)
test_processed.write_parquet("../data/processed/test.pq")