In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import functools as ft
import os

import joblib as jbl
import pandas as pd
import polars as pl
from tqdm import tqdm

from mts_ml_cup import preprocessing as prep

# Подготовка маппингов

In [3]:
# Collect the distinct values of every categorical column across all raw
# partitions. The sets are turned into integer id mappings in later cells.
regions = set()
cities = set()
manufacturers = set()
models = set()
oss = set()  # gathered here; no mapping is built from it in the visible cells
parts_of_day = set()

parts_path = "../data/raw/competition_data_final_pqt/"

# Only these columns are inspected below, so read just them instead of
# loading every column of each partition.
categorical_columns = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "cpe_model_os_type",
    "part_of_day",
]

for p in tqdm(os.listdir(parts_path)):
    # The directory may contain non-parquet entries; skip them.
    if not p.endswith(".parquet"):
        continue
    part = (
        pl.read_parquet(os.path.join(parts_path, p), columns=categorical_columns)
        .with_columns(
            [
                # Qualify the city with its region and the model with its
                # manufacturer (joined by "_+_"), so equal names under
                # different parents remain distinct values.
                pl.concat_str(
                    [pl.col("region_name"), pl.col("city_name")],
                    sep="_+_",
                ).alias("city_name"),
                pl.concat_str(
                    [pl.col("cpe_manufacturer_name"), pl.col("cpe_model_name")],
                    sep="_+_",
                ).alias("cpe_model_name"),
            ]
        )
    )
    regions |= set(part["region_name"].unique())
    cities |= set(part["city_name"].unique())
    manufacturers |= set(part["cpe_manufacturer_name"].unique())
    models |= set(part["cpe_model_name"].unique())
    oss |= set(part["cpe_model_os_type"].unique())
    parts_of_day |= set(part["part_of_day"].unique())

100%|███████████████████████████████████████████| 11/11 [01:41<00:00,  9.25s/it]


In [4]:
# Manufacturer name -> integer id. Names listed in the same tuple share one id;
# ids are assigned from 1 in the order the tuples appear.
manufacturers_map = {
    name: manufacturer_id
    for manufacturer_id, aliases in enumerate(
        (
            ("Alcatel",),
            ("Apple",),
            ("Asus",),
            ("Atlas LLC",),
            ("BQ Devices Limited",),
            ("Blackview", "Doke Communication (HK) Limited"),
            ("Doogee",),
            ("Google Inc",),
            ("HTC",),
            ("Highscreen", "Highscreen Limited"),
            ("Honor Device Company Limited",),
            ("Huawei", "Huawei Device Company Limited"),
            ("Itel Technology Limited",),
            ("LG",),
            ("LeEco",),
            ("Lenovo",),
            ("Meizu",),
            ("Motorola", "Motorola Mobility LLC, a Lenovo Company"),
            ("Nokia",),
            ("OnePlus",),
            ("Oppo",),
            (
                "Realme Chongqing Mobile Telecommunications Corp Ltd",
                "Realme Mobile Telecommunications (Shenzhen) Co Ltd",
            ),
            ("Samsung",),
            ("Sony", "Sony Mobile Communications Inc."),
            ("Tecno",),
            ("Umi Network Technology Co Limited",),
            ("Vingroup Joint Stock Company",),
            ("Vivo",),
            ("Xiaomi",),
            ("Yandex LLC",),
            ("ZTE",),
        ),
        start=1,
    )
    for name in aliases
}

In [6]:
# Create the mappings directory if it is missing, so this dump (and the ones
# in the following cells) does not fail on a fresh checkout.
os.makedirs("../data/mappings", exist_ok=True)
# Persist the manufacturer name -> id mapping.
jbl.dump(manufacturers_map, "../data/mappings/manufacturers.jbl")

['../data/mappings/manufacturers.jbl']

In [7]:
# Number the regions from 1 in sorted order and persist the name -> id mapping.
jbl.dump(
    {region: region_id for region_id, region in enumerate(sorted(regions), start=1)},
    "../data/mappings/regions.jbl",
)

['../data/mappings/regions.jbl']

In [8]:
# Number the region-qualified city keys from 1 in sorted order and persist the
# key -> id mapping.
jbl.dump(
    {city: city_id for city_id, city in enumerate(sorted(cities), start=1)},
    "../data/mappings/cities.jbl",
)

['../data/mappings/cities.jbl']

In [9]:
# Manufacturer name -> id lookup as a two-column frame (manufacturer, manufacturer_id).
mm = (
    pd.Series(manufacturers_map, name="manufacturer_id")
    .reset_index()
    .rename(columns={"index": "manufacturer"})
)
# Split every "manufacturer_+_model" key back into its two parts. maxsplit=1
# keeps a model name that itself contains "_+_" in one piece instead of
# producing a third field that would not fit the two-column frame.
mmm = pd.DataFrame(
    [key.split("_+_", 1) for key in sorted(models)],
    columns=["manufacturer", "model"],
)
# Inner join on the manufacturer name: models whose manufacturer has no entry
# in manufacturers_map are dropped from the result.
models_map = mmm.merge(mm, on="manufacturer", how="inner")

In [10]:
# Two composite keys per model: one prefixed with the raw manufacturer name,
# the other with the manufacturer id rendered as a string.
models_map["manufacturer_model"] = models_map["manufacturer"].str.cat(
    models_map["model"], sep="_+_"
)
models_map["manufacturer_id_model"] = (
    models_map["manufacturer_id"].astype(str).str.cat(models_map["model"], sep="_+_")
)

In [11]:
# Distinct key counts by manufacturer name vs. by manufacturer id. Equal counts
# mean that manufacturers sharing an id did not collapse two different models.
tuple(
    models_map[key_column].nunique()
    for key_column in ("manufacturer_model", "manufacturer_id_model")
)

(603, 603)

In [12]:
# Number the manufacturer-qualified model keys from 1 in sorted order and
# persist the key -> id mapping.
jbl.dump(
    {model: model_id for model_id, model in enumerate(sorted(models), start=1)},
    "../data/mappings/models.jbl",
)

['../data/mappings/models.jbl']

In [13]:
# Part-of-day label -> integer id, numbered from 1 in the listed order.
parts_of_day_map = {
    label: label_id
    for label_id, label in enumerate(("morning", "day", "evening", "night"), start=1)
}

In [14]:
# Persist the part-of-day label -> id mapping.
jbl.dump(
    value=parts_of_day_map,
    filename="../data/mappings/parts_of_day.jbl",
)

['../data/mappings/parts_of_day.jbl']

# Легкие партиции

In [15]:
# Keyword argument of prep.prepare_sessions -> file holding the saved mapping.
mapping_files = {
    "regions_mapping": "../data/mappings/regions.jbl",
    "cities_mapping": "../data/mappings/cities.jbl",
    "manufacturers_mapping": "../data/mappings/manufacturers.jbl",
    "models_mapping": "../data/mappings/models.jbl",
    "parts_of_day_mapping": "../data/mappings/parts_of_day.jbl",
}

# Bind the mappings as loaded back from disk, so that prepare_sessions_ only
# needs the remaining argument(s) (a partition path in the loop that uses it).
prepare_sessions_ = ft.partial(
    prep.prepare_sessions,
    **{keyword: jbl.load(path) for keyword, path in mapping_files.items()},
)

In [17]:
parts_path = "../data/raw/competition_data_final_pqt/"

# Create the output directory up front so the per-partition writes below do
# not fail on a fresh checkout where it does not exist yet.
os.makedirs("../data/processed/sessions", exist_ok=True)

for part in tqdm(os.listdir(parts_path)):
    # The directory may contain non-parquet entries; skip them.
    if not part.endswith(".parquet"):
        continue
    # The partition index is the second "-"-separated token of the file name.
    i = int(part.split("-")[1])
    # Apply the project preprocessing with the id mappings bound earlier.
    p = prepare_sessions_(os.path.join(parts_path, part))
    p.write_parquet(f"../data/processed/sessions/part-{i}.parquet")

100%|███████████████████████████████████████████| 11/11 [03:08<00:00, 17.15s/it]


In [18]:
%%time
# Read every file matched by the glob in the processed sessions directory into
# a single in-memory frame.
df = pl.read_parquet("../data/processed/sessions/*")

CPU times: user 39.5 s, sys: 19.9 s, total: 59.4 s
Wall time: 14 s


In [19]:
# Estimated in-memory size of the combined frame, in the requested unit ("gb").
df.estimated_size("gb")

12.841251781210303

In [20]:
# Write the combined sessions frame out as one parquet file.
df.write_parquet("../data/processed/sessions.pq")

In [21]:
!ls -lh ../data/processed

total 3.6G
drwxrwxr-x 2 ababkin ababkin 4.0K Mar 12 15:18 sessions
-rw-rw-r-- 1 ababkin ababkin 1.7G Feb 18 16:30 sessions-noncat.pq
-rw-rw-r-- 1 ababkin ababkin 1.9G Mar 12 15:19 sessions.pq
-rw-rw-r-- 1 ababkin ababkin 1.7M Feb 17 11:30 test-split.csv
-rw-rw-r-- 1 ababkin ababkin 885K Feb 17 19:25 test-users.pq
-rw-rw-r-- 1 ababkin ababkin 2.0M Feb 17 19:24 train-users.pq
-rw-rw-r-- 1 ababkin ababkin 4.7M Feb 19 07:52 urls.csv


# Подготовка таргета

In [20]:
# Build the training table from the raw public train file and store it as parquet.
# NOTE(review): prep.prepare_train is a project helper not shown here; its
# output schema should be confirmed against its definition.
train = prep.prepare_train("../data/raw/public_train.pqt")
train.write_parquet("../data/processed/train.pq")

In [21]:
# Build the test table from the raw submission file and store it as parquet.
# NOTE(review): prep.prepare_test is a project helper not shown here; its
# output schema should be confirmed against its definition.
test = prep.prepare_test("../data/raw/submit_2.pqt")
test.write_parquet("../data/processed/test.pq")