In [1]:
import numpy as np 
import pandas as pd 

from bidict import bidict
from tqdm import tqdm

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/yandex-cup-2023-neuroswipe/keys_full.csv
/kaggle/input/yandex-cup-2023-neuroswipe/voc.txt
/kaggle/input/yandex-cup-2023-neuroswipe/train_curves.parquet
/kaggle/input/yandex-cup-2023-neuroswipe/valid_curves.parquet
/kaggle/input/yandex-cup-2023-neuroswipe/keyboards_full.csv
/kaggle/input/yandex-cup-2023-neuroswipe/suggestion_accepted_curves_info.parquet
/kaggle/input/yandex-cup-2023-neuroswipe/accepted_curves.parquet
/kaggle/input/yandex-cup-2023-neuroswipe/valid_curves_info.csv
/kaggle/input/yandex-cup-2023-neuroswipe/train_curves_info.parquet
/kaggle/input/yandex-cup-2023-neuroswipe/suggestion_accepted_curves.parquet
/kaggle/input/yandex-cup-2023-neuroswipe/test_curves.csv
/kaggle/input/yandex-cup-2023-neuroswipe/test_curves.parquet
/kaggle/input/yandex-cup-2023-neuroswipe/valid_curves_info.parquet
/kaggle/input/yandex-cup-2023-neuroswipe/valid_curves.csv
/kaggle/input/yandex-cup-2023-neuroswipe/test_curves_info.parquet
/kaggle/input/yandex-cup-2023-neuroswipe/test_curve

Основные данные

In [2]:
data_dir = '/kaggle/input/yandex-cup-2023-neuroswipe'

In [3]:
vocab = bidict()
with open(os.path.join(data_dir, 'voc.txt'), 'r') as f:
    i = 0
    for word in tqdm(f):
        vocab[word.rstrip('\n')] = i
        i += 1
print(f'Кол-во слов в словаре: {len(vocab)}')

503598it [00:07, 64080.79it/s]

Кол-во слов в словаре: 503598





Главная особенность `bidict`

In [4]:
vocab['русы'], vocab.inverse[385092]

(385092, 'русы')

Посмотри на клавиатуры и кнопки

In [5]:
df_keyboards = pd.read_csv(os.path.join(data_dir, 'keyboards_full.csv'))

In [6]:
df_keyboards.head()

Unnamed: 0,id,internal_keyboard_id,origin,type,width,height
0,0,0,train/valid/test,default,1080,667
1,1,1,train/valid/test,extra,1080,667
2,2,0,suggestion_accepted,android_ru_east_slavic_separate_comma,1080,587
3,3,1,suggestion_accepted,android_ru_east_slavic_separate_comma_number_row,1080,717
4,4,2,suggestion_accepted,android_ru_east_slavic_extra_cyrillic_separate...,1080,662


В `train/valid/test` всего два вида клавиатуры (default и extra), в дополнительных файлах их уже больше (88)

In [7]:
df_keyboards['origin'].value_counts()

origin
accepted               87
suggestion_accepted    50
train/valid/test        2
Name: count, dtype: int64

In [8]:
df_keyboards['type'].nunique()

90

В клавиатурах есть дубликаты по имени, но у них зачастую разный размер

In [9]:
df_keyboards[df_keyboards['type'].duplicated(keep=False)].sort_values(by='type')

Unnamed: 0,id,internal_keyboard_id,origin,type,width,height
51,51,49,suggestion_accepted,android_ru_east_slavic_4row_separate_comma,1080,735
110,110,58,accepted,android_ru_east_slavic_4row_separate_comma,1080,735
96,96,44,accepted,android_ru_east_slavic_4row_separate_comma_num...,1080,894
50,50,48,suggestion_accepted,android_ru_east_slavic_4row_separate_comma_num...,908,672
47,47,45,suggestion_accepted,android_ru_east_slavic_birmanru_extra_cyrillic...,1080,718
...,...,...,...,...,...,...
70,70,18,accepted,android_ru_east_slavic_ycuk_srow_num_separate_...,1080,774
71,71,19,accepted,android_ru_east_slavic_zap_separate_comma,1080,545
39,39,37,suggestion_accepted,android_ru_east_slavic_zap_separate_comma,1080,597
30,30,28,suggestion_accepted,android_ru_east_slavic_zap_separate_comma_numb...,1080,880


In [10]:
df_keys = pd.read_csv(os.path.join(data_dir, 'keys_full.csv'))

In [11]:
df_keys.head()

Unnamed: 0,keyboard_id,label,is_action,hitbox_x,hitbox_y,hitbox_w,hitbox_h
0,0,й,False,0,15,99,154
1,0,ц,False,98,15,99,154
2,0,у,False,196,15,100,154
3,0,к,False,295,15,99,154
4,0,е,False,393,15,99,154


Самые "популярные" буквы - в,ь,т,с,л,п,б,к,е,з,ш,н.

In [12]:
df_keys['label'].value_counts().head(25)

label
￻     137
�     137
\n    137
      137
.     135
￿     134
,     125
в     107
ь     107
т     107
с     107
л     107
п     107
б     107
к     107
е     107
з     107
ш     107
н     107
я     106
д     105
г     103
ф     103
м     103
о     103
Name: count, dtype: int64

Ч - самая притесняемая

In [13]:
df_keys['label'].value_counts().head(51).tail(25)

label
р    103
и    102
у    101
а    101
ж     97
х     95
ю     82
?     81
й     81
ы     80
э     79
ч     78
￶     77
1     75
3     75
0     75
9     75
8     75
7     75
6     75
/     75
5     75
4     75
2     75
!     72
Name: count, dtype: int64

Читаем файлы через `read_parquet`

In [14]:
%%time
df_train_curves = pd.read_parquet(os.path.join(data_dir, 'train_curves.parquet'))
df_train_curves_info = pd.read_parquet(os.path.join(data_dir, 'train_curves_info.parquet'))

CPU times: user 8.27 s, sys: 13.7 s, total: 21.9 s
Wall time: 12.7 s


In [15]:
df_train_curves_info.head()

Unnamed: 0,curve_id,keyboard_id,word_index
0,0,0,399201
1,1,0,286306
2,2,0,360441
3,3,0,150496
4,4,0,150496


In [16]:
df_train_curves_info.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000000 entries, 0 to 5999999
Data columns (total 3 columns):
 #   Column       Dtype
---  ------       -----
 0   curve_id     int32
 1   keyboard_id  int32
 2   word_index   int32
dtypes: int32(3)
memory usage: 68.7 MB


In [17]:
df_train_curves.head()

Unnamed: 0,curve_id,x,y,t
0,0,306,398,0
1,0,306,398,7
2,0,307,398,24
3,0,316,395,62
4,0,337,391,64


In [18]:
df_train_curves.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285892335 entries, 0 to 285892334
Data columns (total 4 columns):
 #   Column    Dtype
---  ------    -----
 0   curve_id  int32
 1   x         int32
 2   y         int32
 3   t         int32
dtypes: int32(4)
memory usage: 4.3 GB


Дополнительные файлы

In [19]:
%%time
df_acc_curves = pd.read_parquet(os.path.join(data_dir, 'accepted_curves.parquet'))
df_acc_curves_info = pd.read_parquet(os.path.join(data_dir, 'accepted_curves_info.parquet'))
df_sug_curves = pd.read_parquet(os.path.join(data_dir, 'suggestion_accepted_curves.parquet'))
df_sug_curves_info = pd.read_parquet(os.path.join(data_dir, 'suggestion_accepted_curves_info.parquet'))

CPU times: user 23.5 s, sys: 34.6 s, total: 58.1 s
Wall time: 35.6 s


In [20]:
df_acc_curves.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 591472683 entries, 0 to 591472682
Data columns (total 4 columns):
 #   Column    Dtype
---  ------    -----
 0   curve_id  int32
 1   x         int32
 2   y         int32
 3   t         int32
dtypes: int32(4)
memory usage: 8.8 GB


In [21]:
df_acc_curves_info.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17584462 entries, 0 to 17584461
Data columns (total 3 columns):
 #   Column       Dtype
---  ------       -----
 0   curve_id     int32
 1   keyboard_id  int32
 2   word_index   int32
dtypes: int32(3)
memory usage: 201.2 MB


In [22]:
df_sug_curves.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180095456 entries, 0 to 180095455
Data columns (total 4 columns):
 #   Column    Dtype
---  ------    -----
 0   curve_id  int32
 1   x         int32
 2   y         int32
 3   t         int32
dtypes: int32(4)
memory usage: 2.7 GB


In [23]:
df_sug_curves_info.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2580884 entries, 0 to 2580883
Data columns (total 3 columns):
 #   Column       Dtype
---  ------       -----
 0   curve_id     int32
 1   keyboard_id  int32
 2   word_index   int32
dtypes: int32(3)
memory usage: 29.5 MB


Сколько общих слов между `suggestion_accepted` и `accepted`

In [24]:
common_words = np.intersect1d(
    df_acc_curves_info['word_index'].unique(),
    df_sug_curves_info['word_index'].unique()
)
len(common_words)

163782