In [1]:
from processing import load_dataset, unique_item_name

In [2]:
# Location of the training parquet file; passed to load_dataset() in the next cell.
train_data_name = 'data/data_fusion_train.parquet'

In [3]:
%%time

# Read the training set with the project-local `processing.load_dataset` helper.
train = load_dataset(train_data_name)

CPU times: user 34.3 s, sys: 24.6 s, total: 59 s
Wall time: 54.5 s


In [4]:
# Show three random rows as a quick sanity check of the loaded data.
train.sample(3)

Unnamed: 0,receipt_id,receipt_dayofweek,receipt_time,item_name,item_quantity,item_price,item_nds_rate,category_id,brands,weight,receipt_item_count
12154128,6614924,2,22:53,Хачапури ТЦ по-аджарски 180г,1.0,9,-1,84,,1,1
19802417,5898392,1,11:23,Муфта ПП d20 шт,4.0,2,6,105,,1,12
6535863,5065280,0,16:15,Пакет,1.0,3,1,203,,1,11


In [5]:
%%time

# Build `train_unique` with the project-local `processing.unique_item_name` helper.
# NOTE(review): presumably keeps one row per distinct item_name — confirm in processing.py.
train_unique = unique_item_name(train)

CPU times: user 1.16 s, sys: 214 ms, total: 1.37 s
Wall time: 1.37 s


In [6]:
# Show three random rows of `train_unique`.
train_unique.sample(3)

Unnamed: 0,item_name,receipt_dayofweek,item_nds_rate,receipt_item_count,item_quantity,item_price,receipt_id,category_id
10671,"Бокс универсальный С-56 В, 460",4,1,2,2.0,14,6715883,139
24519,Мин.вода ОВК ЕССЕНТУКИ №17 пэт 1.5л,0,1,17,1.0,7,1951447,83
32816,Рассольник Петербургский с мясом (говядина)250...,1,1,6,1.0,7,1627525,71


In [7]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=300000, 
    ngram_range=(3, 5), 
    analyzer="char_wb", 
)
X_train = tfidf.fit_transform(train_unique.item_name)
y_train = train_unique['category_id']

CPU times: user 6.47 s, sys: 5.62 s, total: 12.1 s
Wall time: 4.42 s


In [8]:
%%time

from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_predict, KFold

clf = LinearSVC(C=1)

N_JOBS=8
folds = KFold(N_JOBS, shuffle=True, random_state=0)
predicts = cross_val_predict(clf, X_train, y_train, cv=folds, n_jobs=N_JOBS, method='predict')

CPU times: user 111 ms, sys: 539 ms, total: 650 ms
Wall time: 30.8 s


In [9]:
from sklearn.metrics import f1_score

# Weighted-average F1 of the cross-validated predictions against the true labels,
# displayed with three decimals.
score = f1_score(y_true=y_train, y_pred=predicts, average='weighted')
f'{score:.3f}'

'0.840'

In [10]:
%%time

# Refit the classifier on all of X_train; this fitted `clf` is what gets pickled below.
# Assigning to `_` keeps the estimator repr out of the cell output.
_ = clf.fit(X_train, y_train)

CPU times: user 23.5 s, sys: 1.54 s, total: 25 s
Wall time: 25 s


In [11]:
import pickle

# Persist the fitted vectorizer and classifier into the submission directory.
# Context managers guarantee each file is flushed and closed right after it is
# written; the archive step further down reads these files back from disk.
with open('t1_sub/tfidf', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)
with open('t1_sub/clf_task1', 'wb') as clf_file:
    pickle.dump(clf, clf_file)

In [12]:
# Copy the helper module into the submission directory alongside the pickled artifacts.
!cp processing.py t1_sub/

# File names (relative to t1_sub/) that the next cell packs into the submission archive.
# NOTE(review): script.py is not created in this notebook — presumably it already
# lives in t1_sub/; confirm before running on a fresh checkout.
files_list = [
    'tfidf',
    'clf_task1',
    'script.py',
    'processing.py',
]

In [13]:
import zipfile 
compression = zipfile.ZIP_DEFLATED

submission_name = 't1_sub/submission.zip'
with zipfile.ZipFile(submission_name, 'w') as zipObj:
    for filename in files_list:
        zipObj.write(
            f't1_sub/{filename}', 
            arcname=filename, 
            compress_type=compression
        )
    print(zipObj.namelist())

!ls -lh {submission_name}

['tfidf', 'clf_task1', 'script.py', 'processing.py']
-rw-r--r-- 1 dmitry.dremov dmitry.dremov 44M мар 15 13:22 t1_sub/submission.zip


# Debug run

In [14]:
!rm -r test; mkdir test; cp -r t1_sub/data test/data; cd test; unzip -q ../t1_sub/submission.zip; ls -lhS

итого 240M
-rw-r--r-- 1 dmitry.dremov dmitry.dremov 220M мар 15 13:21 clf_task1
-rw-r--r-- 1 dmitry.dremov dmitry.dremov  21M мар 15 13:21 tfidf
drwxr-xr-x 2 dmitry.dremov dmitry.dremov 4,0K мар 15 13:22 data
-rw-r--r-- 1 dmitry.dremov dmitry.dremov  875 мар 15 13:21 processing.py
-rw-r--r-- 1 dmitry.dremov dmitry.dremov  558 мар 15 13:14 script.py


In [15]:
# Run script.py inside the sandbox, then preview answers.csv
# (presumably written by script.py — confirm).
!cd test && python3 script.py && head answers.csv

id,pred
0,77
1,76
2,80
3,82
4,78
5,38
6,71
7,84
8,139
