In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
def preprocess_items(item):
    """Normalize a raw item name before character-level vectorization.

    Steps, in order:
      1. delete every run of three consecutive double-quote characters;
      2. delete every non-empty parenthesised group ``( ... )``;
      3. split on whitespace, drop tokens that contain the numero sign or
         start with a slash, and re-join the rest with single spaces;
      4. delete every character whose code point lies between 'A' and 'z'
         inclusive (ASCII letters plus the six ASCII symbols that sit
         between 'Z' and 'a'), and every character listed in
         ``punctuation``;
      5. lower-case the result and strip surrounding whitespace.

    Inner whitespace left behind by step 4 is not collapsed.

    Args:
        item: raw item name (str).

    Returns:
        The cleaned string.
    """
    # NOTE(review): as written, this pattern only matches three consecutive
    # double quotes. If the intent was to strip quoted text, confirm before
    # changing it - the fitted vectorizer depends on the current behaviour.
    item = re.sub('""*?""', '', item)
    item = re.sub(r'\([^\)]+\)', '', item)
    item = ' '.join(
        tok for tok in item.split()
        if "№" not in tok and not tok.startswith("/")
    )

    # Same character set as before, spelled with valid escapes only: the old
    # literal relied on invalid escape sequences, which newer Python versions
    # warn about.
    punctuation = frozenset('\\/!@#$%^&*)(+_-<>?,.:;"\'')

    # Single pass instead of repeated `+=` and one str.replace per symbol.
    item = ''.join(
        ch for ch in item
        if not ('A' <= ch <= 'z') and ch not in punctuation
    )
    return item.lower().strip()

In [12]:
# Load the training table, keep rows whose category_id is not -1
# (presumably the unlabelled rows - TODO confirm), keep one row per distinct
# item_name, and shuffle.
train = pd.read_parquet('data_fusion_train.parquet')
df_uni = (
    train[train.category_id != -1]
    .drop_duplicates('item_name')
    # Seeded shuffle: without random_state the row order, and every
    # downstream result that depends on it, changes on each run.
    .sample(frac=1, random_state=10)
    .reset_index(drop=True)
)
# Labels are taken in the shuffled row order, which the apply below preserves.
Y = df_uni.category_id.to_numpy()
df_uni['item_name'] = df_uni['item_name'].apply(preprocess_items)

In [21]:
# Hold out 10% of the de-duplicated rows for validation, stratified by
# category_id. Note that this rebinds `train`, which previously held the raw
# parquet frame.
train, valid = train_test_split(
    df_uni,
    test_size=0.1,
    stratify=df_uni['category_id'],
    shuffle=True,
    random_state=10,
)

In [22]:
# Character n-gram (2 to 5) TF-IDF features over the cleaned item names.
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 5))
# fit_transform learns the vocabulary/IDF and vectorizes the corpus in one
# pass, instead of analysing every name twice with fit() then transform().
# NOTE(review): the vectorizer is fit on all of df_uni, which includes the
# validation rows, so validation scores are computed with IDF statistics
# that have seen those rows.
X_char = tfidf.fit_transform(df_uni.item_name)
X_char_train = tfidf.transform(train.item_name)
X_char_valid = tfidf.transform(valid.item_name)

In [23]:
# Base linear SVM; it is wrapped in OneVsRestClassifier so the per-class
# fits can run across 6 jobs.
clf2 = LinearSVC(
    penalty='l2',
    loss='squared_hinge',
    C=1.0,
    tol=0.001,
    multi_class='ovr',
    fit_intercept=True,
    class_weight='balanced',
    random_state=43,
)
clf = OneVsRestClassifier(clf2, n_jobs=6)

# Mean weighted F1 over 10 cross-validation folds on the full feature matrix.
cv_scores = cross_val_score(clf, X_char, Y, cv=10, scoring='f1_weighted')
print(np.mean(cv_scores))

0.8349481747516432


In [24]:
# Fit the one-vs-rest classifier on the training split only; the validation
# split is kept aside for the report in the next cell.
clf.fit(X_char_train, train.category_id)

OneVsRestClassifier(estimator=LinearSVC(class_weight='balanced',
                                        random_state=43, tol=0.001),
                    n_jobs=6)

In [25]:
# The recorded run emitted sklearn's undefined-metric warning (`_warn_prf`),
# meaning at least one category had an undefined precision/recall/F-score on
# this split. zero_division=0 reports such scores as 0 explicitly instead of
# relying on the warning default.
print(classification_report(valid.category_id.values,
                            clf.predict(X_char_valid),
                            zero_division=0))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       236
           1       1.00      1.00      1.00         3
           2       0.94      0.97      0.95        32
           3       0.69      0.82      0.75        11
           4       0.68      0.74      0.71        23
           6       0.38      0.50      0.43         6
           7       0.92      1.00      0.96        22
           9       0.78      0.70      0.74        10
          11       1.00      0.40      0.57         5
          12       0.92      0.61      0.73        18
          13       1.00      0.50      0.67         4
          19       0.86      0.86      0.86         7
          20       0.80      0.67      0.73         6
          24       0.43      0.43      0.43         7
          26       0.00      0.00      0.00         2
          27       1.00      0.25      0.40         4
          29       0.73      0.85      0.79        13
          30       0.83    

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pickle

# Persist the fitted vectorizer and classifier for script.py. The context
# managers close (and therefore flush) each file deterministically; the bare
# open(...) handles used before were never closed.
with open('tfidf', 'wb') as fh:
    pickle.dump(tfidf, fh)
with open('clf_task1', 'wb') as fh:
    pickle.dump(clf, fh)

In [None]:
%%writefile 'script.py'

import re
import pickle
import pandas as pd
import numpy as np
import copy
from scipy.sparse import coo_matrix, hstack
import spacy

def preprocess_items(item):
    """Normalize a raw item name before character-level vectorization.

    Steps, in order:
      1. delete every run of three consecutive double-quote characters;
      2. delete every non-empty parenthesised group ``( ... )``;
      3. split on whitespace, drop tokens that contain the numero sign or
         start with a slash, and re-join the rest with single spaces;
      4. delete every character whose code point lies between 'A' and 'z'
         inclusive (ASCII letters plus the six ASCII symbols that sit
         between 'Z' and 'a'), and every character listed in
         ``punctuation``;
      5. lower-case the result and strip surrounding whitespace.

    Inner whitespace left behind by step 4 is not collapsed.

    Args:
        item: raw item name (str).

    Returns:
        The cleaned string.
    """
    # NOTE(review): as written, this pattern only matches three consecutive
    # double quotes. If the intent was to strip quoted text, confirm before
    # changing it - the pickled vectorizer depends on the current behaviour.
    item = re.sub('""*?""', '', item)
    item = re.sub(r'\([^\)]+\)', '', item)
    item = ' '.join(
        tok for tok in item.split()
        if "№" not in tok and not tok.startswith("/")
    )

    # Same character set as before, spelled with valid escapes only: the old
    # literal relied on invalid escape sequences, which newer Python versions
    # warn about.
    punctuation = frozenset('\\/!@#$%^&*)(+_-<>?,.:;"\'')

    # Single pass instead of repeated `+=` and one str.replace per symbol.
    # The unused deep copy of the input has been dropped.
    item = ''.join(
        ch for ch in item
        if not ('A' <= ch <= 'z') and ch not in punctuation
    )
    return item.lower().strip()

# NOTE(review): pickle.load can execute arbitrary code; only load artifacts
# that were produced by the training notebook.
with open('tfidf', 'rb') as fh:
    tfidf = pickle.load(fh)
with open('clf_task1', 'rb') as fh:
    clf = pickle.load(fh)

test = pd.read_parquet('data/task1_test_for_user.parquet')
test['item_name2'] = test.item_name.apply(preprocess_items)

# Vectorize the CLEANED names. The pickled vectorizer was fit on
# preprocess_items output in the training notebook, so feeding it the raw
# item_name column (as before) gives the model inputs it was not trained on.
X = tfidf.transform(test.item_name2)
predictions = clf.predict(X)

if 'pred' in test.columns:
    # Fill only the rows still marked -1, keeping predictions aligned to
    # their rows. Assigning the full prediction array to a partial mask
    # raises on length mismatch.
    unresolved = (test['pred'] == -1).to_numpy()
    test.loc[unresolved, 'pred'] = predictions[unresolved]
else:
    # No pre-existing column: every row takes the model's prediction.
    test['pred'] = predictions

test[['id', 'pred']].to_csv("answers.csv", index=False)