In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
from utils import *

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import gc, os, nltk, string, re

from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## Prepare dataset

In [3]:
tr_index = load_obj('tr_index')
te_index = load_obj('te_index')
text = load_obj('daset_text')

load_obj tr_index started
load_obj tr_index: 0.32724881172180176
load_obj te_index started
load_obj te_index: 0.10298418998718262
load_obj daset_text started
load_obj daset_text: 5.469642877578735


In [4]:
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('russian')
stop_words.extend(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
if not os.path.exists(root + 'features/text_tfidf0_feature_names.pkl'):
    text_tfidf0 = TfidfVectorizer(max_features=50000, stop_words=stop_words)
    with timeit('text_tfidf0 fit'):
        text_tfidf0.fit(text.sample(frac=.7))
    with timeit('text_tfidf0 transform'):
        tr_text_tfidf0_wvs = text_tfidf0.transform(text.loc[tr_index])
        te_text_tfidf0_wvs = text_tfidf0.transform(text.loc[te_index])
    dump_obj(text_tfidf0.get_feature_names(), 'text_tfidf0_feature_names')
    dump_obj(tr_text_tfidf0_wvs,'tr_text_tfidf0_wvs')
    dump_obj(te_text_tfidf0_wvs,'te_text_tfidf0_wvs')

    del text_tfidf0, tr_text_tfidf0_wvs, te_text_tfidf0_wvs
    gc.collect()

In [6]:
if not os.path.exists(root + 'features/text_tfidf1_feature_names.pkl'):
    text_tfidf1 = TfidfVectorizer(max_features=50000, stop_words=stop_words, norm='l1', sublinear_tf=True)
    with timeit('text_tfidf1 fit'):
        text_tfidf1.fit(text.sample(frac=.7))

    with timeit('text_tfidf1 transform'):
        tr_text_tfidf1_wvs = text_tfidf1.transform(text.loc[tr_index])
        te_text_tfidf1_wvs = text_tfidf1.transform(text.loc[te_index])

    dump_obj(text_tfidf1.get_feature_names(), 'text_tfidf1_feature_names')
    dump_obj(tr_text_tfidf1_wvs,'tr_text_tfidf1_wvs')
    dump_obj(te_text_tfidf1_wvs,'te_text_tfidf1_wvs')

    del text_tfidf1, tr_text_tfidf1_wvs, te_text_tfidf1_wvs
    gc.collect()

In [7]:
punct = set(string.punctuation)
emoji = load_obj('emoji')

load_obj emoji started
load_obj emoji: 0.0005438327789306641


In [8]:
rule_1  = re.compile("\d+х\d+х\d+")
rule_2  = re.compile("\d+х\d+")
rule_3  = re.compile("\d*[-|–]\d*")
rule_4  = re.compile("\d*\\.\d*")
rule_5  = re.compile("([^\W\d_]+)(\d+)")
rule_6  = re.compile("(\d+)([^\W\d_]+)")
rule_7  = re.compile("\d+\\/\d|\d+-к|\d+к|\\.\/|\d+х\d+х\d+|\d+х\d+")
rule_8  = re.compile("\\s+")
rule_9  = re.compile("([nn\\s]+)")
rule_10 = re.compile("\d+nn")
rule_11 = re.compile("\d+n")

def normalize_text(s):
    s = s.lower()
    s = rule_1.sub('nxnxn ', s)
    s = rule_2.sub('nxn ', s)
    s = rule_3.sub('nn ', s)
    s = rule_4.sub('n ', s)
    s = rule_5.sub(lambda m: 'n' + m.group(1) + ' ', s)
    s = rule_6.sub(lambda m: 'n' + m.group(2) + ' ', s)
    s = rule_7.sub(' ', s)

    s = ''.join([c if c.isalpha() or c.isalnum() or c.isspace() else ' ' for c in s 
                 if s not in emoji and s not in punct and not s.isnumeric()])
    s = rule_8.sub(' ', s)
    s = rule_9.sub('nn ', s)
    s = rule_10.sub('nn ', s)
    s = rule_11.sub('n ', s)
    s = s.strip()
    words = [w for w in s.split(' ') if w not in stopwords]
    return ' '.join(words)

In [9]:
if not os.path.exists(root + 'features/normalized_text.pkl'):
    with timeit('normalize_text'):
        normalized_text = text.apply(normalize_text)
        dump_obj(normalized_text,'normalized_text')
        del normalized_text
        gc.collect()

In [10]:
normalized_text = load_obj('normalized_text')
for i, s in enumerate(normalized_text[:10].values):
    print(i, s)

load_obj normalized_text started
load_obj normalized_text: 5.66666316986084
0 самараnn детскаяnn одеждаnn иnn обувьnn дляnn мальчиковnn зимнийnn комбенизонnn зимнийnn комбенизонnn дляnn малыша
1 братскnn телефоныnn аксессуарыnn чехолnn дляnn айфонаnn snn твёрдыйnn пластик
2 краснокамскnn одеждаnn обувьnn аксессуарыnn женскаяnn одеждаnn продамnn кожаннуюnn курткуnn
3 оренбургnn товарыnn дляnn животныхnn продамnn игрушкиnn шлейкуnn продамnn ценаnn указанаnn заnn всеnn чтоnn наnn фотоnn всеnn вnn идеальномnn состояниеnn еслиnn объявленияnn наnn сайтеnn значитnn актуальноnn
4 ярославльnn одеждаnn обувьnn аксессуарыnn мужскаяnn одеждаnn спортивныйnn костюмnn adidasnn спортивныйnn костюмnn размерnn nn  nn  малоnn бnn уnn небольшойnn торгnn
5 иркутскnn одеждаnn обувьnn аксессуарыnn женскаяnn одеждаnn плащnn
6 устьnn ордынскийnn телефоныnn samsunn gnn samsuнгnn enn телефонnn экранnn разрешениеnn xnn безnn камерыnn безnn слотаnn дляnn картnn памятиnn аккумуляторnn nn  маnn чnn весnn nn  гnn шxв

In [11]:
if not os.path.exists(root + 'features/text_tfidf2_feature_names.pkl'):
    text_tfidf2 = TfidfVectorizer(max_features=50000)
    with timeit('text_tfidf2 fit'):
        text_tfidf2.fit(normalized_text.sample(frac=.7))
    with timeit('text_tfidf2 transform'):
        tr_text_tfidf2_wvs = text_tfidf2.transform(normalized_text.loc[tr_index])
        te_text_tfidf2_wvs = text_tfidf2.transform(normalized_text.loc[te_index])
    dump_obj(text_tfidf2.get_feature_names(), 'text_tfidf2_feature_names')
    dump_obj(tr_text_tfidf2_wvs,'tr_text_tfidf2_wvs')
    dump_obj(te_text_tfidf2_wvs,'te_text_tfidf2_wvs')

    del text_tfidf2, tr_text_tfidf2_wvs, te_text_tfidf2_wvs
    gc.collect()

In [12]:
if not os.path.exists(root + 'features/text_tfidf3_feature_names.pkl'):
    text_tfidf3 = TfidfVectorizer(max_features=50000, stop_words=stop_words, norm='l1', sublinear_tf=True)
    with timeit('text_tfidf3 fit'):
        text_tfidf3.fit(normalized_text.sample(frac=.7))
    with timeit('text_tfidf3 transform'):
        tr_text_tfidf3_wvs = text_tfidf3.transform(normalized_text.loc[tr_index])
        te_text_tfidf3_wvs = text_tfidf3.transform(normalized_text.loc[te_index])
    dump_obj(text_tfidf3.get_feature_names(), 'text_tfidf3_feature_names')
    dump_obj(tr_text_tfidf3_wvs,'tr_text_tfidf3_wvs')
    dump_obj(te_text_tfidf3_wvs,'te_text_tfidf3_wvs')

    del text_tfidf3, tr_text_tfidf3_wvs, te_text_tfidf3_wvs
    gc.collect()

## Params text feature

In [13]:
params = load_obj('daset_params')

load_obj daset_params started
load_obj daset_params: 1.867748498916626


In [14]:
params.head(5)

item_id
3b896605f03c    для мальчиков верхняя одежда 86-92 см (1-2 года)
c81c34d3aac5                аксессуары чехлы и плёнки исключение
a8ab6225dd06             женская одежда верхняя одежда 44–46 (m)
20bf5eea7fb5                    исключение исключение исключение
e1cf6b9d5789                    мужская одежда другое исключение
Name: params, dtype: object

In [15]:
if not os.path.exists(root + 'features/param_vec0_feature_names.pkl'):
    param_cvec0 = CountVectorizer(max_features=1000, analyzer='word')
    with timeit('param_cvec1 fit'):
        param_cvec0.fit(params.sample(frac=.7))
    with timeit('param_cvec1 transform'):
        tr_param_cvec0_wvs = param_cvec0.transform(params.loc[tr_index])
        te_param_cvec0_wvs = param_cvec0.transform(params.loc[te_index])
    dump_obj(param_cvec0.get_feature_names(), 'param_vec0_feature_names')
    dump_obj(tr_param_cvec0_wvs,'tr_param_cvec0_wvs')
    dump_obj(te_param_cvec0_wvs,'te_param_cvec0_wvs')

    del param_cvec0, tr_param_cvec0_wvs, te_param_cvec0_wvs
    gc.collect()

In [16]:
if not os.path.exists(root + 'features/param_vec1_feature_names.pkl'):
    param_cvec1 = CountVectorizer(max_features=80, analyzer='char')
    with timeit('param_cvec1 fit'):
        param_cvec1.fit(params.sample(frac=.7))
    with timeit('param_cvec1 transform'):
        tr_param_cvec1_cvs = param_cvec1.transform(params.loc[tr_index])
        te_param_cvec1_cvs = param_cvec1.transform(params.loc[te_index])
    dump_obj(param_cvec1.get_feature_names(), 'param_vec1_feature_names')
    dump_obj(tr_param_cvec1_cvs,'tr_param_cvec1_cvs')
    dump_obj(te_param_cvec1_cvs,'te_param_cvec1_cvs')

    del param_cvec1, tr_param_cvec1_cvs, te_param_cvec1_cvs
    gc.collect()