In [16]:
# coding: utf-8
import pandas as pd
from gensim.models import Word2Vec
import gc
import logging
from pymystem3 import Mystem
import string
import pickle
import nltk
import re
import multiprocessing as mp

# Fetch the NLTK resources this notebook relies on. The stop-word list read
# via nltk.corpus.stopwords.words('russian') lives in the separate
# 'stopwords' corpus; without it that call raises LookupError on a fresh
# machine. nltk.download is a no-op when the resource is already up to date.
nltk.download('punkt')
nltk.download('stopwords')

# Surface INFO-level log records from libraries that log through `logging`.
logging.basicConfig(level=logging.INFO)

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Base directory of the competition data. Every path in the visible cells is
# built by plain string concatenation onto this prefix (e.g. root + 'features/...'),
# so the trailing slash is required.
root = '/kaggle/competitions/avito-demand-prediction/'

In [5]:
def dump_matrix(matrix, name):
    """Pickle `matrix` to ``root + 'features/<name>.pkl'``.

    Parameters
    ----------
    matrix : object
        Any picklable object.
    name : str
        File stem; the '.pkl' extension is appended here.

    The file is written with ``pickle.HIGHEST_PROTOCOL``. An existing file
    with the same name is overwritten.
    """
    path = root + 'features/{}.pkl'.format(name)
    # The context manager guarantees the handle is flushed and closed even if
    # pickling raises half-way through.
    with open(path, 'wb') as fh:
        pickle.dump(matrix, fh, protocol=pickle.HIGHEST_PROTOCOL)

def load_matrix(name):
    """Load and return the object pickled at ``root + 'features/<name>.pkl'``.

    Parameters
    ----------
    name : str
        File stem; the '.pkl' extension is appended here.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Raises
    ------
    FileNotFoundError
        If the pickle file does not exist.
    """
    path = root + 'features/{}.pkl'.format(name)
    # SECURITY: unpickling executes arbitrary code embedded in the file; only
    # load pickles that were produced by a trusted source.
    with open(path, 'rb') as fh:  # closed deterministically, no leaked handle
        return pickle.load(fh)

In [7]:
# Russian stop words from NLTK; normalize_text drops any token found here.
# Needs the NLTK 'stopwords' corpus to be available locally.
stopwords = nltk.corpus.stopwords.words('russian')
# ASCII punctuation characters, held in a set; referenced by normalize_text.
punctuation = set(string.punctuation)
# Object unpickled from features/emoji.pkl; normalize_text tests membership in it.
# NOTE(review): presumably a collection of emoji characters -- confirm against
# whatever produced that file.
emoji = load_matrix('emoji')
# NOTE(review): not referenced by any visible cell (prepare_text hard-codes
# chunksize=10000) -- confirm whether it is still needed.
chuncksize = 100000
# CSV columns that prepare_text reads and concatenates into one text field.
usecols = ['param_1', 'param_2', 'param_3', 'title', 'description']
# Shared Word2Vec model; fit_w2v builds its vocabulary and trains it.
# NOTE(review): `size=` looks like the pre-4.0 gensim keyword -- confirm the
# installed gensim version.
model = Word2Vec(size=100, window=5, max_vocab_size=500000)
# Passed to model.build_vocab(update=...); fit_w2v sets it to True after training.
update = False

In [31]:
# Pre-compiled patterns applied, in numeric order, by normalize_text.
# All patterns are raw strings: the previous plain literals relied on invalid
# escape sequences such as "\d", which newer Python versions warn about.
# Apart from rule_9 the compiled patterns are character-for-character the same.
# NOTE: the 'х' in the dimension patterns is the Cyrillic letter, not Latin 'x'.

# Three-part dimensions, e.g. "10х20х30".
rule_1 = re.compile(r"\d+х\d+х\d+")
# Two-part dimensions, e.g. "10х20".
rule_2 = re.compile(r"\d+х\d+")
# Optional digits around a hyphen / en dash.
# NOTE(review): inside [...] the '|' is a literal pipe, not alternation, and
# both digit groups are optional, so a lone '-', '–' or '|' also matches.
rule_3 = re.compile(r"\d*[-|–]\d*")
# Optional digits around a literal dot (also matches a lone '.').
rule_4 = re.compile(r"\d*\.\d*")
# Letters immediately followed by digits; group 1 = letters, group 2 = digits.
rule_5 = re.compile(r"([^\W\d_]+)(\d+)")
# Digits immediately followed by letters; group 1 = digits, group 2 = letters.
rule_6 = re.compile(r"(\d+)([^\W\d_]+)")
# Leftover numeric fragments: "12/3", "2-к", "2к", "./" and dimension patterns.
rule_7 = re.compile(r"\d+\/\d|\d+-к|\d+к|\.\/|\d+х\d+х\d+|\d+х\d+")
# Any run of whitespace.
rule_8 = re.compile(r"\s+")
# One or more consecutive stand-alone "nn" tokens (with trailing whitespace).
# The previous pattern "([nn\s]+)" was a character class, so it matched every
# run of whitespace or 'n' characters and, combined with the 'nn ' replacement
# used by normalize_text, appended "nn" to every word in the sentence.
rule_9 = re.compile(r"(?:\bnn\b\s*)+")

In [32]:
def normalize_text(s):
    """Normalise one raw text string into a space-separated token string.

    Steps, in order:
      1. lower-case the text;
      2. replace numeric patterns with placeholder tokens via rule_1..rule_6
         ('nxnxn', 'nxn', 'nn', 'n', 'n<letters>') and blank out the
         fragments matched by rule_7;
      3. replace every character that is not a letter or whitespace, or that
         is in `emoji` / `punctuation`, or is numeric, with a space;
      4. collapse whitespace (rule_8) and apply rule_9 with the 'nn ' token;
      5. drop tokens listed in `stopwords`.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if none remain).
    """
    # Lower-case up front: the dimension patterns contain a lower-case 'х', so
    # applying them to the raw text missed inputs such as "10Х20".
    s = s.lower()
    s = rule_1.sub('nxnxn ', s)
    s = rule_2.sub('nxn ', s)
    s = rule_3.sub('nn ', s)
    s = rule_4.sub('n ', s)
    s = rule_5.sub(lambda m: 'n' + m.group(1) + ' ', s)
    s = rule_6.sub(lambda m: 'n' + m.group(2) + ' ', s)
    s = rule_7.sub(' ', s)

    def _keep(c):
        # Per-character test. The previous code tested the whole string `s`
        # here instead of the character, so the emoji / punctuation / numeric
        # checks never applied to individual characters.
        return ((c.isalnum() or c.isspace())
                and not c.isnumeric()
                and c not in emoji
                and c not in punctuation)

    # Rejected characters become a space rather than being deleted, so the
    # words on either side of them are not glued together.
    s = ''.join(c if _keep(c) else ' ' for c in s)
    s = rule_8.sub(' ', s)
    s = rule_9.sub('nn ', s)
    # split() without an argument also discards leading/trailing whitespace
    # and empty tokens.
    return ' '.join(w for w in s.split() if w not in stopwords)

In [36]:
def prepare_text(file):
    """Normalise the text columns of one CSV and write them to a sentence file.

    Reads ``root + file + '.csv.zip'`` in chunks of 10000 rows (columns listed
    in `usecols`), joins the five text columns of each row with spaces
    (missing values become ''), runs `normalize_text` on the result and writes
    one normalised sentence per line to
    ``root + 'features/<file>_sentences.txt'``.

    Parameters
    ----------
    file : str
        Base name of the CSV archive, without the '.csv.zip' suffix.

    Notes
    -----
    The output file is opened with the platform default text encoding.
    """
    sentences_file = root+'features/{}_sentences.txt'.format(file)
    print('prepare_text {}'.format(sentences_file))
    i = 0
    with open(sentences_file, 'w') as sf:
        for df in pd.read_csv(root + file + '.csv.zip', chunksize=10000, usecols=usecols):
            df['text'] = df['param_1'].str.cat([df.param_2, df.param_3, df.title, df.description], sep=' ', na_rep='')
            sentences = df['text'].apply(normalize_text).values
            # writelines() adds no separators of its own; without the explicit
            # newline every sentence ended up glued onto one single line.
            sf.writelines(sentence + '\n' for sentence in sentences)
            i += len(sentences)
            # Release the chunk before the next one is read.
            del sentences, df
            gc.collect()
            print('Write {}W text'.format(i // 10000))
    print(file+" complated")

In [10]:
def fit_w2v(sentences_file):
    """Train the shared Word2Vec `model` on every sentence in `sentences_file`.

    The file is expected to hold one whitespace-tokenised sentence per line.
    It is consumed in batches of roughly 100000 characters; for each batch the
    vocabulary is built (first batch ever) or extended (every later batch) and
    the model is trained for 3 epochs on that batch.

    Parameters
    ----------
    sentences_file : str
        Path of the sentence file to read.

    Side effects
    ------------
    Mutates the module-level `model` and sets the module-level `update` flag
    to True once the first batch has been trained, so later batches and later
    calls extend the vocabulary instead of rebuilding it.
    """
    global update
    batch_hint = 100000  # size hint for readlines(); whole lines are returned
    with open(sentences_file, 'r') as sf:
        # Loop until the file is exhausted. The previous code read a single
        # batch only, then called fit_w2v() recursively with the wrong
        # arguments and called .split() on the list of lines, so it raised
        # before any training happened.
        while True:
            lines = sf.readlines(batch_hint)
            if not lines:
                break
            # Word2Vec expects each sentence as a list of tokens; blank lines
            # produce empty lists and are skipped.
            sentences = [tokens for tokens in (line.split() for line in lines) if tokens]
            if not sentences:
                continue
            model.build_vocab(sentences, update=update)
            model.train(sentences, total_examples=model.corpus_count, epochs=3)
            update = True

In [None]:
files = ['train', 'test', 'train_active']

# Stage 1: write the sentence file of every input, one worker process each.
# This runs once, before any training. The previous nested
# `for file in files` loop respawned all workers on every outer iteration,
# joined only the last one, and overwrote the outer loop variable so that
# training always ran on 'train_active'.
workers = [mp.Process(target=prepare_text, args=(name,)) for name in files]
for worker in workers:
    worker.start()
for worker in workers:
    worker.join()  # wait for every worker, not just the last one started

# A worker that died leaves a missing or truncated sentence file; stop here
# instead of silently training on it.
failed = [name for name, worker in zip(files, workers) if worker.exitcode != 0]
if failed:
    raise RuntimeError('prepare_text failed for: {}'.format(', '.join(failed)))
print('prepare_text completed')

# Stage 2: train on each sentence file in turn, 10 passes per file, saving
# the model after each file as a checkpoint.
for file in files:
    sentences_file = root+'features/{}_sentences.txt'.format(file)
    for k in range(10):
        print(20 * '=' + 'Epoch {}, File {}'.format(k, file) + 20 * '=')
        fit_w2v(sentences_file)
    print(30 * '=' + '{} train finished'.format(file) + '=' * 30)
    model.save(root+'/features/avito.w2v')
print('Finished')