In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer, TfidfTransformer
from matplotlib import pyplot as plt
import seaborn as sns
import xgboost as xgb
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.linear_model import Ridge

%matplotlib inline

In [47]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [48]:
train = pd.read_csv("../data/Avito Demand Prediction/train.csv", parse_dates = ["activation_date"])
test = pd.read_csv("../data/Avito Demand Prediction/test.csv", parse_dates = ["activation_date"])

## Заполняем пропуски

In [49]:
train["param_1"].fillna("", inplace=True)
train["param_2"].fillna("", inplace=True)
train["param_3"].fillna("", inplace=True)
train["description"].fillna("", inplace=True)
train["title"].fillna("", inplace=True)

test["param_1"].fillna("", inplace=True)
test["param_2"].fillna("", inplace=True)
test["param_3"].fillna("", inplace=True)
test["description"].fillna("", inplace=True)
test["title"].fillna("", inplace=True)



## Настройка

In [50]:
import unicodedata as ud

latin_letters= {}

def is_latin(uchr):
    try: return latin_letters[uchr]
    except KeyError:
         return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))

def only_roman_chars(unistr):
    return all(is_latin(uchr)
           for uchr in unistr
           if uchr.isalpha()) # isalpha suggested by John Machin

In [51]:
import re

def no_number_preprocessor(tokens):
    # r = re.sub('(\d)+', 'NUM', tokens.lower())
    r = re.sub('(\d)+', '', tokens.lower())
    return r

analyzer = CountVectorizer(preprocessor=no_number_preprocessor).build_analyzer()
stemmer = SnowballStemmer("russian") 

import nltk
nltk.download('stopwords')
nltk.download('punkt')

stopWords = stopwords.words('russian')
len(stopWords)

def stemming(doc):
    return (w if only_roman_chars(w) else stemmer.stem(w) for w in analyzer(doc))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/paperspace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/paperspace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [52]:
from nltk import word_tokenize 

class StemTokenizer(object):
    def __init__(self):
        self.stemmer = SnowballStemmer("russian")
    def __call__(self, articles):
        print('StemTokenizer runs with', len(articles), 'articles')
        return [self.stemmer.stem(t) for t in word_tokenize(articles)]

In [None]:
def process_text_features():
    def tokenize_and_stem(series):
        import nltk
        from nltk import word_tokenize 
        from nltk.corpus import stopwords
        from nltk.stem.snowball import SnowballStemmer
        nltk.download('stopwords')
        stemmer = SnowballStemmer("russian") 
        stopWords = stopwords.words('russian')
        return series.apply(lambda val: " ".join([stemmer.stem(word) for word in word_tokenize(val) if word.lower() not in stopWords and not word.isdigit()]))
    
    def process(column, filename):   
        import ipyparallel as ipp
        rc = ipp.Client()

        step = int(len(column) / 8)
        chunks = [column[x*step:x*step + step] for x in range(0,8)]
        res = rc[:].map_sync(tokenize_and_stem, [chunks[i] for i in range(0, len(chunks))])
        processed = pd.concat(res)
        processed_df = pd.DataFrame(processed)
        processed_df.to_csv(filename)
        print('processed and saved to ', filename)
        
    process(train['title'], 'processed_title_df.csv')
    process(train['description'], 'processed_description_df.csv')
    process(train['param_1'], 'processed_param_1_df.csv')
    process(train['param_2'], 'processed_param_2_df.csv')
    process(train['param_3'], 'processed_param_3_df.csv')
    
    processed_text = pd.concat([
    pd.read_csv('processed_title_df.csv', index_col='Unnamed: 0'), 
    pd.read_csv('processed_description_df.csv', index_col='Unnamed: 0'), 
    pd.read_csv('processed_param_1_df.csv', index_col='Unnamed: 0'), 
    pd.read_csv('processed_param_2_df.csv', index_col='Unnamed: 0'), 
    pd.read_csv('processed_param_3_df.csv', index_col='Unnamed: 0')]
    , axis=1)
    
    processed_text.to_csv('processed_text_df.csv')
    print('processed and saved to ', 'processed_text_df.csv')


    

In [54]:
import ipyparallel as ipp
rc = ipp.Client()

            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='you@pss48awi9')
            or instruct your controller to listen on an external IP.


In [55]:
step = int(len(train['title']) / 8)

chunks = [train['title'][x*step:x*step + step] for x in range(0,8)]

In [56]:
len(chunks), len(chunks[7])

(8, 187928)

In [57]:
res = rc[:].map_sync(process, [chunks[i] for i in range(0, len(chunks))])

In [58]:
processed_title = pd.concat(res)
processed_title_df = pd.DataFrame(processed_title)
processed_title_df.shape
processed_title_df.to_csv('processed_title')

In [59]:
chunks = [train['description'][x*step:x*step + step] for x in range(0,8)]

In [60]:
res = rc[:].map_sync(process, [chunks[i] for i in range(0, len(chunks))])

In [61]:
processed_description = pd.concat(res)
processed_description_df = pd.DataFrame(processed_description)
processed_description_df.shape
processed_description_df.to_csv('processed_description_df.csv')

In [62]:
chunks = [train['param_1'][x*step:x*step + step] for x in range(0,8)]

In [63]:
res = rc[:].map_sync(process, [chunks[i] for i in range(0, len(chunks))])

In [64]:
processed_param_1 = pd.concat(res)
processed_param_1_df = pd.DataFrame(processed_param_1)
processed_param_1_df.shape
processed_param_1_df.to_csv('processed_param_1_df.csv')

In [65]:
chunks = [train['param_2'][x*step:x*step + step] for x in range(0,8)]
res = rc[:].map_sync(process, [chunks[i] for i in range(0, len(chunks))])
processed_param_2 = pd.concat(res)
processed_param_2_df = pd.DataFrame(processed_param_2)
processed_param_2_df.shape
processed_param_2_df.to_csv('processed_param_2_df.csv')

In [66]:
chunks = [train['param_3'][x*step:x*step + step] for x in range(0,8)]
res = rc[:].map_sync(process, [chunks[i] for i in range(0, len(chunks))])
processed_param_3 = pd.concat(res)
processed_param_3_df = pd.DataFrame(processed_param_3)
processed_param_3_df.shape
processed_param_3_df.to_csv('processed_param_3_df.csv')

In [67]:
processed_text = pd.concat([
    pd.read_csv('processed_title_df.csv', index_col='Unnamed: 0'), 
    pd.read_csv('processed_description_df.csv', index_col='Unnamed: 0'), 
    pd.read_csv('processed_param_1_df.csv', index_col='Unnamed: 0'), 
    pd.read_csv('processed_param_2_df.csv', index_col='Unnamed: 0'), 
    pd.read_csv('processed_param_3_df.csv', index_col='Unnamed: 0')]
    , axis=1)

  mask |= (ar1 == a)


In [68]:
processed_text.head()

Unnamed: 0,title,description,param_1,param_2,param_3
0,кокоб ( кокон сна ),"кокон сна малыш , пользова меньш месяца.цвет сер",постельн принадлежн,,
1,стойк одежд,"стойк одежд , вешалк . бутик .",друг,,
2,Philips blura,"хорош состоян , домашн кинотеатр blu ra , USB ...","виде , DVD Blu-ra плеер",,
3,автокресл,прод кресл от0-25кг,автомобильн кресл,,
4,"ваз ,",вопрос телефон .,пробег,ваз ( L ),


In [69]:
processed_text.to_csv('processed_text_df.csv')

In [39]:
X_train, X_valid, y_train, y_valid = train_test_split(train.drop(['deal_probability'], axis=1), train['deal_probability'], test_size=0.3, random_state=17)

## Обработка `title`

### HashingVectorizer

In [42]:
%time 

vectorizer = HashingVectorizer(analyzer=stemming, stop_words=stopWords)
title_hash_vect = vectorizer.fit_transform(X_train['title'].append(X_valid['title']))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.01 µs


In [43]:
title_hash_vect

<1503424x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 4201564 stored elements in Compressed Sparse Row format>

In [44]:
X_train.shape, X_valid.shape

((1052396, 17), (451028, 17))

In [45]:
train_title_hash_vect = title_hash_vect[:X_train.shape[0]]

In [46]:
test_title_hash_vect = title_hash_vect[X_train.shape[0]:]

In [57]:
train_title_hash_vect.shape, test_title_hash_vect.shape


((1052396, 1048576), (451028, 1048576))

In [49]:
clf = Ridge(alpha=1.0, random_state=17)
clf.fit(train_title_hash_vect, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=17, solver='auto', tol=0.001)

In [50]:
preds = clf.predict(test_title_hash_vect)

In [51]:
mean_squared_error(preds, y_valid)

0.05623392121108921

In [56]:
len(clf.coef_)

1048576

### Tf-Idf vectorizer

In [26]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=StemTokenizer(), stop_words=stopWords)),
    ('tfidf', TfidfTransformer()),
    ('clf', Ridge(alpha=1.0, random_state=17)),
])

parameters = {
    #'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    #'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    #'clf__alpha': (0.00001, 0.000001),
    #'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

In [27]:
%time
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
grid_search.fit(X_train['title'], y_train)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.91 µs
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Process ForkPoolWorker-12:
Process ForkPoolWorker-14:
Process ForkPoolWorker-13:
Process ForkPoolWorker-16:
Process ForkPoolWorker-15:
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  Fi

KeyboardInterrupt: 

In [44]:
def preprocess(text):
    return [stemmer.stem(word) for word in word_tokenize(text) if word.lower() not in stopWords and not word.isdigit()]

In [45]:
train_title_preprocess = train['title'].apply(preprocess)

In [47]:
train_title_preprocess[10:]

10                      [1-к, квартир, ,, м², ,, 2/2, эт, .]
11                                                   [джинс]
12                            [атлас, контурны, карт, класс]
13                                     [монитор, acer, 18.5]
14                             [прода, щенк, немецк, овчарк]
15                                        [плат, женск, нов]
16                                     [Chevrolet, Lanos, ,]
17                                            [объемн, цифр]
18                                [куртк, весен, (, осен, )]
19                                           [сним, коттедж]
20                      [2-к, квартир, ,, м², ,, 5/5, эт, .]
21                                            [шапк, норков]
22                                          [Ford, Focus, ,]
23                                           [туфл, moschin]
24                                        [таблетк, бравект]
25                                [пинк, па, My, Littl, Pon]
26                      

In [48]:
#vect = CountVectorizer(tokenizer=StemTokenizer(), stop_words=stopWords)
vect = CountVectorizer()
vect.fit(train_title_preprocess)

AttributeError: 'list' object has no attribute 'lower'

In [None]:
TfidfTransformer?