# Avito ads classification

Denisov Ilia

In [70]:
import pandas as pd
import numpy as np
# import keras
# from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import pickle
import pymorphy2
from sklearn.utils import shuffle
from keras_tqdm import TQDMNotebookCallback
from tqdm import tqdm_notebook, tqdm, tqdm_gui
from keras.models import load_model
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Read the training ads and the category lookup table from the data folder.
train_df, categories = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/category.csv')
)

In [8]:
# Summarize how many whitespace-separated words titles and descriptions contain.
for caption, column in (("Lengths of titles", train_df.title),
                        ("Lengths of descriptions", train_df.description)):
    print(caption)
    word_counts = pd.Series([len(text.split()) for text in column])
    print(word_counts.describe())

Lengths of titles
count    489517.000000
mean          3.826913
std           1.974856
min           1.000000
25%           2.000000
50%           4.000000
75%           5.000000
max          19.000000
dtype: float64
Lengths of descriptions
count    489517.000000
mean         38.276783
std          50.858614
min           1.000000
25%          10.000000
50%          20.000000
75%          44.000000
max         646.000000
dtype: float64


In [4]:
# tokenization and reduction of every word to its normal form
# Raw string: "\w" inside a plain string literal is an invalid escape sequence.
_NON_WORD_RE = re.compile(r"[^\w]")


def split_into_tokens(text):
    """Split ``text`` into tokens made of word characters only.

    Every non-word character is replaced by a space, then the result is split
    on whitespace, so punctuation never ends up inside a token.
    """
    return _NON_WORD_RE.sub(" ", text).split()


morph = pymorphy2.MorphAnalyzer()


def normalize(text):
    """Return the list of normal forms of the tokens found in ``text``.

    For each token the first parse returned by ``morph.parse`` is used.
    """
    normal_words = [morph.parse(word)[0].normal_form for word in split_into_tokens(text)]
    return normal_words

In [40]:
# One-off preprocessing, intentionally left commented out: it writes the pickles
# that the next cell loads. Uncomment and run only when those files must be rebuilt.

# normalize and save descriptions
# normal_descrs = []
# for i in tqdm(train_df.description):
#     normal_descrs.append(normalize(i))
# pickle.dump(normal_descrs, open('dump/normalized_descrs.pckl', 'wb'))

# normalize and save titles
# normal_titles = []
# for i in tqdm(train_df.title):
#     normal_titles.append(normalize(i))
# pickle.dump(normal_titles, open('dump/normalized_titles.pckl', 'wb'))

100%|█████████████████████████████████████████████████████████████████████████| 243166/243166 [29:00<00:00, 145.50it/s]
100%|████████████████████████████████████████████████████████████████████████| 243166/243166 [02:04<00:00, 1949.06it/s]


In [42]:
# Load the normalized token lists, concatenate title + description tokens per ad,
# and join each token list into a single space-separated string.
# NOTE: pickle must only be used on trusted files (loading can execute arbitrary code).
with open('dump/normalized_descrs.pckl', 'rb') as descrs_file:
    normal_descrs = pickle.load(descrs_file)
with open('dump/normalized_titles.pckl', 'rb') as titles_file:
    normal_titles = pickle.load(titles_file)
text_arrays = [i+j for i, j in zip(normal_titles, normal_descrs)]
texts = [' '.join(i) for i in text_arrays]
# free the intermediate token lists; only `texts` is needed from here on
del normal_descrs, normal_titles


In [133]:
# Optional shortcut: load a previously fitted tokenizer instead of fitting it again.
# NOTE: this restores only `tk`; nb_words, X, Y and max_descr_length are still
# defined by the tokenizer-fitting cell.
with open('dump/tk_v3.pckl', 'rb') as tk_file:
    tk = pickle.load(tk_file)

In [69]:
# enumerate every word in the collection
nb_words = 150000
tk = Tokenizer(num_words=nb_words)
print('Fitting tokenizer.')
tk.fit_on_texts(texts)
# dump tokenizer; the context manager guarantees the file is flushed and closed
with open('dump/tk_v3.pckl', 'wb') as tk_file:
    pickle.dump(tk, tk_file)
print('Tokenizer was fitted.')
# encode every word in texts with a number
X = tk.texts_to_sequences(texts)
# create one hot arrays for fitting (one column per distinct category_id)
Y = np.array(pd.get_dummies(train_df.category_id))
# pad sequences to the same length for NN
max_descr_length = 620
X = sequence.pad_sequences(X, maxlen=max_descr_length)

Fitting tokenizer.
Tokenizer was fitted.


In [119]:
# Report how many distinct words the tokenizer counted and how many of them are kept.
print('Vocabular size - {}. Using {} most frequent words.'.format(
    len(tk.word_counts),
    nb_words,
))

Vocabular size - 341824. Using 150000 most frequent words.


In [96]:
# split of train sample for train and validation samples
# A fixed random_state makes the split reproducible across kernel restarts, so the
# held-out 20% stays the same set of rows every time the notebook is re-run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Fitting took about 6 hours on NVIDIA GTX1060 (1 hour/epoch), so a saved
# model can be restored from disk instead of training again.
model_path = 'dump/model_v3.keras'
model = load_model(model_path)

In [72]:
# fitting of NN: embedding -> LSTM -> softmax over the categories
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(nb_words, embedding_vecor_length, input_length=max_descr_length))
model.add(LSTM(128))
# the output layer must have exactly one unit per column of the one-hot target
model.add(Dense(y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()
# verbose=0 silences the built-in progress output; the tqdm callback reports progress instead
model.fit(x_train, y_train, validation_split=0.05, shuffle=True, epochs=6, batch_size=64, verbose=0, callbacks=[TQDMNotebookCallback()])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 620, 32)           4800000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 54)                6966      
Total params: 4,889,398
Trainable params: 4,889,398
Non-trainable params: 0
_________________________________________________________________
None


371968/|/[loss: 2.139, acc: 0.419] 100%|| 371968/372032 [58:30<00:00, 108.00it/s]                                      

371968/|/[loss: 0.560, acc: 0.844] 100%|| 371968/372032 [58:07<00:00, 107.05it/s]                                      

371968/|/[loss: 0.374, acc: 0.889] 100%|| 371968/372032 [58:07<00:00, 107.35it/s]                                      

371968/|/[loss: 0.298, acc: 0.909] 100%|| 371968/372032 [58:08<00:00, 107.41it/s]                                      

371968/|/[loss: 0.245, acc: 0.924] 100%|| 371968/372032 [58:01<00:00, 106.01it/s]                                      

371968/|/[loss: 0.203, acc: 0.936] 100%|| 371968/372032 [59:23<00:00, 104.85it/s]                                      


<keras.callbacks.History at 0x2992b587e10>

In [None]:
# Save under the same file name that the "load fitted model" cell reads, so a freshly
# fitted model is the one restored on the next run.
model.save('dump/model_v3.keras')

## Validation result

In [94]:
# Score the held-out split: compare the arg-max predicted class with the true class.
# NOTE(review): if `model` was loaded from disk rather than fitted on this session's
# x_train, this split may overlap its training data — confirm before trusting the score.
y_prob = model.predict(x_test)
y_pred = np.argmax(y_prob, axis=1)
y_test_categories = np.argmax(y_test, axis=1)
# the number of categories is taken from the one-hot target instead of being hard-coded
print('Validation accuracy on {} categories is {:0.2f}'.format(
    y_test.shape[1], accuracy_score(y_test_categories, y_pred)))

Validation accuracy on 54 categories is 0.88


In [87]:
# Per-level category labels; they are produced in "categories_transform.ipynb".
cats = pd.read_csv('new_categories.csv')
# preview the first five rows
cats.head(n=5)

Unnamed: 0,category_id,depth,lvl0_label,lvl1_label,lvl2_label,lvl3_label
0,0,3,0,19,4,0
1,1,2,0,11,0,0
2,2,3,0,19,2,0
3,3,3,0,14,28,0
4,4,3,0,4,19,0


In [63]:
# Lookup tables: category_id -> label of that category at hierarchy level 0..3.
category2lvl0_label = dict(zip(cats.category_id, cats.lvl0_label))
category2lvl1_label = dict(zip(cats.category_id, cats.lvl1_label))
category2lvl2_label = dict(zip(cats.category_id, cats.lvl2_label))
category2lvl3_label = dict(zip(cats.category_id, cats.lvl3_label))

In [93]:
# Re-score the same predictions after mapping both predicted and true class ids
# through each category_id -> level-label dictionary.
level_maps = (category2lvl0_label, category2lvl1_label, category2lvl2_label, category2lvl3_label)
for level, id_to_label in enumerate(level_maps):
    to_label = np.vectorize(id_to_label.get)
    y_pred0 = to_label(y_pred)
    y_test0 = to_label(y_test_categories)
    acc = accuracy_score(y_test0, y_pred0)
    print('Accuracy for lvl {} categories is {:0.2f}'.format(level, acc))

Accuracy for lvl 0 categories is 0.96
Accuracy for lvl 1 categories is 0.94
Accuracy for lvl 2 categories is 0.89
Accuracy for lvl 3 categories is 0.98


## Classify test.csv

In [None]:
# Read the ads that have to be classified.
test_csv_path = 'data/test.csv'
test_df = pd.read_csv(test_csv_path)

In [None]:
# One-off preprocessing of the test set, intentionally left commented out: it writes
# the pickles that the next cell loads. Uncomment only to rebuild those files.

# normalize and save test descriptions
# normal_test_descrs = []
# for i in tqdm(test_df.description):
#     normal_test_descrs.append(normalize(i))
# pickle.dump(normal_test_descrs, open('dump/normalized_test_descrs.pckl', 'wb'))

# normalize and save test titles
# normal_test_titles = []
# for i in tqdm(test_df.title):
#     normal_test_titles.append(normalize(i))
# pickle.dump(normal_test_titles, open('dump/normalized_test_titles.pckl', 'wb'))

In [105]:
# Load the normalized test token lists and build one string per ad
# (title tokens followed by description tokens), mirroring the training texts.
# NOTE: pickle must only be used on trusted files (loading can execute arbitrary code).
with open('dump/normalized_test_descrs.pckl', 'rb') as descrs_file:
    normal_test_descrs = pickle.load(descrs_file)
with open('dump/normalized_test_titles.pckl', 'rb') as titles_file:
    normal_test_titles = pickle.load(titles_file)
test_text_arrays = [i+j for i, j in zip(normal_test_titles, normal_test_descrs)]
test_texts = [' '.join(i) for i in test_text_arrays]
# free the intermediate token lists; only `test_texts` is needed from here on
del normal_test_descrs, normal_test_titles

In [111]:
# Encode the test texts with the fitted tokenizer, then pad the sequences
# to the same length the network was built for.
test_sequences = tk.texts_to_sequences(test_texts)
X_test = sequence.pad_sequences(test_sequences, maxlen=max_descr_length)

In [112]:
# Class probabilities for every test ad (one row per ad).
test_categories_probs = model.predict(x=X_test)

In [130]:
# Pick, for each ad, the index of the most probable class.
test_categories = test_categories_probs.argmax(axis=1)

In [132]:
# Attach the predicted class index and its name, then export both answer files.
# NOTE(review): the class index is used directly as category_id and as a row label of
# `categories` — assumes category ids are 0..N-1 in row order; confirm.
predicted_names = np.array(categories.loc[test_categories].name)
test_df = (
    test_df
    .assign(category_id=test_categories)
    .assign(category=predicted_names)
)
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first column;
# pass index=False if the expected answer format has no such column — confirm.
test_df.to_csv('answer/answer_extended.csv')
answer_columns = ['item_id', 'category_id']
test_df[answer_columns].to_csv('answer/answer.csv')