[Описание задания](https://docs.google.com/document/d/19L0AkNF9HOvHquUBlNCCwuuWBymVqrEtlburUzjbya8/edit)


[форма подачи заявки на стажировку](https://start.avito.ru/)

In [0]:
# подключаем гугл диск
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import pandas as pd
import numpy as np
import os

from xgboost import  XGBClassifier
import joblib # for saving xgb model

# statistics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as acc

from tqdm import tqdm
tqdm.pandas()

# for generating text features
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


DATA_PATH = '/content/gdrive/My Drive/avito'            # в этой папке лежат данные и весь код
FASTTEXT_PATH= os.path.join(DATA_PATH, "fastText")      # папка, куда скачивается fastText

# Обучаем fasttext

In [0]:
%%shell
# нужно сначала удалить старую папку fasttext, иначе не сработает
path='/content/gdrive/My Drive/avito/fastText'
git clone https://github.com/facebookresearch/fastText.git "$path"
cd "$path"
sudo make
sudo pip3 install .

In [0]:
import fastText

In [9]:
%%shell

head -n 398400 /content/gdrive/My\ Drive/avito/fasttext_input.txt > /content/gdrive/My\ Drive/avito/fastText/fasttext_input.train.txt
tail -n 91117 /content/gdrive/My\ Drive/avito/fasttext_input.txt > /content/gdrive/My\ Drive/avito/fastText/fasttext_input.val.txt



In [0]:
ff_model = fastText.FastText
ff_model = ff_model.train_supervised(input=os.path.join(FASTTEXT_PATH, "fasttext_input.train.txt"))

In [0]:
def make_fasttext_predictions(file_in: str, file_out: str):
  """
  Записывает в файл file_out предсказания fasttext по данным из файла file_in
  
  формат выходного файла: первые k столбцов - id категорий в порядке убывания их 
  вероятностей, предсказанных fastText, за ними k столбцов веростностей.
  Т.е. (k+1)й столбец - это веротность для категории в 1м столбце - (k+2) - для 
  категории из 2го столбца
  
  Затем файл file_out можно загрузить в pandas DataFrame 
  """
    preds = []
    with open(os.path.join(DATA_PATH, file_in), "r") as f:
        for line in tqdm(f):
          line = line.strip(' \n')
          
          # предсказывает только топ 5 самых вероятных категорий
          pred, prob = ff_model.predict(line, k=5)
          
          # 
          pred = [int(i[9:]) for i in pred]
          pred.extend([round(i, 3) for i in prob])
          preds.append(pred)


    df_preds = pd.DataFrame(preds)
    df_preds.to_csv(os.path.join(DATA_PATH, file_out))

In [14]:
make_fasttext_predictions("fastText/fasttext_input.train.txt", "5_preds_of_fasttext.train.csv")
make_fasttext_predictions("fastText/fasttext_input.val.txt", "5_preds_of_fasttext.val.csv")
make_fasttext_predictions("fasttext_test.txt", "5_preds_of_fasttext.test.csv")

398400it [00:27, 14677.28it/s]
91117it [00:06, 14386.79it/s]
243166it [00:17, 13610.74it/s]


# Делаем фичи для xgboost'a

In [0]:
# избавимся от os.path.join при чтении и записи датафрейма

def read_data(file, **kwargs):
    return pd.read_csv(os.path.join(DATA_PATH, file), **kwargs)
  
def save_data(data, file, **kwargs):
    assert isinstance(data, pd.DataFrame)
    return data.to_csv(os.path.join(DATA_PATH, file), **kwargs)

### train

In [0]:
data = read_data("train.csv")

In [0]:
# https://www.kaggle.com/lalitparihar44/detailed-text-based-feature-engineering

data['length_of_ad'] = data.description.apply(lambda x: len(x))
data['num_of_words'] = data.description.apply(lambda x: len(str(x).split()))
data['average_word_length'] = data.apply(lambda x: x['length_of_ad'] / x['num_of_words'], axis=1)
data['price_of_single_word'] = data.apply(lambda x: x['price'] / x['num_of_words'], axis=1)
data['price_of_single_char'] = data.apply(lambda x: x['price'] / x['length_of_ad'], axis=1)
data['average_price_of_single_word'] = data.apply(lambda x: x['average_word_length'] * x['price_of_single_char'], axis=1)

In [0]:
stop_words = set(stopwords.words('russian'))
data['count_of_stop_words'] = data.description.apply(lambda x: len([w for w in str(x).lower().split() if w in stop_words]) )

In [0]:
df_preds_train = read_data("5_preds_of_fasttext.train.csv")
df_preds_val = read_data("5_preds_of_fasttext.val.csv")

In [23]:
df_preds_train.shape, df_preds_val.shape, data.shape

((398400, 11), (91117, 11), (489517, 12))

In [24]:
data_train = data[:df_preds_train.shape[0]]
data_val = data[df_preds_train.shape[0]:]

to_del = ["item_id", "description", "title"]
data_train.drop(to_del, axis=1, inplace=True)
data_val.drop(to_del, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [0]:
data_train = pd.concat([data_train, df_preds_train], axis=1).drop("Unnamed: 0", axis=1)
data_val = pd.concat([data_val.reset_index(drop=True), df_preds_val], axis=1).drop("Unnamed: 0", axis=1)

In [26]:
data_val.head(3)

Unnamed: 0,price,category_id,length_of_ad,num_of_words,average_word_length,price_of_single_word,price_of_single_char,average_price_of_single_word,count_of_stop_words,0,1,2,3,4,5,6,7,8,9
0,5500.0,6,169,30,5.633333,183.333333,32.544379,183.333333,3,6,10,0,9,2,0.999,0.0,0.0,0.0,0.0
1,13500.0,50,125,18,6.944444,750.0,108.0,750.0,4,40,10,31,36,39,0.13,0.114,0.084,0.074,0.061
2,500.0,43,242,32,7.5625,15.625,2.066116,15.625,6,43,51,19,44,20,0.974,0.019,0.001,0.001,0.001


In [27]:
data_train.head(3)

Unnamed: 0,price,category_id,length_of_ad,num_of_words,average_word_length,price_of_single_word,price_of_single_char,average_price_of_single_word,count_of_stop_words,0,1,2,3,4,5,6,7,8,9
0,1000.0,19,26,3,8.666667,333.333333,38.461538,333.333333,0,19,23,27,44,47,0.945,0.053,0.001,0.0,0.0
1,1250.0,22,83,11,7.545455,113.636364,15.060241,113.636364,1,22,16,39,38,37,0.988,0.007,0.001,0.001,0.001
2,13000.0,37,724,102,7.098039,127.45098,17.955801,127.45098,18,37,21,47,24,17,0.799,0.108,0.022,0.02,0.017


In [28]:
data_train.columns

Index(['price', 'category_id', 'length_of_ad', 'num_of_words',
       'average_word_length', 'price_of_single_word', 'price_of_single_char',
       'average_price_of_single_word', 'count_of_stop_words', '0', '1', '2',
       '3', '4', '5', '6', '7', '8', '9'],
      dtype='object')

In [0]:
save_data(data_train, "xgb_train_data.csv", index=False)
save_data(data_val, "xgb_val_data.csv", index=False)

### test

In [30]:
data_test = read_data("test.csv")
data_test.head(3)

Unnamed: 0,item_id,title,description,price
0,489517,Стоик журнальный сталь,продам журнальный столик изготавливаю столы из...,10000.0
1,489518,iPhone 5 64Gb,"Телефон в хорошем состоянии. Комплект, гаранти...",12500.0
2,489519,Утеплитель,ТЕПЛОПЕЛЕН-ЛИДЕР ТЕПЛА!!! Толщина утеплителя :...,250.0


In [0]:
data_test['length_of_ad'] = data_test.description.apply(lambda x: len(x))
data_test['num_of_words'] = data_test.description.apply(lambda x: len(str(x).split()))
data_test['average_word_length'] = data_test.apply(lambda x: x['length_of_ad'] / x['num_of_words'], axis=1)
data_test['price_of_single_word'] = data_test.apply(lambda x: x['price'] / x['num_of_words'], axis=1)
data_test['price_of_single_char'] = data_test.apply(lambda x: x['price'] / x['length_of_ad'], axis=1)
data_test['average_price_of_single_word'] = data_test.apply(lambda x: x['average_word_length'] * x['price_of_single_char'], axis=1)

stop_words = set(stopwords.words('russian'))
data_test['count_of_stop_words'] = data_test.description.apply(lambda x: len([w for w in str(x).lower().split() if w in stop_words]) )

df_preds_test= read_data("5_preds_of_fasttext.test.csv")


In [0]:
to_del = ["item_id", "description", "title"]
data_test.drop(to_del, axis=1, inplace=True)

data_test = pd.concat([data_test, df_preds_test], axis=1).drop("Unnamed: 0", axis=1)

In [33]:
data_test.columns

Index(['price', 'length_of_ad', 'num_of_words', 'average_word_length',
       'price_of_single_word', 'price_of_single_char',
       'average_price_of_single_word', 'count_of_stop_words', '0', '1', '2',
       '3', '4', '5', '6', '7', '8', '9'],
      dtype='object')

In [34]:
data_test.head(3)

Unnamed: 0,price,length_of_ad,num_of_words,average_word_length,price_of_single_word,price_of_single_char,average_price_of_single_word,count_of_stop_words,0,1,2,3,4,5,6,7,8,9
0,10000.0,168,24,7.0,416.666667,59.52381,416.666667,7,22,23,19,18,20,0.775,0.129,0.07,0.008,0.005
1,12500.0,213,30,7.1,416.666667,58.685446,416.666667,5,0,6,10,2,11,0.542,0.356,0.051,0.03,0.019
2,250.0,1064,133,8.0,1.879699,0.234962,1.879699,25,15,21,25,27,17,0.883,0.06,0.031,0.004,0.003


In [0]:
save_data(data_test, "xgb_test_data.csv", index=False)

# Выделяем иерархии их файла category csv


In [36]:
cat = read_data('category.csv')
cat.head(3)

Unnamed: 0,category_id,name
0,0,Бытовая электроника|Телефоны|iPhone
1,1,Бытовая электроника|Ноутбуки
2,2,Бытовая электроника|Телефоны|Samsung


In [37]:
# https://chrisalbon.com/python/data_wrangling/pandas_expand_cells_containing_lists/

cat_hier = cat.name.apply(lambda x: x.split('|')).apply(pd.Series)
cat_hier = cat_hier.rename(columns = lambda x : 'cat_hier_' + str(x))
cat_hier = pd.concat([cat_hier, cat.category_id], axis=1)
cat_hier.to_csv(os.path.join(DATA_PATH, "cat_hier.csv"), index=False)
cat_hier.head(3)

Unnamed: 0,cat_hier_0,cat_hier_1,cat_hier_2,cat_hier_3,category_id
0,Бытовая электроника,Телефоны,iPhone,,0
1,Бытовая электроника,Ноутбуки,,,1
2,Бытовая электроника,Телефоны,Samsung,,2


# Стакаем xgboost поверх fasttext

In [38]:
data_train = read_data("xgb_train_data.csv")
data_val = read_data("xgb_val_data.csv")
data_val.tail(3)

Unnamed: 0,price,category_id,length_of_ad,num_of_words,average_word_length,price_of_single_word,price_of_single_char,average_price_of_single_word,count_of_stop_words,0,1,2,3,4,5,6,7,8,9
91114,2900.0,25,166,22,7.545455,131.818182,17.46988,131.818182,6,25,23,15,21,18,1.0,0.0,0.0,0.0,0.0
91115,300.0,9,47,6,7.833333,50.0,6.382979,50.0,1,9,2,3,0,6,0.665,0.335,0.0,0.0,0.0
91116,19000.0,50,572,72,7.944444,263.888889,33.216783,263.888889,5,50,52,49,3,30,1.0,0.0,0.0,0.0,0.0


In [0]:
X_train, y_train = data_train.drop('category_id', axis=1).values, data_train.category_id.values
X_val, y_val = data_val.drop('category_id', axis=1).values, data_val.category_id.values

In [0]:
file_saved_xgb = os.path.join(DATA_PATH, "xgb_model_600_est")

In [0]:
xgb = XGBClassifier(n_estimators=600, tree_method='gpu_hist', objective="multi:softprob", num_class=54)
xgb.fit(X_train, y_train)
joblib.dump(xgb, file_saved_xgb) 

In [0]:
xgb = joblib.load(file_saved_xgb)

In [0]:
preds = xgb.predict(X_val)

# Оцениваем качество модели по accuracy

## Выделяем иерархии из файла category csv

In [50]:
cat = read_data('category.csv')
cat.head(3)

Unnamed: 0,category_id,name
0,0,Бытовая электроника|Телефоны|iPhone
1,1,Бытовая электроника|Ноутбуки
2,2,Бытовая электроника|Телефоны|Samsung


In [51]:
# https://chrisalbon.com/python/data_wrangling/pandas_expand_cells_containing_lists/

cat_hier = cat.name.apply(lambda x: x.split('|')).apply(pd.Series)
cat_hier = cat_hier.rename(columns = lambda x : 'cat_hier_' + str(x))
cat_hier = pd.concat([cat_hier, cat.category_id], axis=1)
cat_hier.to_csv(os.path.join(DATA_PATH, "cat_hier.csv"), index=False)
cat_hier.head(3)

Unnamed: 0,cat_hier_0,cat_hier_1,cat_hier_2,cat_hier_3,category_id
0,Бытовая электроника,Телефоны,iPhone,,0
1,Бытовая электроника,Ноутбуки,,,1
2,Бытовая электроника,Телефоны,Samsung,,2


## Считаем accuracy по каждому уровню иерархии

In [0]:
def calc_hier_acc(y_val, preds):
  """
  Возвращает accuracy по каждому уровню иерархии категорий
  
  y_val - массив категорий
  preds - массив предсказанных категорий
  """
  assert isinstance(y_val, np.ndarray) and \
         isinstance(preds, np.ndarray) and \
         len(y_val.shape) == 1 and \
         len(preds.shape) == 1 and \
         y_val.shape == preds.shape
  
  hier_acc = []
  for hier_level in range(1, 4):
    col_name = f'cat_hier_{hier_level}'
    cat = read_data('cat_hier.csv').loc[:, [col_name, 'category_id']]
    
    # https://stackoverflow.com/questions/18695605/python-pandas-dataframe-to-dictionary
    d = cat.set_index('category_id')[col_name].to_dict()
    d = {k: g["category_id"].tolist() for k, g in cat.groupby(col_name)}

    to_return = {}
    for cat_name, cat_id_list in d.items():
      idx = np.argwhere(np.isin(y_val.reshape(-1), cat_id_list)).reshape(-1)
      to_return[cat_name] = np.average(y_val[idx].reshape(-1) == preds[idx].reshape(-1))
      to_return[cat_name] = np.around(to_return[cat_name], 3)

    hier_acc.append(to_return)
    
  return hier_acc

In [77]:
acc_hier = calc_hier_acc(y_val, preds)
acc_hier

[{'Аудио и видео': 0.943,
  'Билеты и путешествия': 0.991,
  'Бытовая техника': 0.961,
  'Велосипеды': 0.939,
  'Игры, приставки и программы': 0.932,
  'Книги и журналы': 0.839,
  'Коллекционирование': 0.795,
  'Красота и здоровье': 0.894,
  'Мебель и интерьер': 0.79,
  'Музыкальные инструменты': 0.905,
  'Настольные компьютеры': 0.952,
  'Ноутбуки': 0.952,
  'Одежда, обувь, аксессуары': 0.843,
  'Охота и рыбалка': 0.877,
  'Планшеты и электронные книги': 0.937,
  'Посуда и товары для кухни': 0.879,
  'Растения': 0.94,
  'Ремонт и строительство': 0.902,
  'Спорт и отдых': 0.806,
  'Телефоны': 0.896,
  'Товары для детей и игрушки': 0.834,
  'Товары для компьютера': 0.951,
  'Часы и украшения': 0.947},
 {'Nokia': 0.917,
  'Samsung': 0.892,
  'Sony': 0.938,
  'iPhone': 0.925,
  'Аксессуары': 0.803,
  'Акустика, колонки, сабвуферы': 0.934,
  'Гитары и другие струнные': 0.931,
  'Двери': 0.959,
  'Детская мебель': 0.729,
  'Детские коляски': 0.921,
  'Для дома': 0.975,
  'Для кухни': 0.948,

In [0]:
import json
with open(os.path.join(DATA_PATH, 'Accuracy по каждому уровню иерархии.txt'), 'w') as outfile:  
    json.dump(acc_hier, outfile)

## Считаем среднюю accuracy и accuracy по category_id без разделения на уровни иерархии

In [0]:
def calc_accuracy(y_test, y_pred):
  """
  Возвращает среднюю accuracy и accuracy по category_id без разделения на 
  уровни иерархии
  
  Формат вывода:
  
  <category_id>: <accuracy>
  """
  t = confusion_matrix(y_test, y_pred)


  q = []
  for i in range(t.shape[0]):
    accuracy_i = t[i, i] / t[:, i].sum()
    accuracy_i = round(accuracy_i, 3)
    q.append(accuracy_i)

  print("mean accuracy")
  print(np.mean(q))

  print("----------")
  print("accuracy by categories\n")

  for idx, i in enumerate(q):
    print(f"{idx}: {i}")

In [75]:
calc_accuracy(y_val, preds)

mean accuracy
0.8750185185185186
----------
accuracy by categories

0: 0.918
1: 0.947
2: 0.884
3: 0.926
4: 0.914
5: 0.955
6: 0.854
7: 0.959
8: 0.934
9: 0.869
10: 0.896
11: 0.942
12: 0.921
13: 0.953
14: 0.967
15: 0.849
16: 0.843
17: 0.864
18: 0.814
19: 0.626
20: 0.841
21: 0.907
22: 0.901
23: 0.504
24: 0.957
25: 0.961
26: 0.99
27: 0.884
28: 0.936
29: 0.959
30: 0.79
31: 0.712
32: 0.908
33: 0.888
34: 0.748
35: 0.945
36: 0.932
37: 0.89
38: 0.686
39: 0.932
40: 0.954
41: 0.854
42: 0.864
43: 0.841
44: 0.609
45: 0.964
46: 0.889
47: 0.682
48: 0.994
49: 0.869
50: 0.953
51: 0.856
52: 0.871
53: 0.945


# Делаем предсказания для тестовой выборки

In [79]:
data_test = read_data("xgb_test_data.csv")
data_test.head(3)

Unnamed: 0,price,length_of_ad,num_of_words,average_word_length,price_of_single_word,price_of_single_char,average_price_of_single_word,count_of_stop_words,0,1,2,3,4,5,6,7,8,9
0,10000.0,168,24,7.0,416.666667,59.52381,416.666667,7,22,23,19,18,20,0.775,0.129,0.07,0.008,0.005
1,12500.0,213,30,7.1,416.666667,58.685446,416.666667,5,0,6,10,2,11,0.542,0.356,0.051,0.03,0.019
2,250.0,1064,133,8.0,1.879699,0.234962,1.879699,25,15,21,25,27,17,0.883,0.06,0.031,0.004,0.003


In [80]:
data_test.columns

Index(['price', 'length_of_ad', 'num_of_words', 'average_word_length',
       'price_of_single_word', 'price_of_single_char',
       'average_price_of_single_word', 'count_of_stop_words', '0', '1', '2',
       '3', '4', '5', '6', '7', '8', '9'],
      dtype='object')

In [0]:
preds_test = xgb.predict(data_test.values)

In [82]:
t = read_data('test.csv')
t.head(3)

Unnamed: 0,item_id,title,description,price
0,489517,Стоик журнальный сталь,продам журнальный столик изготавливаю столы из...,10000.0
1,489518,iPhone 5 64Gb,"Телефон в хорошем состоянии. Комплект, гаранти...",12500.0
2,489519,Утеплитель,ТЕПЛОПЕЛЕН-ЛИДЕР ТЕПЛА!!! Толщина утеплителя :...,250.0


In [83]:
test_preds = pd.DataFrame([t.item_id.values, preds_test], ).T
test_preds.columns = ['item_id', 'category_id']
test_preds = test_preds.set_index('item_id')
test_preds.head(3)

Unnamed: 0_level_0,category_id
item_id,Unnamed: 1_level_1
489517,22
489518,0
489519,15


In [0]:
assert test_preds.shape[0] == t.shape[0]

In [0]:
save_data(test_preds, "predictions_for_test_data.csv")