In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import TruncatedSVD
from scipy.spatial import distance

Посмотрим на данные.

In [2]:
train = pd.read_csv('/kaggle/input/data-match/train.csv')

Описание данных.
* offer_depersanalised и goods_depersanalised - идентификаторы предложения и товара соответственно
* sum_length - суммарная длина пары названий и атрибутов в символах
* attrs+title_score - вероятность матча от рескоринговой модели
* offer_price и item_price - цена предложения и товара соответственно
* goods_category_id - категория товара
* id - идентификатор пары offer_depersanalised + $ + goods_depersanalised
* target (только в train.csv) - метка класса (0 - не матч, 1 - матч)

In [3]:
train.head()

Unnamed: 0,offer_depersanalised,goods_depersanalised,sum_length,attrs+title_score,offer_price,goods_price,goods_category_id,target,id
0,295140,1396793,37,0.027267,1070,,14.0,0,295140$1396793
1,65291,1396586,38,0.050415,698,,14.0,0,65291$1396586
2,39232,1396244,38,0.08728,837,,14.0,0,39232$1396244
3,39232,1396513,38,0.08728,837,,14.0,0,39232$1396513
4,65052,1396237,38,0.079773,1085,,14.0,0,65052$1396237


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518441 entries, 0 to 2518440
Data columns (total 9 columns):
 #   Column                Dtype  
---  ------                -----  
 0   offer_depersanalised  int64  
 1   goods_depersanalised  int64  
 2   sum_length            int64  
 3   attrs+title_score     float64
 4   offer_price           int64  
 5   goods_price           float64
 6   goods_category_id     float64
 7   target                int64  
 8   id                    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 172.9+ MB


Посмотрим долю пропусков.

In [5]:
train.isna().mean()

offer_depersanalised    0.000000
goods_depersanalised    0.000000
sum_length              0.000000
attrs+title_score       0.000000
offer_price             0.000000
goods_price             0.161722
goods_category_id       0.000331
target                  0.000000
id                      0.000000
dtype: float64

Посмотрим количество полных дубликатов и по ключам предложений и товаров маркетплейса.

In [6]:
sum(train.duplicated())

0

In [7]:
sum(train['offer_depersanalised'].duplicated())

2018441

In [8]:
sum(train['goods_depersanalised'].duplicated())

925834

In [9]:
sum(train['id'].duplicated())

18441

Посмотрим на тестовые данные, их пропуски и дубликаты.

In [10]:
test = pd.read_csv('/kaggle/input/data-match/test.csv')

In [11]:
test.head()

Unnamed: 0,offer_depersanalised,goods_depersanalised,sum_length,attrs+title_score,offer_price,goods_price,goods_category_id,id
0,64819,1396468,38,0.046997,368,,14.0,64819$1396468
1,64819,1396235,38,0.046997,368,,14.0,64819$1396235
2,64819,1396318,38,0.046997,368,,14.0,64819$1396318
3,359959,1396281,40,0.060211,634,,14.0,359959$1396281
4,142700,717657,40,0.00037,14924,31840.0,2.0,142700$717657


In [12]:
sum(test.duplicated())

0

In [13]:
sum(test.duplicated(['goods_depersanalised']))

46848

In [14]:
sum(test.duplicated(['offer_depersanalised']))

291068

Посмотрим на пример предсказания.

In [15]:
sample_submission = pd.read_csv('/kaggle/input/data-match/sample_submission.csv')

In [16]:
sample_submission

Unnamed: 0,id,target
0,64819$1396468,1
1,64819$1396235,1
2,64819$1396318,0
3,359959$1396281,0
4,142700$717657,0
...,...,...
363830,122775$310950,1
363831,419632$342465,0
363832,369393$130129,1
363833,24514$130142,0


Загрузим эмбеддинги изображений товаров маркетплейса.

In [17]:
goods_image_embed_deperson = np.load('/kaggle/input/data-match/goods_image_embed_deperson.npy')

In [18]:
goods_image_embed_deperson

array([[ 1.1158133 ,  1.9842914 ,  0.5167014 , ..., -0.69889563,
         0.11544477, -3.1679373 ],
       [ 0.99327075,  1.9756604 ,  0.23954017, ...,  0.00989665,
         0.49230903, -3.0051124 ],
       [-0.36902365, -2.316401  ,  1.3042173 , ..., -1.6195703 ,
         0.7884472 , -2.6850224 ],
       ...,
       [-0.16678151, -0.6597981 , -0.9382939 , ...,  0.3883793 ,
        -0.21414155,  0.41041985],
       [-0.16678151, -0.6597981 , -0.9382939 , ...,  0.3883793 ,
        -0.21414155,  0.41041985],
       [-0.16678151, -0.6597981 , -0.9382939 , ...,  0.3883793 ,
        -0.21414155,  0.41041985]], dtype=float32)

In [19]:
goods_image_embed_deperson.shape

(317707, 256)

Уменьшим размерность векторов с 256 признаков до 6.

In [20]:
truncater=TruncatedSVD(n_components=6)

goods_image_embed_deperson_trunc = truncater.fit_transform(goods_image_embed_deperson)

In [21]:
goods_image_embed_deperson_trunc.shape

(317707, 6)

Загрузим ключи векторов товаров маркетплейса.

In [22]:
goods_image_items_deperson = np.load('/kaggle/input/data-match/goods_image_items_deperson.npy')

In [23]:
goods_image_items_deperson

array(['37', '39', '49', ..., '1749527', '1749528', '1749541'],
      dtype='<U7')

In [24]:
goods_image_items_deperson.shape

(317707,)

Объединим координаты векторов и ключи векторов в массив.

In [25]:
goods_image_embed_items = np.column_stack((goods_image_items_deperson, goods_image_embed_deperson_trunc))

In [26]:
goods_image_embed_items.shape

(317707, 7)

Переведем массив в датафрейм. Составляющим векторов присвоим буквенные названия.

In [27]:
df_goods_image_embed_items = pd.DataFrame(goods_image_embed_items, columns=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

Изменим тип данных у ключей векторов для дальнейшего использования функции *merge*.

In [28]:
df_goods_image_embed_items['a']= pd.to_numeric(df_goods_image_embed_items['a'], downcast='integer')

In [29]:
df_goods_image_embed_items.head()

Unnamed: 0,a,b,c,d,e,f,g
0,37,2.4296458,2.3149526,-0.30108428,-0.22264354,0.43209946,-0.48809153
1,39,3.413238,1.9541973,0.46522567,0.5584674,1.2318193,-0.27816373
2,49,11.1171055,-5.0439005,-2.363701,-0.39086372,-5.428771,-1.1384416
3,52,7.663029,-4.389761,1.7605411,-4.533157,-0.17415957,2.6139426
4,67,6.42074,-4.1461554,-3.327963,-0.9286448,-0.56479514,0.98654366


Избавимся от полных дубликатов векторов.

In [30]:
sum(df_goods_image_embed_items.duplicated())

540

In [31]:
df_goods_image_embed_items = df_goods_image_embed_items.drop_duplicates().reset_index(drop=True)

In [32]:
sum(df_goods_image_embed_items.duplicated())

0

Избавимся от дубликатов в ключах векторов, не уверен в этом решении, хотелось бы комментарий ревьюра получить. Сделал так, по причине того, что в дальнейшем при объединении train и test с векторами, увеличивается число строк в датасетах, что критично для test, где нам нужно сохранить исходное число строк.

In [33]:
sum(df_goods_image_embed_items['a'].duplicated())

4613

In [34]:
df_goods_image_embed_items = df_goods_image_embed_items.drop_duplicates(subset='a')

In [35]:
sum(df_goods_image_embed_items['a'].duplicated())

0

Объединим train с векторами изображений товаров маркетплейса.

In [36]:
train = train.merge(df_goods_image_embed_items, how ='left', left_on=['offer_depersanalised'], right_on=['a'])

In [37]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518441 entries, 0 to 2518440
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   offer_depersanalised  int64  
 1   goods_depersanalised  int64  
 2   sum_length            int64  
 3   attrs+title_score     float64
 4   offer_price           int64  
 5   goods_price           float64
 6   goods_category_id     float64
 7   target                int64  
 8   id                    object 
 9   a                     float64
 10  b                     object 
 11  c                     object 
 12  d                     object 
 13  e                     object 
 14  f                     object 
 15  g                     object 
dtypes: float64(4), int64(5), object(7)
memory usage: 307.4+ MB


Объединим test с векторами изображений товаров маркетплейса.

In [38]:
test = test.merge(df_goods_image_embed_items, how ='left', left_on=['offer_depersanalised'], right_on=['a'])

In [39]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363835 entries, 0 to 363834
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   offer_depersanalised  363835 non-null  int64  
 1   goods_depersanalised  363835 non-null  int64  
 2   sum_length            363835 non-null  int64  
 3   attrs+title_score     363835 non-null  float64
 4   offer_price           363835 non-null  int64  
 5   goods_price           304864 non-null  float64
 6   goods_category_id     363704 non-null  float64
 7   id                    363835 non-null  object 
 8   a                     95265 non-null   float64
 9   b                     95265 non-null   object 
 10  c                     95265 non-null   object 
 11  d                     95265 non-null   object 
 12  e                     95265 non-null   object 
 13  f                     95265 non-null   object 
 14  g                     95265 non-null   object 
dtype

Выполним те же преобразования с векторами атрибутов, описаний, изображений товаров с маркетплейса и предложений.

In [40]:
goods_title_embed_deperson = np.load('/kaggle/input/data-match/goods_title_embed_deperson.npy')

In [41]:
goods_title_embed_deperson.shape

(1760568, 64)

In [42]:
truncater=TruncatedSVD(n_components=4)

goods_title_embed_deperson_trunc = truncater.fit_transform(goods_title_embed_deperson)

In [43]:
goods_title_items_deperson = np.load('/kaggle/input/data-match/goods_title_items_deperson.npy')

In [44]:
goods_title_items_deperson

array(['0', '1', '2', ..., '1760565', '1760566', '1760567'], dtype='<U7')

In [45]:
goods_title_items_deperson.shape

(1760568,)

In [46]:
goods_title_embed_items = np.column_stack((goods_title_items_deperson, goods_title_embed_deperson_trunc))

In [47]:
goods_title_embed_items.shape

(1760568, 5)

In [48]:
goods_title_embed_items

array([['0', '-0.0064799343', '0.00063728966', '-0.110789',
        '-0.07558478'],
       ['1', '0.0391356', '-0.028748887', '-0.020673236', '0.05278282'],
       ['2', '-0.08320774', '0.11492841', '0.15391935', '0.06347214'],
       ...,
       ['1760565', '-0.089437425', '0.020813163', '0.007987914',
        '-0.10526249'],
       ['1760566', '0.08821404', '-0.11697758', '0.009047389',
        '-0.06367203'],
       ['1760567', '0.0041457033', '-0.06495126', '-0.009949893',
        '0.05550994']], dtype='<U32')

In [49]:
df_goods_title_embed_items = pd.DataFrame(goods_title_embed_items, columns=['h', 'i', 'j', 'k', 'l'])

In [50]:
df_goods_title_embed_items['h'] = pd.to_numeric(df_goods_title_embed_items['h'], downcast='integer')

In [51]:
df_goods_title_embed_items.head()

Unnamed: 0,h,i,j,k,l
0,0,-0.0064799343,0.00063728966,-0.110789,-0.07558478
1,1,0.0391356,-0.028748887,-0.020673236,0.05278282
2,2,-0.08320774,0.11492841,0.15391935,0.06347214
3,3,0.07353868,0.05434991,-0.03946342,0.012544699
4,4,-0.0022753733,0.037150793,-0.072646625,0.015425535


In [52]:
sum(df_goods_title_embed_items.duplicated())

0

In [53]:
sum(df_goods_title_embed_items['h'].duplicated())

0

In [54]:
train = train.merge(df_goods_title_embed_items, how ='left', left_on=['goods_depersanalised'], right_on=['h'])

In [55]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518441 entries, 0 to 2518440
Data columns (total 21 columns):
 #   Column                Dtype  
---  ------                -----  
 0   offer_depersanalised  int64  
 1   goods_depersanalised  int64  
 2   sum_length            int64  
 3   attrs+title_score     float64
 4   offer_price           int64  
 5   goods_price           float64
 6   goods_category_id     float64
 7   target                int64  
 8   id                    object 
 9   a                     float64
 10  b                     object 
 11  c                     object 
 12  d                     object 
 13  e                     object 
 14  f                     object 
 15  g                     object 
 16  h                     int32  
 17  i                     object 
 18  j                     object 
 19  k                     object 
 20  l                     object 
dtypes: float64(4), int32(1), int64(5), object(11)
memory usage: 393.9+ MB


In [56]:
test = test.merge(df_goods_title_embed_items, how ='left', left_on=['goods_depersanalised'], right_on=['h'])

In [57]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363835 entries, 0 to 363834
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   offer_depersanalised  363835 non-null  int64  
 1   goods_depersanalised  363835 non-null  int64  
 2   sum_length            363835 non-null  int64  
 3   attrs+title_score     363835 non-null  float64
 4   offer_price           363835 non-null  int64  
 5   goods_price           304864 non-null  float64
 6   goods_category_id     363704 non-null  float64
 7   id                    363835 non-null  object 
 8   a                     95265 non-null   float64
 9   b                     95265 non-null   object 
 10  c                     95265 non-null   object 
 11  d                     95265 non-null   object 
 12  e                     95265 non-null   object 
 13  f                     95265 non-null   object 
 14  g                     95265 non-null   object 
 15  

In [58]:
offer_image_embed_deperson = np.load('/kaggle/input/data-match/offer_image_embed_deperson.npy')

In [59]:
offer_image_embed_deperson.shape

(457586, 256)

In [60]:
truncater=TruncatedSVD(n_components=6)

offer_image_embed_deperson_trunc = truncater.fit_transform(offer_image_embed_deperson)

In [61]:
offer_image_embed_deperson_trunc.shape

(457586, 6)

In [62]:
offer_image_items_deperson = np.load('/kaggle/input/data-match/offer_image_items_deperson.npy')

In [63]:
offer_image_items_deperson.shape

(457586,)

In [64]:
offer_image_embed_items = np.column_stack((offer_image_items_deperson, offer_image_embed_deperson_trunc))

In [65]:
offer_image_embed_items.shape

(457586, 7)

In [66]:
df_offer_image_embed_items = pd.DataFrame(offer_image_embed_items, columns=['m', 'n', 'o', 'p', 'q','r','s'])

In [67]:
df_offer_image_embed_items['m'] = pd.to_numeric(df_offer_image_embed_items['m'], downcast='integer')

In [68]:
df_offer_image_embed_items.head()

Unnamed: 0,m,n,o,p,q,r,s
0,140,9.216668,-4.3278,-2.7471035,2.6643252,-3.0634468,-5.3629756
1,185,5.3209295,-4.2166405,-1.2048945,1.6544389,-1.533451,-3.1928148
2,187,1.1624435,-1.0486931,2.0026202,-0.76923347,1.2949291,-0.45382997
3,206,6.4263725,-4.528717,-1.8348314,2.942133,-1.8887752,-5.126713
4,242,11.58403,-1.737507,-1.147138,0.5426909,3.801049,1.3217763


In [69]:
sum(df_offer_image_embed_items.duplicated())

0

In [70]:
sum(df_offer_image_embed_items['m'].duplicated())

1

In [71]:
df_offer_image_embed_items = df_offer_image_embed_items.drop_duplicates(subset='m')

In [72]:
sum(df_offer_image_embed_items['m'].duplicated())

0

In [73]:
train = train.merge(df_offer_image_embed_items, how ='left', left_on=['offer_depersanalised'], right_on=['m'])

In [74]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518441 entries, 0 to 2518440
Data columns (total 28 columns):
 #   Column                Dtype  
---  ------                -----  
 0   offer_depersanalised  int64  
 1   goods_depersanalised  int64  
 2   sum_length            int64  
 3   attrs+title_score     float64
 4   offer_price           int64  
 5   goods_price           float64
 6   goods_category_id     float64
 7   target                int64  
 8   id                    object 
 9   a                     float64
 10  b                     object 
 11  c                     object 
 12  d                     object 
 13  e                     object 
 14  f                     object 
 15  g                     object 
 16  h                     int32  
 17  i                     object 
 18  j                     object 
 19  k                     object 
 20  l                     object 
 21  m                     float64
 22  n                     object 
 23  o      

In [75]:
test = test.merge(df_offer_image_embed_items, how ='left', left_on=['offer_depersanalised'], right_on=['m'])

In [76]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363835 entries, 0 to 363834
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   offer_depersanalised  363835 non-null  int64  
 1   goods_depersanalised  363835 non-null  int64  
 2   sum_length            363835 non-null  int64  
 3   attrs+title_score     363835 non-null  float64
 4   offer_price           363835 non-null  int64  
 5   goods_price           304864 non-null  float64
 6   goods_category_id     363704 non-null  float64
 7   id                    363835 non-null  object 
 8   a                     95265 non-null   float64
 9   b                     95265 non-null   object 
 10  c                     95265 non-null   object 
 11  d                     95265 non-null   object 
 12  e                     95265 non-null   object 
 13  f                     95265 non-null   object 
 14  g                     95265 non-null   object 
 15  

In [77]:
offer_title_embed_deperson = np.load('/kaggle/input/data-match/offer_title_embed_deperson.npy')

In [78]:
offer_title_embed_deperson.shape

(572767, 64)

In [79]:
truncater=TruncatedSVD(n_components=4)

offer_title_embed_deperson_trunc = truncater.fit_transform(offer_title_embed_deperson)

In [80]:
offer_title_items_deperson = np.load('/kaggle/input/data-match/offer_title_items_deperson.npy')

In [81]:
offer_title_items_deperson.shape

(572767,)

In [82]:
offer_title_embed_items = np.column_stack((offer_title_items_deperson, offer_title_embed_deperson_trunc))

In [83]:
offer_title_embed_items.shape

(572767, 5)

In [84]:
offer_title_embed_items

array([['477447', '-0.04602705', '0.013841896', '0.0030255322',
        '-0.0039220294'],
       ['95232', '0.15333539', '-0.089836225', '0.045917198',
        '-0.032234155'],
       ['117886', '0.018127974', '0.14837666', '0.08654299',
        '0.014025525'],
       ...,
       ['30926', '-0.026916072', '-0.13291276', '-0.018458905',
        '-0.117343344'],
       ['209577', '0.3481732', '-0.001877852', '0.11711394',
        '-0.11466495'],
       ['239103', '-0.102851555', '-0.026263151', '-0.119837575',
        '-0.015136348']], dtype='<U32')

In [85]:
df_offer_title_embed_items = pd.DataFrame(offer_title_embed_items, columns=['t', 'v', 'w', 'x', 'y'])

In [86]:
df_offer_title_embed_items['t'] = pd.to_numeric(df_offer_title_embed_items['t'], downcast='integer')

In [87]:
df_offer_title_embed_items.head()

Unnamed: 0,t,v,w,x,y
0,477447,-0.04602705,0.013841896,0.0030255322,-0.0039220294
1,95232,0.15333539,-0.089836225,0.045917198,-0.032234155
2,117886,0.018127974,0.14837666,0.08654299,0.014025525
3,218467,0.059300955,-0.005568855,0.12568802,0.0931678
4,399432,-0.055361673,0.08133987,0.03690753,0.04145802


In [88]:
sum(df_offer_title_embed_items.duplicated())

0

In [89]:
sum(df_offer_title_embed_items['t'].duplicated())

0

In [90]:
train = train.merge(df_offer_title_embed_items, how ='left', left_on=['offer_depersanalised'], right_on=['t'])

In [91]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518441 entries, 0 to 2518440
Data columns (total 33 columns):
 #   Column                Dtype  
---  ------                -----  
 0   offer_depersanalised  int64  
 1   goods_depersanalised  int64  
 2   sum_length            int64  
 3   attrs+title_score     float64
 4   offer_price           int64  
 5   goods_price           float64
 6   goods_category_id     float64
 7   target                int64  
 8   id                    object 
 9   a                     float64
 10  b                     object 
 11  c                     object 
 12  d                     object 
 13  e                     object 
 14  f                     object 
 15  g                     object 
 16  h                     int32  
 17  i                     object 
 18  j                     object 
 19  k                     object 
 20  l                     object 
 21  m                     float64
 22  n                     object 
 23  o      

In [92]:
test = test.merge(df_offer_title_embed_items, how ='left', left_on=['offer_depersanalised'], right_on=['t'])

In [93]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363835 entries, 0 to 363834
Data columns (total 32 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   offer_depersanalised  363835 non-null  int64  
 1   goods_depersanalised  363835 non-null  int64  
 2   sum_length            363835 non-null  int64  
 3   attrs+title_score     363835 non-null  float64
 4   offer_price           363835 non-null  int64  
 5   goods_price           304864 non-null  float64
 6   goods_category_id     363704 non-null  float64
 7   id                    363835 non-null  object 
 8   a                     95265 non-null   float64
 9   b                     95265 non-null   object 
 10  c                     95265 non-null   object 
 11  d                     95265 non-null   object 
 12  e                     95265 non-null   object 
 13  f                     95265 non-null   object 
 14  g                     95265 non-null   object 
 15  

Датасеты объединены с векторами. Удалим колонки, дублирующие offer_depersanalised и goods_depersanalised, по которым присоединяли векторы.

In [94]:
train = train.drop(['a','h','m','t'], axis=1)

In [95]:
test = test.drop(['a','h','m','t'], axis=1)

Облегчим датафремы.

In [96]:
for column in ['attrs+title_score','offer_price','goods_price','goods_category_id','b', 'c', 'd', 'e', 'f', 'g', 'i', 'j', 'k', 'l','n', 'o', 'p', 'q','r','s','v', 'w', 'x', 'y']:
     train[column] = pd.to_numeric(train[column], downcast='float')
     test[column] = pd.to_numeric(test[column], downcast='float')

In [97]:
for column in ['offer_depersanalised','goods_depersanalised','sum_length']:
     train[column] = pd.to_numeric(train[column], downcast='integer')
     test[column] = pd.to_numeric(test[column], downcast='integer')

In [98]:
train['target'] = pd.to_numeric(train['target'], downcast='integer')

In [99]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518441 entries, 0 to 2518440
Data columns (total 29 columns):
 #   Column                Dtype  
---  ------                -----  
 0   offer_depersanalised  int32  
 1   goods_depersanalised  int32  
 2   sum_length            int16  
 3   attrs+title_score     float32
 4   offer_price           float64
 5   goods_price           float64
 6   goods_category_id     float64
 7   target                int8   
 8   id                    object 
 9   b                     float32
 10  c                     float32
 11  d                     float32
 12  e                     float32
 13  f                     float32
 14  g                     float32
 15  i                     float32
 16  j                     float32
 17  k                     float32
 18  l                     float32
 19  n                     float32
 20  o                     float32
 21  p                     float32
 22  q                     float32
 23  r      

In [100]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363835 entries, 0 to 363834
Data columns (total 28 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   offer_depersanalised  363835 non-null  int32  
 1   goods_depersanalised  363835 non-null  int32  
 2   sum_length            363835 non-null  int16  
 3   attrs+title_score     363835 non-null  float32
 4   offer_price           363835 non-null  float64
 5   goods_price           304864 non-null  float64
 6   goods_category_id     363704 non-null  float64
 7   id                    363835 non-null  object 
 8   b                     95265 non-null   float32
 9   c                     95265 non-null   float32
 10  d                     95265 non-null   float32
 11  e                     95265 non-null   float32
 12  f                     95265 non-null   float32
 13  g                     95265 non-null   float32
 14  i                     363835 non-null  float32
 15  

# Расчет Евклидова расстояния для пар предложений и товаров маркетплейса для изображений и описаний. 

Создадим вспомогательный датафрейм для изображений, в котором отсутствуют пропуски в векторах предложений и товаров маркетплейса.

In [101]:
train2= train.dropna(subset=['b', 'c', 'd', 'e', 'f', 'g','n', 'o', 'p', 'q', 'r','s'])

Аналогично для описаний.

In [102]:
train3= train.dropna(subset=['i', 'j', 'k', 'l','v', 'w', 'x', 'y'])

Аналогично для тестового датафрейма.

In [103]:
test2= test.dropna(subset=['b', 'c', 'd', 'e', 'f', 'g','n', 'o', 'p', 'q', 'r','s'])

In [104]:
test3= test.dropna(subset=['i', 'j', 'k', 'l','v', 'w', 'x', 'y'])

Создадим функции для расчета Евклидова расстояния для изображений.

In [105]:
def dist (row):
  vector_1 = np.array(row[['b', 'c', 'd', 'e', 'f', 'g',]])
  vector_2 = np.array(row[['n', 'o', 'p', 'q', 'r', 's',]])
  dist = distance.euclidean(vector_1, vector_2)
  return dist


Аналогично для описаний.

In [106]:
def dist_title (row):
  vector_1 = np.array(row[['i', 'j', 'k', 'l']])
  vector_2 = np.array(row[['v', 'w', 'x', 'y']])
  dist = distance.euclidean(vector_1, vector_2)
  return dist

Рассчитаем Евклидово расстояние для изображений и добавим во вспомогательный датафрейм.

In [107]:
%%time
train2['dist'] = train2.apply(dist, axis=1)

CPU times: user 7min 45s, sys: 454 ms, total: 7min 46s
Wall time: 7min 46s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Добавим расчет к обучающему датафрему.

In [108]:
train = train.merge(train2[['dist','offer_depersanalised','goods_depersanalised','id','goods_category_id']], how ='left', left_on=['offer_depersanalised','goods_depersanalised','id','goods_category_id'], right_on=['offer_depersanalised','goods_depersanalised','id','goods_category_id'])

Выполним расчет для описаний и добавим результат в обучающую выборку.

In [109]:
%%time
train3['dist_title'] = train3.apply(dist_title, axis=1)

CPU times: user 35min 41s, sys: 3.21 s, total: 35min 44s
Wall time: 35min 46s


In [110]:
train = train.merge(train3[['dist_title','offer_depersanalised','goods_depersanalised','id','goods_category_id']], how ='left', left_on=['offer_depersanalised','goods_depersanalised','id','goods_category_id'], right_on=['offer_depersanalised','goods_depersanalised','id','goods_category_id'])

In [111]:
train.shape

(2518441, 31)

In [112]:
train.head()

Unnamed: 0,offer_depersanalised,goods_depersanalised,sum_length,attrs+title_score,offer_price,goods_price,goods_category_id,target,id,b,...,p,q,r,s,v,w,x,y,dist,dist_title
0,295140,1396793,37,0.027267,1070.0,,14.0,0,295140$1396793,,...,4.576931,3.014192,0.955101,1.020673,0.113721,-0.040854,0.12771,-0.103195,,0.233474
1,65291,1396586,38,0.050415,698.0,,14.0,0,65291$1396586,,...,,,,,0.376264,0.171694,-0.008931,-0.125031,,0.458888
2,39232,1396244,38,0.08728,837.0,,14.0,0,39232$1396244,,...,,,,,0.132544,0.092304,-0.015373,0.034419,,0.208374
3,39232,1396513,38,0.08728,837.0,,14.0,0,39232$1396513,,...,,,,,0.132544,0.092304,-0.015373,0.034419,,0.202754
4,65052,1396237,38,0.079773,1085.0,,14.0,0,65052$1396237,,...,,,,,0.091361,-0.035699,0.125365,-0.092346,,0.143748


Выполним те же операции с тестовым датафреймом.

In [113]:
%%time
test2['dist'] = test2.apply(dist, axis=1)

CPU times: user 1min 6s, sys: 75.9 ms, total: 1min 6s
Wall time: 1min 6s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [114]:
test = test.merge(test2[['dist','offer_depersanalised','goods_depersanalised','id','goods_category_id']], how ='left', left_on=['offer_depersanalised','goods_depersanalised','id','goods_category_id'], right_on=['offer_depersanalised','goods_depersanalised','id','goods_category_id'])

In [115]:
%%time
test3['dist_title'] = test3.apply(dist_title, axis=1)

CPU times: user 5min 14s, sys: 324 ms, total: 5min 14s
Wall time: 5min 14s


In [116]:
test = test.merge(test3[['dist_title','offer_depersanalised','goods_depersanalised','id','goods_category_id']], how ='left', left_on=['offer_depersanalised','goods_depersanalised','id','goods_category_id'], right_on=['offer_depersanalised','goods_depersanalised','id','goods_category_id'])

In [117]:
test.shape

(363835, 30)

In [118]:
test.head()

Unnamed: 0,offer_depersanalised,goods_depersanalised,sum_length,attrs+title_score,offer_price,goods_price,goods_category_id,id,b,c,...,p,q,r,s,v,w,x,y,dist,dist_title
0,64819,1396468,38,0.046997,368.0,,14.0,64819$1396468,,,...,,,,,0.046045,-0.059184,0.077182,-0.045185,,0.184268
1,64819,1396235,38,0.046997,368.0,,14.0,64819$1396235,,,...,,,,,0.046045,-0.059184,0.077182,-0.045185,,0.088493
2,64819,1396318,38,0.046997,368.0,,14.0,64819$1396318,,,...,,,,,0.046045,-0.059184,0.077182,-0.045185,,0.098728
3,359959,1396281,40,0.060211,634.0,,14.0,359959$1396281,6.765513,3.636262,...,0.171811,1.666916,-2.558295,-0.094631,0.242097,0.057789,-0.11801,0.071218,10.049448,0.32936
4,142700,717657,40,0.00037,14924.0,31840.0,2.0,142700$717657,,,...,-1.414108,4.128166,-5.841728,7.184959,0.259657,-0.179213,0.029306,-0.125027,,0.397381


Датафремы готовы. Для удобства работы Ml часть выполнена во второй части проекта.

In [119]:
train.to_csv('train_distance.csv', index= False)

In [120]:
test.to_csv('test_distance.csv', index= False)