In [20]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import make_column_transformer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, ConcatDataset
from torch import optim
from torch.optim.lr_scheduler import StepLR
from torchvision import transforms
from torchvision.models import resnet18


import nltk
from nltk.corpus import stopwords

import os

In [21]:
# Folder that holds every CSV this notebook reads, resolved against the
# kernel's current working directory.
data_root = os.path.join(
    os.getcwd(),
    'datasets',
)

In [22]:
# Read the three input tables from data_root, in the same order as before:
# per-user engineered features, then the train and test ad listings.
engineered_features, train, test = (
    pd.read_csv(os.path.join(data_root, csv_name))
    for csv_name in ('engineered_features.csv', 'train.csv', 'test.csv')
)

In [23]:
# Attach the per-user engineered features to every ad (left join keeps all ads,
# users without features get NaN).  validate='many_to_one' makes pandas raise a
# MergeError if engineered_features holds more than one row per user_id, which
# would otherwise silently duplicate ads and change the row count of train/test.
train = train.merge(engineered_features, on='user_id', how='left', validate='many_to_one')
test = test.merge(engineered_features, on='user_id', how='left', validate='many_to_one')

In [24]:
# Names of the engineered (aggregate) feature columns: everything in
# engineered_features except the 'user_id' join key.  The key is excluded by
# name rather than by position so the result does not depend on the CSV's
# column order.
agg_cols = [col for col in engineered_features.columns if col != 'user_id']

In [25]:
# Columns handled as categorical: missing values are filled with '' and the
# columns are one-hot encoded in the preprocessing cell.
categorical = [
    'image_top_1',
    'param_1',
    'param_2',
    'param_3',
    'city',
    'region',
    'category_name',
    'parent_category_name',
    'user_type',
]

In [26]:
# Preview the first five rows of the merged training frame.
train.head(n=5)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,...,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability,avg_days_up_user,avg_times_up_user,n_user_items
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),...,400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789,8.0,2.0,2
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,...,3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0,,,1
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,...,4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177,4.428571,1.142857,9
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,...,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323,16.714286,2.642857,32
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",...,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797,,,1


In [28]:
# Print the column list with non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1503424 entries, 0 to 1503423
Data columns (total 28 columns):
item_id                         1503424 non-null object
user_id                         1503424 non-null object
region                          1503424 non-null object
city                            1503424 non-null object
parent_category_name            1503424 non-null object
category_name                   1503424 non-null object
param_1                         1503424 non-null object
param_2                         1503424 non-null object
param_3                         1503424 non-null object
title                           1503424 non-null object
description                     1503424 non-null object
price                           1418062 non-null float64
item_seq_number                 1503424 non-null int64
activation_date                 1503424 non-null object
user_type                       1503424 non-null object
image                           1390836 non-nul

In [30]:
def _clean_and_engineer(df):
    """Fill missing values and add date/text features to ``df`` in place.

    Reads the notebook-level ``agg_cols`` and ``categorical`` lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``description``, ``title``, ``price``, ``image`` and
        ``activation_date`` plus every column named in ``agg_cols`` and
        ``categorical``.

    Returns
    -------
    None
        ``df`` is modified in place; the cell can be re-run safely because
        every step overwrites its own output column.
    """
    # Fill missing text and force an element-wise str dtype.  Series.to_string()
    # must not be used here: it renders the *whole column* as one string, which
    # would then be broadcast to every row.
    df['description'] = df['description'].fillna('unknowndescription').astype(str)
    df['title'] = df['title'].fillna('unknowntitle').astype(str)

    # Assign the filled column back instead of fillna(inplace=True) on a
    # selected column, which is not guaranteed to write through to the frame.
    df['price'] = df['price'].fillna(df['price'].mean())
    df['image'] = df['image'].fillna('noimage')

    # -1 marks "no aggregate available for this user".
    for col in agg_cols:
        df[col] = df[col].fillna(-1)

    # Cast to str after filling so numeric categoricals (e.g. image_top_1) do
    # not become a mix of floats and '' that OneHotEncoder rejects.
    for col in categorical:
        df[col] = df[col].fillna('').astype(str)

    # Day of week (Monday=0 ... Sunday=6); .dt.day would give the day of month.
    df['weekday'] = pd.to_datetime(df['activation_date']).dt.weekday.fillna(0)

    # Count words and unique words per text field, tokenising each field once.
    for col in ['description', 'title']:
        tokens = df[col].str.split()
        df['num_words_' + col] = tokens.str.len()
        df['num_unique_words_' + col] = tokens.apply(lambda words: len(set(words)))

        # Percentage of unique words among all words.  A text with no tokens
        # would divide 0 by 0, so its ratio is defined as 0.
        word_counts = df['num_words_' + col].replace(0, np.nan)
        df['words_vs_unique_' + col] = (
            df['num_unique_words_' + col] / word_counts * 100
        ).fillna(0)


for df in [train, test]:
    print(df.shape)
    _clean_and_engineer(df)
    print(df.shape)

# Numeric columns forwarded unchanged to the model matrix.  They are listed
# explicitly because remainder='passthrough' would also forward raw string
# columns (ids, dates, image hashes) and the target column present only in train.
text_stat_cols = [
    prefix + col
    for col in ('description', 'title')
    for prefix in ('num_words_', 'num_unique_words_', 'words_vs_unique_')
]
numeric_cols = ['price', 'item_seq_number', 'weekday'] + list(agg_cols) + text_stat_cols

# Loaded once; requires the NLTK 'stopwords' corpus to be available locally.
russian_stop_words = stopwords.words('russian')


def _text_vectorizer():
    """Return a fresh, unfitted TF-IDF vectorizer for one text column."""
    return TfidfVectorizer(stop_words=russian_stop_words,
                           lowercase=True, ngram_range=(1, 2),
                           max_features=15000)


# Text vectorizers need 1-D input, so each text column gets its own vectorizer
# selected by a scalar column name (a list selector would pass a 2-D frame).
trans = make_column_transformer(
    (_text_vectorizer(), 'title'),
    (_text_vectorizer(), 'description'),
    # Categories that occur only in test are encoded as all-zeros, not an error.
    (OneHotEncoder(handle_unknown='ignore'), categorical),
    ('passthrough', numeric_cols),
    remainder='drop',
)

# Fit vocabulary / categories on train only and reuse them for test so both
# matrices share the same columns; keep the results instead of discarding them.
train_features = trans.fit_transform(train)
test_features = trans.transform(test)
print(train_features.shape, test_features.shape)

(1503424, 28)


KeyboardInterrupt: 

In [None]:
# Column name stored as the prediction target (presumably the training label;
# its use is outside this chunk).
target = 'deal_probability'