In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import gc
from tqdm import tqdm
import string
import os

%matplotlib inline

In [11]:
# Key columns shared by the ad tables; passed as `usecols` when the ad CSVs are read.
id_cols = ['item_id', 'user_id']

# All input CSVs are read from, and the engineered features written to,
# the `datasets` folder under the current working directory.
data_root = os.path.abspath('datasets')

Loading data

In [29]:
# Training ads: only the id columns listed in `id_cols` are read.
train = pd.read_csv(
    filepath_or_buffer=os.path.join(data_root, 'train.csv'),
    usecols=id_cols,
)

In [13]:
# Ads from train_active.csv: only the id columns listed in `id_cols` are read.
train_active = pd.read_csv(
    filepath_or_buffer=os.path.join(data_root, 'train_active.csv'),
    usecols=id_cols,
)

In [30]:
# Test ads: only the id columns listed in `id_cols` are read.
test = pd.read_csv(
    filepath_or_buffer=os.path.join(data_root, 'test.csv'),
    usecols=id_cols,
)

In [15]:
# Ads from test_active.csv: only the id columns listed in `id_cols` are read.
test_active = pd.read_csv(
    filepath_or_buffer=os.path.join(data_root, 'test_active.csv'),
    usecols=id_cols,
)

In [16]:
# Activity-period tables for train and test. Both files are read the same way,
# with `date_from` / `date_to` parsed into datetimes so date arithmetic works
# on them later.
train_periods, test_periods = (
    pd.read_csv(os.path.join(data_root, periods_file),
                parse_dates=['date_from', 'date_to'])
    for periods_file in ('periods_train.csv', 'periods_test.csv')
)

Combining datasets

In [17]:
# Stack the four ad tables into a single item_id -> user_id lookup with a fresh
# 0..n-1 index. When an item_id occurs more than once, only its first occurrence
# survives, so precedence follows the concat order:
# train, train_active, test, test_active.
all_samples = (
    pd.concat([train, train_active, test, test_active], ignore_index=True)
    .drop_duplicates(subset='item_id', keep='first')
)

# The *_active frames are not referenced again; release them.
del train_active, test_active
gc.collect()

158

In [18]:
# Stack the train and test activity periods into one table.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# input keeps its own row labels, so labels repeat across the two files and
# label-based selection (e.g. .loc[0]) would return several rows. This also
# matches how `all_samples` is built.
all_periods = pd.concat([train_periods, test_periods], ignore_index=True)

# The per-split frames are no longer needed once combined; release them.
del train_periods, test_periods
gc.collect()

all_periods.head()

Unnamed: 0,item_id,activation_date,date_from,date_to
0,8f5caef7afb0,2017-02-14,2017-03-15,2017-03-16
1,66218ff526d1,2017-02-16,2017-03-15,2017-03-18
2,b237d9539b21,2017-03-01,2017-03-15,2017-03-28
3,80bf58082ad3,2017-03-19,2017-03-19,2017-03-28
4,67a9944a7373,2017-03-14,2017-03-15,2017-03-28


Calculate the total number of days each ad item was up

In [19]:
# Length of each activity period in calendar days.
# Subtract the dates themselves, not their day-of-year numbers: a day-of-year
# difference is wrong for any period that crosses a year boundary
# (Dec 30 -> Jan 2 would give 2 - 364 = -362 instead of 3).
# normalize() drops any time-of-day part, so the result still counts calendar
# days exactly as the day-of-year difference did for same-year periods.
all_periods['days_up'] = (
    all_periods['date_to'].dt.normalize() - all_periods['date_from'].dt.normalize()
).dt.days

Calculating the average number of times a user puts up their ads and their average posting length

In [20]:
# Per-item statistics over all of an item's activity periods:
#   days_up_sum  - total days the ad was up, summed over its periods
#   times_put_up - number of periods with a non-null `days_up`
# Both are computed in a single groupby pass. reset_index() turns the
# `item_id` group key back into a regular column, so no rename is needed.
# (The previous rename call discarded its result and looked for a column
# named 'index' that never existed.)
# No groupby object is kept around, so nothing holds a reference to the
# pre-merge `all_periods` frame once it is replaced in the next step.
gp_df = (
    all_periods
    .groupby('item_id')['days_up']
    .agg(['sum', 'count'])
    .rename(columns={'sum': 'days_up_sum', 'count': 'times_put_up'})
    .reset_index()
)

gp_df.head()

Unnamed: 0,item_id,days_up_sum,times_put_up
0,00000077ff21,13,1
1,000002c54018,6,1
2,000005570503,1,1
3,0000060018e6,6,1
4,000006497719,19,2


In [21]:
# Collapse to one row per item and attach its aggregated statistics.
# drop_duplicates keeps the first row seen for each item_id, so the
# date_from / date_to / days_up columns that remain describe only that first
# period; the per-item totals come from `gp_df`.
all_periods = (
    all_periods
    .drop_duplicates(subset='item_id')
    .merge(gp_df, how='left', on='item_id')
)
all_periods.head()

Unnamed: 0,item_id,activation_date,date_from,date_to,days_up,days_up_sum,times_put_up
0,8f5caef7afb0,2017-02-14,2017-03-15,2017-03-16,1,17,4
1,66218ff526d1,2017-02-16,2017-03-15,2017-03-18,3,18,3
2,b237d9539b21,2017-03-01,2017-03-15,2017-03-28,13,19,2
3,80bf58082ad3,2017-03-19,2017-03-19,2017-03-28,9,17,4
4,67a9944a7373,2017-03-14,2017-03-15,2017-03-28,13,18,3


In [22]:
# Look up the owning user of every item. The left join keeps every item row;
# items with no match in `all_samples` get a missing user_id.
all_periods = pd.merge(all_periods, all_samples, how='left', on='item_id')
all_periods.head()

Unnamed: 0,item_id,activation_date,date_from,date_to,days_up,days_up_sum,times_put_up,user_id
0,8f5caef7afb0,2017-02-14,2017-03-15,2017-03-16,1,17,4,e292cce69842
1,66218ff526d1,2017-02-16,2017-03-15,2017-03-18,3,18,3,a326c04a24ec
2,b237d9539b21,2017-03-01,2017-03-15,2017-03-28,13,19,2,06d275498a56
3,80bf58082ad3,2017-03-19,2017-03-19,2017-03-28,9,17,4,831c8c4a622c
4,67a9944a7373,2017-03-14,2017-03-15,2017-03-28,13,18,3,248102e50d79


In [23]:
# Per-user averages over the user's items:
#   avg_days_up_user  - mean of the items' total days up
#   avg_times_up_user - mean of the number of times each item was put up
# groupby leaves out rows whose user_id is missing, so items that found no
# match in `all_samples` do not contribute.
# The rename is limited to columns. The previous `index=str` converted the
# fresh 0..n-1 index produced by reset_index() into string labels, which
# served no purpose and cost an extra pass over every row.
gp = (
    all_periods
    .groupby('user_id')[['days_up_sum', 'times_put_up']]
    .mean()
    .reset_index()
    .rename(columns={
        'days_up_sum': 'avg_days_up_user',
        'times_put_up': 'avg_times_up_user',
    })
)
gp.head()

Unnamed: 0,user_id,avg_days_up_user,avg_times_up_user
0,00000077ff21,12.5,2.0
1,000006497719,19.0,2.0
2,00000b4d72f6,3.0,1.0
3,00000d642d7e,13.0,1.0
4,0000126b80a4,12.0,1.75


Calculating the number of items each user has put up for sale

In [24]:
# Number of (de-duplicated) items per user, counted over every ad table.
# The count is named directly on the Series. The previous
# `rename(index=str, ...)` also converted the fresh 0..n-1 index into string
# labels, which served no purpose.
n_user_items = (
    all_samples
    .groupby('user_id')['item_id']
    .count()
    .rename('n_user_items')
    .reset_index()
)

# Outer join so users that have items but no activity periods are kept too;
# their averages are left missing.
gp = gp.merge(n_user_items, on='user_id', how='outer')

gp.head()

Unnamed: 0,user_id,avg_days_up_user,avg_times_up_user,n_user_items
0,00000077ff21,12.5,2.0,2
1,000006497719,19.0,2.0,1
2,00000b4d72f6,3.0,1.0,1
3,00000d642d7e,13.0,1.0,2
4,0000126b80a4,12.0,1.75,8


In [33]:
# Persist the per-user features next to the input data; the row index carries
# no information, so it is not written.
gp.to_csv(
    path_or_buf=os.path.join(data_root, 'engineered_features.csv'),
    index=False,
)