# Avito demand prediction challenge

Avito is one of Russia’s largest classified advertisements website.
The idea of this challenge is to predict the demand for a product.

<h3>Details:</h3>
<p>In their fourth Kaggle competition, Avito is challenging you to predict demand for an online advertisement based on its full description (title, description, images, etc.), its context (geographically where it was posted, similar ads already posted) and historical demand for similar ads in similar contexts. With this information, Avito can inform sellers on how to best optimize their listing and provide some indication of how much interest they should realistically expect to receive.
</p>
<h3>Link:</h3>
<p>Full description of the challenge is available here:</p>
<a href="https://www.kaggle.com/c/avito-demand-prediction/overview" target="_blank">https://www.kaggle.com/c/avito-demand-prediction/overview</a>


## Imports

In [36]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import time
# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# Gradient Boosting
import lightgbm as lgb
# Tf-Idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords
# Viz
import seaborn as sns
import matplotlib.pyplot as plt
import string
import sys

In [37]:
try:
    ## Occasionally (dev purpose only)
    sys.path.insert(0, "../..")
    import aisimplekit
except ModuleNotFoundError as err:
    print("""[err] {err}""".format(err=err))
    print("""Try: `pip install aisimplekit`""")

In [38]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
from aisimplekit.features.stats import *
from aisimplekit.utils.memory import reduce_mem_usage
from aisimplekit.cv.cv_kfold import cross_validate
import aisimplekit.features.tfidf as tfidf

## Helper: Loading data files

In [40]:
def load_data(gp, downsample_ratio=None, folder='../input/avito-demand-prediction'):
    """ """
    print('Loading train/test')
    train = pd.read_csv(folder+'/train.csv', index_col = "item_id", parse_dates = ["activation_date"])
    test = pd.read_csv(folder+'/test.csv', index_col = "item_id", parse_dates = ["activation_date"])
    train_index = train.index
    test_index = test.index

    if downsample_ratio is not None:
        print('Downsampling: %s' % downsample_ratio)
        assert downsample_ratio > 1.0
        from sklearn.utils import resample
        train = resample(train, replace=False, n_samples=int(len(train)/downsample_ratio), random_state=123)
        test = resample(test, replace=False, n_samples=int(len(test)/downsample_ratio), random_state=123)
        train_index = train.index
        test_index = test.index

    if gp is not None:
        print('Merging train/gp and test/gp')
        train = train.reset_index().merge(gp, on='user_id', how='left').set_index('item_id')
        test = test.reset_index().merge(gp, on='user_id', how='left').set_index('item_id')

    return (train, test, train_index, test_index)

## Helpers: Feature Extraction: User aggregated metrics

In [41]:
def compute_aggregated_metrics(folder='../input/avito-demand-prediction', save_features=False):
    """ Computes 3 features:
    - `avg_times_up_user` - how often the average item of the user has been put up for sale.
    - `avg_days_up_user` - the average number of days an item from the user has been put up for sale.
    - `n_user_items` - the number of items the user has put up for sale.
    """
    print('1/6 Loading data')
    used_cols = ['item_id', 'user_id', 'price']
    train = pd.read_csv(folder+'/train.csv', usecols=used_cols)
    train_active = pd.read_csv(folder+'/train_active.csv', usecols=used_cols)
    test = pd.read_csv(folder+'/test.csv', usecols=used_cols)
    test_active = pd.read_csv(folder+'/test_active.csv', usecols=used_cols)

    print('2.1/6 Building concatenated dataframe: all_samples')
    all_samples = pd.concat([train,train_active,test,test_active]).reset_index(drop=True)
    all_samples.drop_duplicates(['item_id'], inplace=True)
    del train_active; del test_active; gc.collect()

    print('2.2/Aggregating price by user')
    gp2 = None
    gp3 = None
    if False:
        gp2 = all_samples.groupby('user_id')['price'].mean()
        gp3 = all_samples.groupby('user_id')['price'].max()
    all_samples.drop(['price'], inplace=True, axis=1)
    
    ## concatenate the train and test period data to one dataframe for easier processing
    print('2.3/6 Loading/Building concatenated dataframe: all_periods')
    train_periods = pd.read_csv(folder+'/periods_train.csv', parse_dates=['date_from', 'date_to'])
    test_periods = pd.read_csv(folder+'/periods_test.csv', parse_dates=['date_from', 'date_to'])
    all_periods = pd.concat([train_periods,test_periods])
    del train_periods; del test_periods; gc.collect()

    ## Compute features
    print('3/6 Computing: days_up, days_up_sum, times_put_up')
    all_periods['days_up'] = all_periods['date_to'].dt.dayofyear - all_periods['date_from'].dt.dayofyear

    gp = all_periods.groupby(['item_id'])[['days_up']]
    gp_df = pd.DataFrame()
    gp_df['days_up_sum'] = gp.sum()['days_up']
    gp_df['times_put_up'] = gp.count()['days_up']
    gp_df.reset_index(inplace=True)
    gp_df.rename(index=str, columns={'index': 'item_id'})

    print('4/6 Merging')
    all_periods.drop_duplicates(['item_id'], inplace=True)
    all_periods = all_periods.merge(gp_df, on='item_id', how='left')
    del gp; del gp_df; gc.collect()

    ## We have an interesting but kind of useless feature now. As seen in the second venn diagram, there is no overlap at all between `train_active` (and with that `train_periods`) and `train` concerning *item* IDs.
    ## For the feature to become useful, we somehow have to associate an item ID with a user ID.
    all_periods = all_periods.merge(all_samples, on='item_id', how='left')

    print('5/6 Computing metric 1 and 2')
    ## Group items: Metrics 1/3 and 2/3
    gp = all_periods.groupby(['user_id'])[['days_up_sum', 'times_put_up']].mean().reset_index() \
        .rename(index=str, columns={'days_up_sum': 'avg_days_up_user', 'times_put_up': 'avg_times_up_user'})
    ## Metric 3/3
    ## For our last feature, `n_user_items`, we just group by user ID and count the number of items.
    ## We have to be careful to use `all_samples` instead of `all_periods` here because the latter does not contain the `train.csv` and `test.csv` samples.
    print('6/6 Computing metric 3')
    n_user_items = all_samples.groupby(['user_id'])[['item_id']].count().reset_index() \
        .rename(index=str, columns={'item_id': 'n_user_items'})
    gp = gp.merge(n_user_items, on='user_id', how='outer')

    ## Save the features
    if save_features is True:
        gp.to_csv('aggregated_features.csv', index=False)
    ## Cleanup
    del all_samples; del all_periods; del train; del test; gc.collect()
    return (gp, gp2, gp3)

## Helpers: Feature Extraction:
### User statistic features
### Category statistic features
### Statistics about geography counts, image counts.

In [42]:
def add_user_features(df):
    ## Q2 - How many posts per user, divided by posts from his (region or city) ?
    df = do_count(df, ['region', 'city'], 'X3', show_max=True);
    df = do_count(df, ['user_id'], 'X4', show_max=True)
    df['X5'] = df['X4']/df['X3']; df.drop(['X3','X4'],axis=1,inplace=True) # X5: 
    """ Q3 - How many posts having a description (or image or title or ...), divided by user's total posts ? """
    df = do_count(df, ['user_id'], 'X6', show_max=True)
    df = do_countuniq(df, ['user_id'], 'image_top_1', 'X7', show_max=True)
    df['X8'] = df['X7']/df['X6']; df.drop(['X7','X6'],axis=1,inplace=True) #

    cols = ['description_num_chars', 'description_num_words', 'description_num_unique_words', 'description_words_vs_unique',
            'title_num_chars', 'title_num_words', 'title_num_unique_words', 'title_words_vs_unique']
    for col in cols:
        df = do_mean(df, ['user_id'], col, 'mean_user_%s'%col, show_max=True)
        df = do_mean(df, ['category_name'], col, 'mean_category_%s'%col, show_max=True)
        df['ratio_mean_user-cat_%s'%col] = df['mean_user_%s'%col]/df['mean_category_%s'%col]
        df.drop(['mean_user_%s'%col, 'mean_category_%s'%col],axis=1,inplace=True)
    return df   ## Unique counts, Means; Variances, Min/max/Median ; Top-ranked (categorical); First/Last .. ; Previous/Next

def add_category_features(df):
    df['price_rank'] = df.groupby(['category_name'])['price'].rank(ascending=True) # 0.229789
    return df

def add_other_features(df):
    if True:
        df = do_count(df, ['region'], 'T0', show_max=True)
        df = do_count(df, ['city'], 'T1', show_max=True)
        df['T2'] = df['T1']/df['T0']; df.drop(['T0','T1'],axis=1,inplace=True)
    if True:
        df = do_count(df, ['image_top_1'], 'T5', show_max=True) # 0.229818
        df = do_count(df, ['image_top_1', 'category_name'], 'T8', show_max=True) # 0.229776
    return df

def add_stat_features(df):
    df = add_user_features(df)
    df = add_category_features(df)
    df = add_other_features(df)
    return df

## Common settings

In [43]:
gp = None
DEV = True
VALID = False # helps to find the best num_rounds !
n_rounds = 2401 # identified during validation, WARNING: set early_stopping_rounds to 50 !
#data_dir = '../input/avito-demand-prediction'
data_dir = '~/.kaggle/competitions/avito-demand-prediction/'

USE_IMAGE_FEATURES = False 

## Dev Settings

In [44]:
optimize_memory = False
max_features = 8000 # TF-IDF: Counts

## Dev settings
early_stopping_rounds = 15 # 50
learning_rate = 0.05 # 0.019
num_leaves = 128
downsample_ratio = 8.0

with_user_agg = False
with_tfidf = True

main_model = 'lgb' # 'lgb' or 'dnn'
analyzer = 'word' # 'word' or 'char'

## Submit Settings

In [45]:
if DEV is False:
    ## Same settings for VALID and SUBMIT
    early_stopping_rounds = 50
    learning_rate = 0.019
    num_leaves = 250
    downsample_ratio = None

    with_user_agg = True

    if main_model == 'dnn':
        with_tfidf = False
        
    optimize_memory = True
    max_features = None
    sub_filename = "lgb-base-2.csv"

## Loading (or computing): user aggregated features

In [46]:
%%time
gp = None
gp2 = None
gp3 = None
agg_cols = []

if gp is not None:
    print('Reusing gp.')
elif with_user_agg is True:
    (gp, gp2, gp3) = compute_aggregated_metrics(folder=data_dir, save_features=False)
    agg_cols = list(gp.columns)[1:]
else:
    print('Not using aggregated metrics.')

Not using aggregated metrics.
CPU times: user 407 µs, sys: 37 µs, total: 444 µs
Wall time: 356 µs


## Load Train/Test Data

In [47]:
%%time
(training, testing, traindex, testdex) = load_data(gp, downsample_ratio=downsample_ratio, folder=data_dir)

if gp is not None and optimize_memory is True:
    del(gp)
    gc.collect()

y = training.deal_probability.copy().clip(0.0, 1.0)
training.drop("deal_probability",axis=1, inplace=True)
print('Train shape: {} Rows, {} Columns'.format(*training.shape))
print('Test shape: {} Rows, {} Columns'.format(*testing.shape))
print("Combining Train and Test")

df = pd.concat([training,testing],axis=0)
del training, testing
gc.collect()
print('Final dataframe shape: {} Rows, {} Columns'.format(*df.shape))

Loading train/test
Downsampling: 8.0
Train shape: 187928 Rows, 16 Columns
Test shape: 63554 Rows, 16 Columns
Combining Train and Test
Final dataframe shape: 251482 Rows, 16 Columns
CPU times: user 35.1 s, sys: 5.28 s, total: 40.4 s
Wall time: 1min 1s


## Load pre-computed image features

In [48]:
# FIXME !
if USE_IMAGE_FEATURES is True:
    df_img = pd.read_csv('../input/trainimgfeatv2/train_imgfeat_v2.csv')
    df_img = df_img.rename(columns={'Unnamed: 0.1': 'image'})
    df_img = df_img.drop('Unnamed: 0', axis=1)
    df_img['image'] = df_img['image'].apply(lambda x: x.rstrip('.jpg'))
    df_img['image'] = df_img['image'].apply(str)
    df = df.reset_index().merge(df_img, on='image', how='left').set_index('item_id')
    for col in df_img.columns:
        df[col].fillna(-1.0, inplace=True)
    del(df_img)
    gc.collect()

## [1/4] Feature Engineering: Simple transformer features (np.log, dates) + encoding.

In [49]:
%%time
print("Feature Engineering")
df["price"] = np.log(df["price"]+0.001)
df["price"].fillna(-999,inplace=True)
df["image_top_1"].fillna(-999,inplace=True)

print("Creating Time Variables..")
df["Weekday"] = df['activation_date'].dt.weekday
df["Weekd of Year"] = df['activation_date'].dt.week
df["Day of Month"] = df['activation_date'].dt.day

# Create Validation Index and Remove Dead Variables
# training_index = df.loc[df.activation_date<=pd.to_datetime('2017-04-07')].index
# validation_index = df.loc[df.activation_date>=pd.to_datetime('2017-04-08')].index
df.drop(["activation_date", "image"],axis=1,inplace=True)

print("Encoding categorical variables")
categorical = ["user_id", "region", "city", "parent_category_name", "category_name",
               "user_type", "image_top_1", "param_1", "param_2", "param_3"]
lbl = preprocessing.LabelEncoder()
for col in categorical:
    df[col] = lbl.fit_transform(df[col].astype(str))

Feature Engineering
Creating Time Variables..
Encoding categorical variables
CPU times: user 3.54 s, sys: 63.2 ms, total: 3.6 s
Wall time: 3.83 s


## [2/4] Feature Engineering: Simple Text Features

In [50]:
%%time
# Meta Text Features
textfeats = ["description", "title"]
count = lambda l1,l2: sum([1 for x in l1 if x in l2])

for cols in textfeats:
    df[cols] = df[cols].astype(str) 
    df[cols] = df[cols].astype(str).fillna('missing') # FILL NA
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100 # Count Unique Words
    df[cols + '_num_chars'] = df[cols].apply(len) # Count number of Characters
    df[cols + '_num_desc_punct'] = df[cols].apply(lambda x: count(x, set(string.punctuation)))

for col in agg_cols:
    df[col].fillna(-1, inplace=True)

CPU times: user 15.7 s, sys: 25.2 ms, total: 15.7 s
Wall time: 16.1 s


## [3/4] Feature Engineering: Stats features

In [53]:
%%time
## Add statistical features
df = add_stat_features(df)

Aggregating by  ['region', 'city'] ...
X3 max value =  10743
Aggregating by  ['user_id'] ...
X4 max value =  188
Aggregating by  ['user_id'] ...
X6 max value =  188
Counting unique  image_top_1  by  ['user_id'] ...
X7 max value =  91
Calculating mean of  description_num_chars  by  ['user_id'] ...
mean_user_description_num_chars max value =  3144.0
Calculating mean of  description_num_chars  by  ['category_name'] ...
mean_category_description_num_chars max value =  810.3571428571429
Calculating mean of  description_num_words  by  ['user_id'] ...
mean_user_description_num_words max value =  597.0
Calculating mean of  description_num_words  by  ['category_name'] ...
mean_category_description_num_words max value =  113.14880952380952
Calculating mean of  description_num_unique_words  by  ['user_id'] ...
mean_user_description_num_unique_words max value =  333.0
Calculating mean of  description_num_unique_words  by  ['category_name'] ...
mean_category_description_num_unique_words max value =

## [4/4] Feature Engineering: TF-IDF Text Features

In [57]:
transformer_spec = {
    'description': {
        'vectorizer': tfidf.TRANSFORMER_TFIDF,
        'ngram_range': (1,2),
        'max_features': 17000,
        'kwargs': {} # overridable
    },
    'title': {
        'vectorizer': tfidf.TRANSFORMER_COUNT,
        'ngram_range': (1,2),
        'max_features': max_features,
        'kwargs': None # no additional named args
    }
}

In [58]:
%%time
(vectorizer, df_tfidf, tf_vocab) = tfidf.compute_features(df, transformer_spec, analyzer='word', stop="russian")
print('#vocab_size: %d' % len(tf_vocab))

#vocab_size: 25000
CPU times: user 1min 46s, sys: 1.53 s, total: 1min 47s
Wall time: 1min 49s


## Train Model: LGB

In [64]:
%%time
if main_model == 'lgb':
    # Drop Text Cols
    try:
        df.drop(textfeats, axis=1, inplace=True)
    except KeyError as e:
        print(e)

    # Reduce Memory (See function up top)
    df = reduce_mem_usage(df)

    # Combine Dense Features with Sparse Text Bag of Words Features
    print('Concatenating base features + tfidf features..')
    if df_tfidf is not None:
        X = hstack([csr_matrix(df.loc[traindex,:].values), df_tfidf[0:traindex.shape[0]]]) # Sparse Matrix
        testing = hstack([csr_matrix(df.loc[testdex,:].values), df_tfidf[traindex.shape[0]:]])
        predictors = df.columns.tolist() + tf_vocab
    else:
        X = hstack([csr_matrix(df.loc[traindex,:].values)]) # Sparse Matrix
        testing = hstack([csr_matrix(df.loc[testdex,:].values)])
        predictors = df.columns.tolist()

    del(df)
    del(df_tfidf)
    gc.collect()

    lgbm_params =  {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse', # 'max_depth': 15,
        'num_leaves': num_leaves,
        'feature_fraction': 0.50,
        'bagging_fraction': 0.70, # 'bagging_freq': 5,
        'learning_rate': learning_rate,
        'verbose': 0
    }

    # Training and Validation Set
    modelstart = time.time()
    print('Training..')
    if VALID == True:
        print('Mode: Dev with Cross-validation')
        ## CV: identify best tuning params, and features
        (scores, best_num_rounds) = cross_validate(
                                        X.tocsr(), y, folds=3, repeats=1, predictors=predictors,
                                        categorical=categorical, lgbm_params=lgbm_params,
                                        num_boost_rounds=n_rounds,
                                        early_stopping_rounds=early_stopping_rounds, verbose_eval=10
                                    )
        print(scores, best_num_rounds)
        print('Average best round: {}'.format(np.mean(best_num_rounds)))

    elif DEV == False:
        print('Mode: Submit')
        lgtrain = lgb.Dataset(X, y, feature_name=predictors, categorical_feature = categorical)
        del(X)
        gc.collect()

        lgb_clf = lgb.train(lgbm_params, lgtrain, num_boost_round=n_rounds, verbose_eval=40)
        lgpred = lgb_clf.predict(testing)

        lgsub = pd.DataFrame(lgpred,columns=["deal_probability"],index=testdex)
        lgsub['deal_probability'].clip(0.0, 1.0, inplace=True) # Between 0 and 1
        lgsub.to_csv(sub_filename,index=True,header=True)
    
    else:
        print('Mode: Dev without Cross-validation')
        ## DEV
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=23)
        lgtrain = lgb.Dataset(X_train, y_train, feature_name=predictors, categorical_feature = categorical)
        lgvalid = lgb.Dataset(X_valid, y_valid, feature_name=predictors, categorical_feature = categorical)
        del(X)
        del(X_train)
        gc.collect()

        lgb_clf = lgb.train(lgbm_params, lgtrain, num_boost_round=n_rounds, valid_sets=[lgtrain, lgvalid], valid_names=['train','valid'],
            early_stopping_rounds=early_stopping_rounds, verbose_eval=10)

        print('RMSE:', np.sqrt(metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid))))
        del(X_valid)
        gc.collect()

"['description' 'title'] not found in axis"
Memory usage of dataframe is 28.95 MB
Memory usage after optimization is: 28.95 MB
Decreased by 0.0%
Concatenating base features + tfidf features..
Training..




Training until validation scores don't improve for 15 rounds.
[10]	train's rmse: 0.241313	valid's rmse: 0.241706
[20]	train's rmse: 0.230765	valid's rmse: 0.233258
[30]	train's rmse: 0.224898	valid's rmse: 0.229519
[40]	train's rmse: 0.220819	valid's rmse: 0.22749
[50]	train's rmse: 0.217482	valid's rmse: 0.226422
[60]	train's rmse: 0.214856	valid's rmse: 0.225744
[70]	train's rmse: 0.21229	valid's rmse: 0.22546
[80]	train's rmse: 0.210086	valid's rmse: 0.225192
[90]	train's rmse: 0.208102	valid's rmse: 0.225085
[100]	train's rmse: 0.205915	valid's rmse: 0.22502
[110]	train's rmse: 0.204102	valid's rmse: 0.224936
[120]	train's rmse: 0.20253	valid's rmse: 0.224905
[130]	train's rmse: 0.201089	valid's rmse: 0.224908
Early stopping, best iteration is:
[122]	train's rmse: 0.202285	valid's rmse: 0.224891
RMSE: 0.22489107097022176
CPU times: user 13min 47s, sys: 29.6 s, total: 14min 16s
Wall time: 6min 5s
