In [1]:
# import packages

import numpy as np
import pandas as pd
import warnings
import gc
from operator import itemgetter

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')    # ignore warning

In [2]:
# Load the tab-separated training data and keep only listings whose price is
# strictly positive; the index is rebuilt so it is contiguous after filtering.

train = (
    pd.read_csv('data/mercari-price-suggestion-train.tsv', sep='\t')
    .loc[lambda frame: frame['price'] > 0]
    .reset_index(drop=True)
)

In [3]:
# Hold out one fold of a shuffled 20-fold split (seeded with 42) as the
# validation set; the remaining folds become the training set.
# NOTE: `train` is rebound here, so this cell must not be run twice in a row.

cv = KFold(n_splits=20, shuffle=True, random_state=42)
first_fold = next(cv.split(train))
train_ids, valid_ids = first_fold
valid = train.iloc[valid_ids]    # taken before `train` is narrowed below
train = train.iloc[train_ids]

In [4]:
# define preprocessing functions

def preprocess(df):
    """Build the four model input columns from a raw listings frame.

    The input frame is NOT modified, so calling this function repeatedly on
    the same frame always yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``name``, ``brand_name``,
        ``item_description``, ``category_name``, ``shipping`` and
        ``item_condition_id``.

    Returns
    -------
    pandas.DataFrame
        A new frame, indexed like ``df``, with the columns
        ``name`` (name + ' ' + brand), ``text`` (description + ' ' +
        combined name + ' ' + category), ``shipping`` and
        ``item_condition_id``. Missing text values are treated as ''.
    """
    # Combined name: item name followed by the brand.
    name = df['name'].fillna('') + ' ' + df['brand_name'].fillna('')
    # Free text: description, combined name and category path.
    text = (df['item_description'].fillna('') + ' ' + name + ' '
            + df['category_name'].fillna(''))
    # Selecting the two passthrough columns first creates a new frame, so
    # `assign` never writes back into the caller's data.
    features = df[['shipping', 'item_condition_id']].assign(name=name, text=text)
    return features[['name', 'text', 'shipping', 'item_condition_id']]

def on_field(f, *vec):
    """Return a pipeline that picks field(s) ``f`` from its input, then applies ``vec``.

    ``f`` is handed to ``operator.itemgetter``, so the first pipeline step
    evaluates ``X[f]`` on whatever is passed in; the remaining steps are the
    transformers given in ``vec``, in order.
    """
    select_field = FunctionTransformer(itemgetter(f), validate=False)
    steps = (select_field,) + vec
    return make_pipeline(*steps)

def to_records(df):
    """Convert a DataFrame into a list of ``{column: value}`` dicts, one per row."""
    row_dicts = df.to_dict(orient='records')
    return row_dicts

In [5]:
# define the feature vectorizer: a union of three per-field pipelines
#   * 'name' -> TF-IDF over word tokens (at most 100,000 features)
#   * 'text' -> TF-IDF over word uni- and bi-grams (at most 100,000 features)
#   * 'shipping' / 'item_condition_id' -> row dicts fed to a DictVectorizer
# The token patterns are raw strings so that `\w` reaches the regex engine
# unchanged and does not trigger Python's invalid-escape-sequence warning.

vectorizer = make_union(
    on_field('name', Tfidf(max_features=100000, token_pattern=r'\w+')),
    on_field('text', Tfidf(max_features=100000, token_pattern=r'\w+', ngram_range=(1, 2))),
    on_field(['shipping', 'item_condition_id'],
             FunctionTransformer(to_records, validate=False), DictVectorizer()),
    n_jobs=4)

In [6]:
# Fit the vectorizer on the training rows only, then reuse the fitted
# vocabulary for the validation rows; both matrices are cast to float32.

X_train = vectorizer.fit_transform(preprocess(train))
X_train = X_train.astype(np.float32)

X_valid = vectorizer.transform(preprocess(valid))
X_valid = X_valid.astype(np.float32)

print(f'X_train: {X_train.shape} of {X_train.dtype}')
print(f'X_valid: {X_valid.shape} of {X_valid.dtype}')

X_train: (1407577, 200002) of float32
X_valid: (74084, 200002) of float32


In [7]:
# Target: log1p(price) as a single-column array, then rescaled with a
# StandardScaler. The fitted scaler is kept so predictions can be mapped
# back to prices later.

y_scaler = StandardScaler()
log_price = np.log1p(train['price'].values.reshape(-1, 1))
y_train = y_scaler.fit_transform(log_price)

In [8]:
# delete raw train dataset to save memory
# (its features already live in X_train and its target in y_train;
#  re-running the earlier cells after this point requires reloading the data)

del train

In [9]:
# make binary (0/1) versions of the train and validation matrices:
# casting to bool maps every non-zero entry to True, and casting back to
# float32 turns that into 1.0.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.

Xb_train, Xb_valid = [x.astype(bool).astype(np.float32) for x in [X_train, X_valid]]

In [10]:
# Inputs for the ensemble: four networks, alternating between the binary
# and the original (TF-IDF weighted) matrices, i.e. two networks per variant.

nets = 4
models = [0 for _ in range(nets)]    # placeholders, replaced by trained models
xs_train = [Xb_train, X_train, Xb_train, X_train]
xs_valid = [Xb_valid, X_valid, Xb_valid, X_valid]

In [11]:
# define model (Functional API), fit, and keep it in the `models` list
#
# Every network is the same MLP (192 -> 64 -> 64 -> 1) on a sparse input and
# is trained with MSE on the scaled log-price. Each network gets three
# single-epoch passes with batch sizes 2048, 4096 and 8192 (2**(11 + j)).

for i in range(nets):
    print("=" * 50)
    print("Model %d" % (i+1))
    print("=" * 50)

    # Dedicated name for this network's input matrix, so the name `train`
    # (the raw DataFrame in earlier cells) is not rebound to a sparse matrix.
    x_train_i = xs_train[i]

    model_in = Input(shape=(x_train_i.shape[1],), dtype='float32', sparse=True)
    out = Dense(192, activation='relu')(model_in)
    out = Dense(64, activation='relu')(out)
    out = Dense(64, activation='relu')(out)
    out = Dense(1)(out)
    model = Model(model_in, out)
    # `learning_rate` replaces the deprecated `lr` keyword, which newer
    # Keras optimizers no longer accept.
    model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=3e-3))

    # One epoch per pass, doubling the batch size each time.
    for j in range(3):
        model.fit(x=x_train_i, y=y_train, batch_size=2**(11 + j), epochs=1, verbose=1)

    # The trained model is only stored in memory; nothing is written to disk.
    models[i] = model
    print("Model %d saved.\n\n" % (i+1))

Model 1
Model 1 saved.


Model 2
Model 2 saved.


Model 3
Model 3 saved.


Model 4
Model 4 saved.




In [12]:
# Validation score of the ensemble: average the networks' predictions in the
# scaled log space, undo the scaling and the log1p, then report the RMSLE
# against the true validation prices.

valid_preds = [models[i].predict(xs_valid[i])[:, 0] for i in range(nets)]
y_valid = np.mean(valid_preds, axis=0)
y_valid = y_scaler.inverse_transform(y_valid.reshape(-1, 1))[:, 0]
y_valid = np.expm1(y_valid)
valid_rmsle = np.sqrt(mean_squared_log_error(valid['price'], y_valid))
print('Valid RMSLE: {:.4f}'.format(valid_rmsle))

Valid RMSLE: 0.3873


In [13]:
# now, for the test dataset

# read the test data lazily in chunks of 1,000,000 rows so that only one
# chunk has to be vectorized and held in memory at a time
test_chunk = pd.read_csv('data/mercari-price-suggestion-test.tsv', sep='\t', chunksize=1000000)

# initialize index, prediction, and chunk number
test_ids = []
y_test = []
chunk_ids = 0

# iterate each chunk
for chunk in test_chunk:
    # free up some memory (drops the previous chunk's matrices)
    gc.collect()

    # current chunk
    chunk_ids += 1
    print('Processing chunk', chunk_ids)

    # get index
    test_ids.extend(chunk.test_id)

    # preprocess data with the vectorizer fitted on the training set
    X_test = vectorizer.transform(preprocess(chunk)).astype(np.float32)
    print(f'Test dataset chunk size: {X_test.shape} of {X_test.dtype} \n')
    # binary (0/1) variant; the builtin `bool` is used because the `np.bool`
    # alias was deprecated in NumPy 1.20 and removed in NumPy 1.24
    Xb_test = X_test.astype(bool).astype(np.float32)
    # same binary/original alternation the four networks were trained on
    xs_test = [Xb_test, X_test] * 2

    # predict: average the networks in scaled log space, then map back to prices
    y_test_chunk = np.mean([models[i].predict(xs_test[i])[:, 0] for i in range(nets)], axis=0)
    y_test_chunk = np.expm1(y_scaler.inverse_transform(y_test_chunk.reshape(-1, 1))[:, 0])
    y_test.extend(y_test_chunk)

# make submission
output = pd.DataFrame({'test_id':test_ids, 'price':y_test})
output.to_csv('submission.csv', index=False)

Processing chunk 1
Test dataset chunk size: (1000000, 200002) of float32 

Processing chunk 2
Test dataset chunk size: (1000000, 200002) of float32 

Processing chunk 3
Test dataset chunk size: (1000000, 200002) of float32 

Processing chunk 4
Test dataset chunk size: (460725, 200002) of float32 

