In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
import re
import time

# List the files available in the input directory (shells out to `ls`).
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
import os

# Modelling dependencies: gradient boosting, label encoding, transformer base class.
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.base import TransformerMixin
# Wall-clock start; the final cell reports total elapsed minutes from this.
start_time = time.time()

In [None]:
# Load the tab-separated train and test sets from the input directory.
train = pd.read_csv('../input/train.tsv', sep='\t', encoding='utf-8')
test = pd.read_csv('../input/test.tsv', sep='\t', encoding='utf-8')
# The sample submission is a .csv file; parsing it with sep='\t' would collapse
# every row into a single column, so read it with the default comma delimiter.
# NOTE(review): assumes the file is comma-delimited as its extension indicates.
sample = pd.read_csv('../input/sample_submission.csv', encoding='utf-8')

In [None]:
class DataFrameImputer(TransformerMixin):
    """Fill missing values in a DataFrame, one fill value per column.

    Columns of object dtype are filled with their most frequent value;
    every other column is filled with its median. ``fit_transform`` is not
    defined here and is provided by ``TransformerMixin``.
    """

    def fit(self, X, y=None):
        """Compute the per-column fill values from ``X``.

        Args:
            X: DataFrame whose columns supply the fill statistics.
            y: Ignored; accepted for transformer-API compatibility.

        Returns:
            self, with ``self.fill`` set to a Series indexed by ``X.columns``.
        """
        self.fill = pd.Series(
            [self._fill_value(X[c]) for c in X], index=X.columns)
        return self

    @staticmethod
    def _fill_value(column):
        """Return the mode of an object column, else the column median."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # An all-NaN object column has no mode; indexing [0] would raise
            # IndexError, so leave such a column unfilled (NaN) instead.
            return counts.index[0] if len(counts) else np.nan
        return column.median()

    def transform(self, X, y=None):
        """Return a copy of ``X`` with NaNs replaced by the fitted fill values."""
        return X.fillna(self.fill)

In [None]:
# Log-transformed price, log(price + 1); log1p is the numerically stable form.
train["logprice"] = np.log1p(train["price"])

# Apply the same category handling to both frames so they cannot drift apart.
for frame in (train, test):
    # Rows with a missing category_name are merged into "Other/Other/Other".
    frame["category_name"] = frame["category_name"].fillna("Other/Other/Other")

    # 1st-level ("A") and 2nd-level ("A/B") labels taken from "A/B/C" names;
    # names with fewer than three '/'-separated levels yield NaN.
    # expand=False pins a Series result regardless of the pandas version's
    # default (newer versions return a one-column DataFrame otherwise).
    frame["1st_category"] = frame["category_name"].str.extract(
        '([^/]+)/[^/]+/[^/]+', expand=False)
    frame["2nd_category"] = frame["category_name"].str.extract(
        '([^/]+/[^/]+)/[^/]+', expand=False)

In [None]:
# Text-valued features that must be label-encoded before modelling.
nonnumeric_columns = ['2nd_category', 'brand_name']

# Full set of model input columns, in the order they are fed to the model.
feature_columns_to_use = [
    'item_condition_id',
    '2nd_category',
    'brand_name',
    'shipping',
]

In [None]:
# Stack train and test features so the fill values are computed over both.
# pd.concat replaces DataFrame.append, which was deprecated and then removed
# in pandas 2.0. Row labels are preserved (no ignore_index), matching what
# append produced, so the later positional split and the index-aligned price
# assignment behave as before.
big_X = pd.concat([train[feature_columns_to_use], test[feature_columns_to_use]])
big_X_imputed = DataFrameImputer().fit_transform(big_X)

In [None]:
# Replace each text-valued feature with integer codes, fitted on the combined
# train+test frame. The one encoder is refit for every column, so after the
# loop `le` only reflects the last column processed.
le = LabelEncoder()
for feature in nonnumeric_columns:
    encoded_values = le.fit_transform(big_X_imputed[feature])
    big_X_imputed[feature] = encoded_values

big_X_imputed.head(3)

In [None]:
# Split the combined frame back into its train and test parts by position:
# the first len(train) rows came from train, the remainder from test.
n_train = train.shape[0]
# .copy() so adding the price column below writes to an independent frame
# rather than to a slice of big_X_imputed (avoids SettingWithCopyWarning).
trainX = big_X_imputed.iloc[:n_train].copy()
testX = big_X_imputed.iloc[n_train:]

# Attach the target (aligned on the row index shared with `train`).
trainX["price"] = train["price"]

# Shuffle, then hold out the last 25% of rows for validation. The seed makes
# the split reproducible across runs; iloc slicing replaces np.split, which
# goes through a deprecated DataFrame code path on recent pandas.
SPLIT_SEED = 42
shuffled = trainX.sample(frac=1, random_state=SPLIT_SEED)
n_fit = int(.75 * shuffled.shape[0])
trainX, validX = shuffled.iloc[:n_fit], shuffled.iloc[n_fit:]

# Every column except the ignored ones is used as a model feature.
c_ignors = ['price', 'train']
col = [c for c in trainX.columns if c not in c_ignors]

In [None]:
# Wrap the training and validation folds as DMatrix objects,
# with the raw price column as the regression label.
dtrain = xgb.DMatrix(trainX[col], label=trainX['price'])
dvalid = xgb.DMatrix(validX[col], label=validX['price'])

In [None]:
# Booster hyper-parameters (RMSE on the raw price, histogram tree method).
params = {
    'min_child_weight': 20,
    'eta': 0.015,
    'colsample_bytree': 0.48,
    'max_depth': 14,
    'subsample': 0.91,
    'lambda': 2.01,
    'nthread': 4,
    'booster': 'gbtree',
    'silent': 1,
    'eval_metric': 'rmse',
    'objective': 'reg:linear',
    'tree_method': 'hist',
}

# Datasets whose metric is reported during training.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Up to 1000 boosting rounds, logging every 10th, with early stopping set to 20 rounds.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    verbose_eval=10,
    early_stopping_rounds=20,
)


In [None]:
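# Disabled alternative for writing the submission (expects a `predictions`
# array); the following cell writes output.csv instead.
#submission = pd.DataFrame({ 'test_id': test['test_id'], 'price': predictions })
#submission.to_csv("submission.csv", index=False)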

In [None]:
# Predict test prices using only the trees up to the best early-stopping round.
dtest = xgb.DMatrix(testX[col])
if hasattr(model, 'best_ntree_limit'):
    # Older XGBoost releases expose best_ntree_limit / ntree_limit.
    predicted_price = model.predict(dtest, ntree_limit=model.best_ntree_limit)
else:
    # XGBoost 2.0 removed both; iteration_range is the replacement and its
    # upper bound is exclusive, hence best_iteration + 1.
    predicted_price = model.predict(
        dtest, iteration_range=(0, model.best_iteration + 1))

# A price cannot be negative: floor the predictions at zero.
test['price'] = np.maximum(predicted_price, 0)
test['test_id'] = test['test_id'].astype(int)
test[['test_id', 'price']].to_csv("output.csv", index = False)
print("Finished ...")
# Elapsed wall-clock time in minutes since start_time was recorded.
tt = (time.time() - start_time)/60
print("Total time %s min" % tt)