In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import wordbatch 

from datetime import datetime 
start_real = datetime.now()
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn_pandas import DataFrameMapper, cross_val_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, concatenate, GRU, Embedding, Flatten, Activation
# from keras.layers import Bidirectional
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from nltk.corpus import stopwords
import math
# set seed
np.random.seed(123)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

# Try wordbatch
# Try RNN
# Look at RMSLE Error Function
# https://www.kaggle.com/valkling/mercari-rnn-2ridge-models-with-notes-0-42755
# Combine both sets and do processing together. 

In [None]:
# TODO: decide whether rmsle should log-transform its inputs; as written it is a plain RMSE.

def rmsle(Y, Y_pred):
    """Return the root-mean-squared difference between ``Y_pred`` and ``Y``.

    No logarithm is taken here: the result is an RMSLE only when both
    arguments are already expressed in log space.

    Raises:
        AssertionError: if the two inputs do not have the same shape.
    """
    assert Y.shape == Y_pred.shape
    residuals = Y_pred - Y
    mean_squared = np.mean(np.square(residuals))
    return np.sqrt(mean_squared)

In [None]:
# Both competition files are tab-separated.
train_df, test_df = [
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../input/train.tsv', '../input/test.tsv')
]

In [None]:
# Stack train and test so feature engineering is applied to both at once.
# NOTE: ignore_index is not set, so each frame keeps its own row labels and
# the same labels occur in both the train half and the test half.
full_df = pd.concat([train_df,test_df])

In [None]:
# get name and description lengths
def wordCount(text):
    """Count the whitespace-separated words in ``text``.

    Args:
        text: the raw field value; may be a string or a non-string
            placeholder for a missing value (e.g. NaN / None).

    Returns:
        int: number of words, or 0 when ``text`` is not a string or is the
        literal placeholder 'No description yet'.
    """
    if not isinstance(text, str) or text == 'No description yet':
        return 0
    # split() without an argument collapses runs of whitespace, so repeated
    # spaces and empty strings do not inflate the count.
    return len(text.split())
# Compute the lengths from full_df itself. Taking them from train_df and
# assigning onto full_df aligns on row labels, so test rows would silently
# receive the lengths of the train rows that share their label.
full_df['desc_len'] = full_df['item_description'].apply(wordCount)
full_df['name_len'] = full_df['name'].apply(wordCount)


In [None]:
# split category name into 3 parts
def split_cat(text):
    """Split a '/'-separated category path into exactly three levels.

    Args:
        text: category string such as "Men/Tops/T-shirts", or a non-string
            placeholder for a missing category.

    Returns:
        A 3-element sequence. Paths with fewer than three levels are padded
        with "No Label"; deeper paths are truncated to their first three
        levels; non-string input yields three "No Label" entries.
    """
    if not isinstance(text, str):
        return ("No Label", "No Label", "No Label")
    levels = text.split("/")[:3]
    # Pad short paths so callers can always unpack three columns.
    levels += ["No Label"] * (3 - len(levels))
    return levels
    
# Transpose the per-row category levels into three parallel columns.
full_df['subcat_0'], full_df['subcat_1'], full_df['subcat_2'] = zip(
    *full_df['category_name'].apply(split_cat)
)

# Filling missing values
def fill_missing_values(df):
    """Replace missing text fields with the sentinel string "missing".

    The frame is modified in place and also returned, so both
    ``fill_missing_values(df)`` and ``df = fill_missing_values(df)`` work.

    Args:
        df: DataFrame with ``category_name``, ``brand_name`` and
            ``item_description`` columns.

    Returns:
        The same DataFrame object, with NaN in those three columns and the
        placeholder 'No description yet' replaced by "missing".
    """
    # Assign the result back instead of calling fillna(inplace=True) on an
    # attribute-accessed column: the in-place form operates on an intermediate
    # Series and is not guaranteed to update the parent frame.
    for column in ("category_name", "brand_name", "item_description"):
        df[column] = df[column].fillna("missing")
    df["item_description"] = df["item_description"].replace('No description yet', "missing")
    return df

print("Filling missing data...")
full_df = fill_missing_values(full_df)
# Spot check: [1] is a label-based lookup on the Series, so it prints every
# row whose index label is 1.
print(full_df.category_name[1])

In [None]:
# Every distinct brand value present in the combined frame.
all_brands = set(full_df['brand_name'].tolist())

# Number of rows without a brand before any recovery is attempted.
premissing = int((full_df['brand_name'] == 'missing').sum())
def brandfinder(line):
    """Recover a brand for one row from its item name.

    Args:
        line: a two-element row-like (pandas Series, list or tuple) holding
            ``(brand_name, name)`` in that order.

    Returns:
        * the item name itself when the whole name is a known brand;
        * otherwise, when the brand is 'missing', the first word of the name
          that is a known brand;
        * otherwise the original brand unchanged.
    """
    # Unpack by iteration so a label-indexed Series is not indexed by position.
    brand, name = list(line)[:2]
    if not isinstance(name, str):
        # Nothing to search in (e.g. NaN name): keep whatever brand we have.
        return brand
    # Check the full name first so multi-word brands are not cut down to
    # their first matching word.
    if name in all_brands:
        return name
    if brand == 'missing':
        for token in name.split(' '):
            # Return the matched brand word, not the whole item name; skip
            # the sentinel itself, which is also a member of all_brands.
            if token != 'missing' and token in all_brands:
                return token
    return brand
# Run the recovery on full_df: train_df still holds NaN (not 'missing') for
# absent brands and only covers the training rows, so using it would skip the
# recovery and copy train brands onto test rows through index alignment.
full_df['brand_name'] = full_df[['brand_name','name']].apply(brandfinder, axis = 1)
# Count how many previously missing brands were filled in.
found = premissing-len(full_df.loc[full_df['brand_name'] == 'missing'])
print(found)


In [None]:
# Show the size and a small sample rather than dumping the whole set.
len(all_brands), list(all_brands)[:20]

In [None]:
# Peek at the first two rows of the combined frame.
full_df.head(2)

In [None]:
# Report how many distinct values each categorical feature takes
# (a NaN, if present, counts as one value).
for label, column in [
    ('number of brands', 'brand_name'),
    ('number of item condition', 'item_condition_id'),
    ('number of cat1', 'subcat_0'),
    ('number of cat2', 'subcat_1'),
    ('number of cat3', 'subcat_2'),
]:
    print(label, full_df[column].nunique(dropna=False))

In [None]:
# Assign the filled column back; fillna(inplace=True) on an attribute-accessed
# column works on an intermediate Series and may leave full_df unchanged.
full_df['brand_name'] = full_df['brand_name'].fillna("missing")
# Train on log1p(price) so that squared error in this space is the RMSLE.
full_df["target"] = np.log1p(full_df['price'])

In [None]:
# Number of training rows. Despite the name, this is the train/test boundary
# used to slice the stacked feature matrix (rows before it are train rows).
nrow_test = len(train_df)

In [None]:
# TF-IDF over 1- to 3-grams of the item name, capped at 1000 features.
nm_tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    lowercase=True,
    max_df=0.95,
    min_df=10,
    max_features=1000,
)
X_name = nm_tfidf.fit_transform(full_df['name'].values)

In [None]:
# TF-IDF over 1- to 3-grams of the item description, capped at 1000 features.
desc_tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    lowercase=True,
    max_df=0.95,
    min_df=10,
    max_features=1000,
)
X_desc = desc_tfidf.fit_transform(full_df['item_description'].values)

In [None]:
# One bag-of-words matrix per category level. The same vectorizer object is
# refit for each level, so afterwards `wb` holds the subcat_2 vocabulary.
wb = CountVectorizer()
X_category1, X_category2, X_category3 = [
    wb.fit_transform(full_df[column])
    for column in ('subcat_0', 'subcat_1', 'subcat_2')
]
# Remaining numeric features as a dense array.
X_others = full_df[['shipping', 'item_condition_id', 'desc_len', 'name_len']].to_numpy()

In [None]:
# Shapes of every feature block. This tuple is evaluated but not rendered,
# because a notebook cell only displays its final expression.
X_name.shape,X_desc.shape,X_category1.shape,X_category2.shape,X_category3.shape,X_others.shape
# First row of the dense extras and of the top-level category matrix.
X_others[0],X_category1[0]

In [None]:
from scipy.sparse import csr_matrix, hstack, coo_matrix

# Stack all feature blocks column-wise; the dense extras are converted to
# sparse first so scipy's hstack can combine them with the text features.
merge = hstack((X_name,X_desc,X_category1,X_category2,X_category3,csr_matrix(X_others))).tocsr()
X_train = merge[:nrow_test]
X_test = merge[nrow_test:]
# Select the target by name rather than by position: "last column" stops
# being the target as soon as another column is added to full_df.
y_train = full_df['target'].iloc[:nrow_test]

In [None]:
# ml_mapper = DataFrameMapper([
#     ('brand_name',LabelEncoder()),
#     ('subcat_0',LabelEncoder()),
#     ('subcat_1',LabelEncoder()),
#     ('subcat_2',LabelEncoder()),
#     ('desc_len',None),
#     ('name_len',None),
#     ('shipping',None),
#     ('item_description',TfidfVectorizer(ngram_range=(1, 3),lowercase=True,max_df=0.95,min_df=10,max_features=1000)),
#     ('name',TfidfVectorizer(ngram_range=(1, 3),lowercase=True,max_df=0.95,min_df=10,max_features=1000)),
#     ('train_id',None),
#     ('test_id',None),
#     ('target',None)
# ])

# processed_df = ml_mapper.fit_transform(full_df)
# mapper.transformed_names_

In [None]:
# Try Xgboost , Lightgbm ,wordbatch
import lightgbm as lgb
# http://lightgbm.readthedocs.io/en/latest/Python-API.html

d_train = lgb.Dataset(X_train, label=y_train)
params = {}
params['learning_rate'] = 0.03
params['data_random_seed'] = 1
params['objective'] = 'regression'
params['metric'] = 'RMSE'
params['sub_feature'] = 0.5
params['num_leaves'] = 10
params['min_data'] = 50
params['max_depth'] = 10

# lgb.cv returns the per-round evaluation history (a dict of lists), not a
# fitted model, so it is used here only to choose the number of rounds.
cv_results = lgb.cv(params, d_train,num_boost_round=1000,early_stopping_rounds=20,verbose_eval=20,nfold=4 )

# With early stopping the history ends at the best iteration, so its length
# is the number of boosting rounds to train for. The key is looked up by
# suffix because its exact prefix differs between LightGBM versions.
mean_key = next(key for key in cv_results if key.endswith('-mean'))
best_rounds = len(cv_results[mean_key])

# Fit the final booster on all training rows and predict the test matrix.
clf = lgb.train(params, d_train, num_boost_round=best_rounds)

y_pred=clf.predict(X_test)

In [None]:
# print("Fitting Ridge model on training examples...")
# ridge_model = Ridge(
#     solver='auto', fit_intercept=True, alpha=1.0,
#     max_iter=100, normalize=False, tol=0.05, random_state = 1,
# )
# ridge_modelCV = RidgeCV(
#     fit_intercept=True, alphas=[5.0],
#     normalize=False, cv = 2, scoring='neg_mean_squared_error',
# )
# ridge_model.fit(X_train, Y_train)
# ridge_modelCV.fit(X_train, Y_train)

# Y_dev_preds_ridge = ridge_model.predict(X_dev)
# Y_dev_preds_ridge = Y_dev_preds_ridge.reshape(-1, 1)
# print("RMSL error on dev set:", rmsle(Y_dev, Y_dev_preds_ridge))

# Y_dev_preds_ridgeCV = ridge_modelCV.predict(X_dev)
# Y_dev_preds_ridgeCV = Y_dev_preds_ridgeCV.reshape(-1, 1)
# print("CV RMSL error on dev set:", rmsle(Y_dev, Y_dev_preds_ridgeCV))

**RNN**


In [None]:
print("Transforming text data to sequences...")
# Pool the lower-cased text of all three fields so one vocabulary covers them.
raw_text = np.hstack([
    full_df[column].str.lower()
    for column in ('item_description', 'name', 'category_name')
])

print("   Fitting tokenizer...")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)

print("   Transforming text to sequences...")
# Encode descriptions and names as integer sequences with the shared tokenizer.
full_df['seq_item_description'] = tok_raw.texts_to_sequences(
    full_df['item_description'].str.lower()
)
full_df['seq_name'] = tok_raw.texts_to_sequences(full_df['name'].str.lower())
# full_df['seq_category'] = tok_raw.texts_to_sequences(full_df.category_name.str.lower())


In [None]:
# (column, transformer) pairs for the RNN frame. None passes the column
# through unchanged; each categorical column gets its own LabelEncoder.
rnn_feature_defs = [
    ('seq_name', None),
    ('seq_item_description', None),
    ('brand_name', LabelEncoder()),
    ('item_condition_id', None),
    ('shipping', None),
    ('subcat_0', LabelEncoder()),
    ('subcat_1', LabelEncoder()),
    ('subcat_2', LabelEncoder()),
    ('train_id', None),
    ('test_id', None),
    ('target', None),
]
rnn_mapper = DataFrameMapper(rnn_feature_defs, df_out=True)

rnn_df = rnn_mapper.fit_transform(full_df)

In [None]:
# Train rows have no test_id and test rows have no train_id. drop() returns
# a new frame, which avoids deleting columns from a filtered slice of rnn_df
# (a chained-assignment hazard). Column order is otherwise preserved, so
# 'target' stays the last column of trainset.
trainset = rnn_df[pd.isnull(rnn_df.test_id)].drop(['test_id', 'train_id'], axis=1)
testset = rnn_df[pd.isnull(rnn_df.train_id)].drop(['train_id', 'test_id', 'target'], axis=1)

In [None]:
# Exploratory checks; only the final expression of the cell is rendered.
len(trainset),len(testset)
# NOTE(review): seq_name presumably holds lists of token ids, in which case
# Series.max() compares whole lists rather than individual ids; confirm this
# is the statistic that was intended.
full_df.seq_name.max(),np.max(full_df.seq_name.max())
# Longest description, measured in tokens.
full_df.seq_item_description.apply(lambda x:len(x)).max()

In [None]:
MAX_NAME_SEQ = 17 #17
MAX_ITEM_DESC_SEQ = 100 #269


def _max_token_id(sequences):
    """Return the largest token id found in any of ``sequences`` (0 if none)."""
    return max((max(seq) for seq in sequences if len(seq) > 0), default=0)


# Embedding input sizes must exceed the largest index that can occur.
# Series.max() on a column of lists picks the lexicographically largest list,
# so taking np.max of that result can miss the true maximum token id; scan
# every sequence instead.
MAX_TEXT = max(
    _max_token_id(rnn_df.seq_name),
    _max_token_id(rnn_df.seq_item_description),
) + 1
MAX_BRAND = int(rnn_df.brand_name.max()) + 1
MAX_CONDITION = int(rnn_df.item_condition_id.max()) + 1
MAX_SUBCAT_0 = int(rnn_df.subcat_0.max()) + 1
MAX_SUBCAT_1 = int(rnn_df.subcat_1.max()) + 1
MAX_SUBCAT_2 = int(rnn_df.subcat_2.max()) + 1

In [None]:
def get_rnn_data(dataset):
    """Convert a feature frame into the dict of named arrays fed to the RNN.

    Args:
        dataset: frame exposing ``seq_name``, ``seq_item_description``,
            ``brand_name``, ``item_condition_id``, ``shipping`` and
            ``subcat_0`` .. ``subcat_2``.

    Returns:
        dict mapping model input names to arrays: the two text sequences are
        padded to MAX_NAME_SEQ / MAX_ITEM_DESC_SEQ, the remaining columns are
        converted with ``np.array``.
    """
    model_inputs = {
        'name': pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ),
        'item_desc': pad_sequences(dataset.seq_item_description, maxlen=MAX_ITEM_DESC_SEQ),
    }
    # (model input name, source column) for the per-row scalar features.
    scalar_inputs = [
        ('brand_name', 'brand_name'),
        ('item_condition', 'item_condition_id'),
        ('num_vars', 'shipping'),
        ('subcat_0', 'subcat_0'),
        ('subcat_1', 'subcat_1'),
        ('subcat_2', 'subcat_2'),
    ]
    for input_name, column in scalar_inputs:
        model_inputs[input_name] = np.array(dataset[column])
    return model_inputs


# Hold out 20% of the labelled rows for validation.
train,valid = train_test_split(trainset, random_state=123, train_size=0.8)

# get_rnn_data reads its columns by name, so the frames can be passed as is;
# the target is likewise selected by name rather than by column position.
X_train = get_rnn_data(train)
y_train = train['target'].values

X_valid = get_rnn_data(valid)
y_valid = valid['target'].values

# (The former `X_test = testset.values` was dead code: it was overwritten
# here before ever being read.)
X_test = get_rnn_data(testset)

In [None]:
# MAX_SUBCAT_0,MAX_SUBCAT_1,MAX_SUBCAT_2
# rnn_df.columns

In [None]:
# RNN

# Re-seed NumPy's global RNG so that re-running the next two cells in a loop
# (while adjusting the model) starts from the same NumPy random state.
np.random.seed(123)

def new_rnn_model(lr=0.001, decay=0.0):
    """Build and compile the GRU-based regression network.

    The two text inputs (name, item description) are embedded and summarised
    by GRU layers; each categorical input gets its own embedding; everything
    is concatenated with the numeric ``num_vars`` input and passed through a
    stack of dense layers ending in a single linear output.

    Args:
        lr: learning rate passed to the Adam optimizer.
        decay: learning-rate decay passed to the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` using mean squared error as the loss. Its
        input names match the keys produced by ``get_rnn_data``.
    """
    # Inputs. Sequence lengths come from the same constants used for padding,
    # so the model and the padded data cannot drift apart.
    name = Input(shape=(MAX_NAME_SEQ,), name="name")
    item_desc = Input(shape=(MAX_ITEM_DESC_SEQ,), name="item_desc")
    brand_name = Input(shape=(1,), name="brand_name")
    item_condition = Input(shape=(1,), name="item_condition")
    num_vars = Input(shape=(1,), name="num_vars")
    subcat_0 = Input(shape=(1,), name="subcat_0")
    subcat_1 = Input(shape=(1,), name="subcat_1")
    subcat_2 = Input(shape=(1,), name="subcat_2")

    # Embedding layers (output sizes can be tuned to help the model).
    emb_name = Embedding(MAX_TEXT, 20)(name)
    emb_item_desc = Embedding(MAX_TEXT, 60)(item_desc)
    emb_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    emb_subcat_0 = Embedding(MAX_SUBCAT_0, 10)(subcat_0)
    emb_subcat_1 = Embedding(MAX_SUBCAT_1, 10)(subcat_1)
    emb_subcat_2 = Embedding(MAX_SUBCAT_2, 10)(subcat_2)

    # Recurrent layers (GRUs are faster than LSTMs and speed is important here).
    rnn_layer1 = GRU(16) (emb_item_desc)
    rnn_layer2 = GRU(8) (emb_name)

    # Merge every branch into one feature vector.
    main_l = concatenate([
        Flatten() (emb_brand_name)
        , Flatten() (emb_item_condition)
        , Flatten() (emb_subcat_0)
        , Flatten() (emb_subcat_1)
        , Flatten() (emb_subcat_2)
        , rnn_layer1
        , rnn_layer2
        , num_vars
    ])
    # Increasing the nodes or adding dense layers does not affect the run time
    # nearly as much as the recurrent layers do.
    main_l = Dropout(0.1)(Dense(512,kernel_initializer='normal',activation='relu') (main_l))
    main_l = Dropout(0.1)(Dense(256,kernel_initializer='normal',activation='relu') (main_l))
    main_l = Dropout(0.1)(Dense(128,kernel_initializer='normal',activation='relu') (main_l))
    main_l = Dropout(0.1)(Dense(64,kernel_initializer='normal',activation='relu') (main_l))

    # The output layer.
    output = Dense(1, activation="linear") (main_l)

    # Every Input that feeds the graph must be listed here; num_vars is part
    # of the concatenation above, so leaving it out disconnects the graph.
    model = Model(
        [name, item_desc, brand_name, item_condition, num_vars,
         subcat_0, subcat_1, subcat_2],
        output,
    )

    optimizer = Adam(lr=lr, decay=decay)
    # Mean squared error works as well as custom loss functions here.
    model.compile(loss = 'mse', optimizer = optimizer)

    return model

# Build the network once with default hyper-parameters to inspect its layers.
model = new_rnn_model()
model.summary()
# del model


In [None]:
# Training hyper-parameters.
BATCH_SIZE = 512 * 3
epochs = 2


def exp_decay(init, fin, steps):
    """Decay value computed from the initial/final rate ratio over ``steps`` updates."""
    return (init/fin)**(1/(steps-1)) - 1


# Whole batches per epoch, times the number of epochs.
steps = (len(train) // BATCH_SIZE) * epochs
lr_init, lr_fin = 0.005, 0.001
lr_decay = exp_decay(lr_init, lr_fin, steps)

# Build a fresh model with this schedule and fit it on the training split,
# monitoring the held-out validation split.
rnn_model = new_rnn_model(lr=lr_init, decay=lr_decay)
rnn_model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=BATCH_SIZE,
    validation_data=(X_valid, y_valid),
    verbose=1,
)

In [None]:
# Predict for the test inputs. The network was fit on the log1p-transformed
# price, so these predictions are in log space.
preds = rnn_model.predict(X_test)


In [None]:
# Pair each prediction with the real test_id instead of assuming the ids are
# the row positions 0..n-1. Test rows keep their test_df order throughout the
# pipeline, so the two line up one-to-one.
if len(preds) != len(test_df):
    raise ValueError(
        "expected one prediction per test row, got %d predictions for %d rows"
        % (len(preds), len(test_df))
    )
preds_df = pd.DataFrame(
    {'test_id': test_df['test_id'].values, 'price': np.ravel(preds)},
    columns=['test_id', 'price'],
)

In [None]:
# Undo the log1p transform of the target. The values are widened to float64
# first, matching the precision of the element-wise version.
preds_df['price'] = np.expm1(preds_df['price'].astype('float64'))
preds_df.to_csv('rnn_submit_1.csv', index=False)

In [None]:
# https://github.com/anttttti/Wordbatch