In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import time
from datetime import datetime 
start_real = datetime.now()
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn_pandas import DataFrameMapper, cross_val_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, concatenate, GRU, Embedding, Flatten, Activation
# from keras.layers import Bidirectional
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
# from nltk.corpus import stopwords
import math
import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline  

# set seed
np.random.seed(123)


Using TensorFlow backend.


In [2]:
# Need to change to Log ? 

def rmsle(Y, Y_pred):
    assert Y.shape == Y_pred.shape
    return np.sqrt(np.mean(np.square(Y_pred - Y )))

In [3]:
train_df = pd.read_csv('train.tsv',sep='\t')
test_df = pd.read_csv('test.tsv',sep='\t')

In [4]:
full_df = pd.concat([train_df,test_df])

In [5]:
# get name and description lengths
def wordCount(text):
    try:
        if text == 'No description yet':
            return 0
        else:
            text = text.lower()
            words = [w for w in text.split(" ")]
            return len(words)
    except: 
        return 0
full_df['desc_len'] = train_df['item_description'].apply(lambda x: wordCount(x))
full_df['name_len'] = train_df['name'].apply(lambda x: wordCount(x))


In [6]:
# split category name into 3 parts
def split_cat(text):
    try: return text.split("/")
    except: return ("No Label", "No Label", "No Label")
    
full_df['subcat_0'], full_df['subcat_1'], full_df['subcat_2'] = \
zip(*full_df['category_name'].apply(lambda x: split_cat(x)))

# Filling missing values
def fill_missing_values(df):
    df.category_name.fillna(value="missing", inplace=True)
    df.brand_name.fillna(value="missing", inplace=True)
    df.item_description.fillna(value="missing", inplace=True)
    df.item_description.replace('No description yet',"missing", inplace=True)
    return df

print("Filling missing data...")
full_df = fill_missing_values(full_df)
print(full_df.category_name[1])

Filling missing data...
1    Electronics/Computers & Tablets/Components & P...
1              Other/Office supplies/Shipping Supplies
Name: category_name, dtype: object


In [7]:
all_brands = set(full_df['brand_name'].values)

# Get missing brand name from name
premissing = len(full_df.loc[full_df['brand_name'] == 'missing'])
def brandfinder(line):
    brand = line[0]
    name = line[1]
    namesplit = name.split(' ')
    if brand == 'missing':
        for x in namesplit:
            if x in all_brands:
                return name
    if name in all_brands:
        return name
    return brand
full_df['brand_name'] = train_df[['brand_name','name']].apply(brandfinder, axis = 1)
found = premissing-len(full_df.loc[train_df['brand_name'] == 'missing'])
print(found)


928207


In [8]:
print('number of brands', len(full_df.brand_name.unique()))
print('number of item condition', len(full_df.item_condition_id.unique()))
print('number of cat1', len(full_df.subcat_0.unique()))
print('number of cat2', len(full_df.subcat_1.unique()))
print('number of cat3', len(full_df.subcat_2.unique()))

number of brands 4823
number of item condition 5
number of cat1 11
number of cat2 114
number of cat3 883


In [9]:
full_df.brand_name.fillna(value="missing", inplace=True)
full_df['shipping'] = full_df['shipping'].astype('int')
full_df["target"] = np.log1p(full_df.price)

**RNN******


In [10]:
print("Transforming text data to sequences...")
raw_text = np.hstack([full_df.item_description.str.lower(), full_df.name.str.lower(), full_df.category_name.str.lower()])

print("   Fitting tokenizer...")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)

print("   Transforming text to sequences...")
full_df['seq_item_description'] = tok_raw.texts_to_sequences(full_df.item_description.str.lower())
full_df['seq_name'] = tok_raw.texts_to_sequences(full_df.name.str.lower())
# full_df['seq_category'] = tok_raw.texts_to_sequences(full_df.category_name.str.lower())


Transforming text data to sequences...
   Fitting tokenizer...
   Transforming text to sequences...


In [11]:
full_df.columns

Index(['brand_name', 'category_name', 'item_condition_id', 'item_description',
       'name', 'price', 'shipping', 'test_id', 'train_id', 'desc_len',
       'name_len', 'subcat_0', 'subcat_1', 'subcat_2', 'target',
       'seq_item_description', 'seq_name'],
      dtype='object')

In [12]:
# Steps
# Update rnn_mapper
# update get_rnn_data
# Add input to model and also the final model = Model(input,outp)

In [13]:
rnn_mapper = DataFrameMapper([
    ('seq_name',None),
    ('seq_item_description',None),
    ('brand_name',LabelEncoder()),
    ('item_condition_id',None),
    ('desc_len',None),
    ('name_len',None),
    ('shipping',None),
    ('subcat_0',LabelEncoder()),
    ('subcat_1',LabelEncoder()),
    ('subcat_2',LabelEncoder()),
    ('train_id',None),
    ('test_id',None),
    ('target',None)
],df_out=True)

rnn_df = rnn_mapper.fit_transform(full_df)

In [14]:
trainset = rnn_df[pd.isnull(rnn_df.test_id)].copy()
del trainset['test_id']
del trainset['train_id']
testset = rnn_df[pd.isnull(rnn_df.train_id)].copy()
del testset['train_id']
del testset['test_id']
del testset['target']

In [15]:
MAX_NAME_SEQ = 17 #17
MAX_ITEM_DESC_SEQ = 269 #269
MAX_TEXT = np.max([
    np.max(rnn_df.seq_name.max()),
    np.max(rnn_df.seq_item_description.max()),
#     np.max(full_df.seq_category.max()),
]) + 100
MAX_BRAND = np.max(rnn_df.brand_name.max()) + 1
MAX_CONDITION = np.max(rnn_df.item_condition_id.max()) + 1
# MAX_DESC_LEN = np.max(rnn_df.desc_len.max()) + 1
# MAX_NAME_LEN = np.max(rnn_df.name_len.max()) + 1
MAX_SUBCAT_0 = np.max(rnn_df.subcat_0.max()) + 1
MAX_SUBCAT_1 = np.max(rnn_df.subcat_1.max()) + 1
MAX_SUBCAT_2 = np.max(rnn_df.subcat_2.max()) + 1

In [16]:
def get_rnn_data(dataset):
    X = {
        'name': pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ),
        'item_desc': pad_sequences(dataset.seq_item_description, maxlen=MAX_ITEM_DESC_SEQ),
        'brand_name': np.array(dataset.brand_name),
#         'category': np.array(dataset.category),
#         'category_name': pad_sequences(dataset.seq_category, maxlen=MAX_CATEGORY_SEQ),
        'item_condition': np.array(dataset.item_condition_id),
        'num_vars': np.array(dataset.shipping),
        'desc_len': np.array(dataset.desc_len),
        'name_len': np.array(dataset.name_len),
        'subcat_0': np.array(dataset.subcat_0),
        'subcat_1': np.array(dataset.subcat_1),
        'subcat_2': np.array(dataset.subcat_2),
    }
    return X


train,valid = train_test_split(trainset, random_state=123, train_size=0.8)
X_test = testset.values

X_train = get_rnn_data(train.iloc[:,0:-1].copy())
y_train = train.iloc[:,-1].values
# Y_train = train.target.values.reshape(-1, 1)

X_valid = get_rnn_data(valid.iloc[:,0:-1].copy())
y_valid = valid.iloc[:,-1].values
# Y_dev = dev.target.values.reshape(-1, 1)

X_test = get_rnn_data(testset)

In [17]:
# NEWLY ADDED

def rmsle_K(y, y0):
    return K.sqrt(K.mean(K.square(tf.log1p(y) - tf.log1p(y0))))

In [18]:
# RNN

# set seed again in case testing models adjustments by looping next 2 blocks
np.random.seed(123)

def new_rnn_model(lr=0.001, decay=0.0,dropout=0.3):
    # Inputs
    name = Input(shape=(17,), name="name")
    item_desc = Input(shape=(269,), name="item_desc")
    brand_name = Input(shape=(1,), name="brand_name")
    item_condition = Input(shape=(1,), name="item_condition")
    num_vars = Input(shape=(1,), name="num_vars")
    desc_len = Input(shape=[1], name="desc_len")
    name_len = Input(shape=[1], name="name_len")
    subcat_0 = Input(shape=(1,), name="subcat_0")
    subcat_1 = Input(shape=(1,), name="subcat_1")
    subcat_2 = Input(shape=(1,), name="subcat_2")

    # Embeddings layers (adjust outputs to help model)
    emb_name = Embedding(MAX_TEXT, 20)(name)
    emb_item_desc = Embedding(MAX_TEXT, 60)(item_desc)
    emb_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
#     emb_category_name = Embedding(MAX_TEXT, 20)(category_name)
#     emb_category = Embedding(MAX_CATEGORY, 10)(category)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
#     emb_desc_len = Embedding(MAX_DESC_LEN, 5)(desc_len)
#     emb_name_len = Embedding(MAX_NAME_LEN, 5)(name_len)
    emb_subcat_0 = Embedding(MAX_SUBCAT_0, 10)(subcat_0)
    emb_subcat_1 = Embedding(MAX_SUBCAT_1, 10)(subcat_1)
    emb_subcat_2 = Embedding(MAX_SUBCAT_2, 10)(subcat_2)
    

    # rnn layers (GRUs are faster than LSTMs and speed is important here)
    rnn_layer1 = GRU(16) (emb_item_desc)
    rnn_layer2 = GRU(8) (emb_name)
#     rnn_layer3 = GRU(8) (emb_category_name)

    # CNN Layer
#     cnn_layer1 = Conv1D(filters=16, kernel_size=3, activation='relu') (emb_item_desc)
#     cnn_layer1 = GlobalMaxPooling1D()(cnn_layer1)

#     cnn_layer2 = Conv1D(filters=8, kernel_size=3, activation='relu')(emb_name)
#     cnn_layer2 = GlobalMaxPooling1D()(cnn_layer2)

# Another option of CNN Layer
# x = Conv1D(128, 5, activation='relu')(embedded_sequences)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(35)(x)  # global max pooling
# x = Flatten()(x)
# x = Dense(128, activation='relu')(x)

    # main layers
    main_l = concatenate([
        Flatten() (emb_brand_name)
#         , Flatten() (emb_category)
        , Flatten() (emb_item_condition)
#         , Flatten() (emb_desc_len)
#         , Flatten() (emb_name_len)
        , Flatten() (emb_subcat_0)
        , Flatten() (emb_subcat_1)
        , Flatten() (emb_subcat_2)
        , rnn_layer1
        , rnn_layer2
#         , rnn_layer3
        , num_vars
    ])
    # (incressing the nodes or adding layers does not effect the time quite as much as the rnn layers)
    main_l = Dropout(dropout)(Dense(512,kernel_initializer='normal',activation='relu') (main_l))
    main_l = Dropout(dropout)(Dense(256,kernel_initializer='normal',activation='relu') (main_l))
    main_l = Dropout(dropout)(Dense(128,kernel_initializer='normal',activation='relu') (main_l))
    main_l = Dropout(dropout)(Dense(64,kernel_initializer='normal',activation='relu') (main_l))

    # the output layer.
    output = Dense(1, activation="linear") (main_l)
    
    model = Model([name, item_desc, brand_name , item_condition,num_vars,desc_len,name_len
                   , subcat_0, subcat_1, subcat_2], output)

    optimizer = Adam(lr=lr, decay=decay)
    # (mean squared error loss function works as well as custom functions)  
    model.compile(loss = 'mse' , optimizer = optimizer, metrics=[rmsle_K])

    return model

# model = new_rnn_model()
# model.summary()
# del model


In [27]:
track = pd.DataFrame(columns=['p_lr','p_epoch','p_batch','p_drop','loss','val_loss','time'])

In [74]:
p_lr = [0.005]
p_epochs = [3]
p_batch = [512,2048]
p_drop = [0.1,0.3]

In [75]:
from itertools import product

params = product(p_lr,p_epoch,p_batch,p_drop)

In [35]:
# len(list(params))
# params = [(0.005,3,512,0.1),
#           (0.01,3,512,0.1),
#           (0.007,3,512,0.1)]
          

In [None]:
# Set hyper parameters for the model.
# p_lr = 0.005
# p_batch = 512 * 3
# p_epochs = 3
# p_drop = 0.3

for i in params:
    print(i)
    p_lr, p_epochs,p_batch,p_drop = i

    exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
    steps = int(len(train) / p_batch) * p_epochs
    lr_init, lr_fin = p_lr, 0.001
    lr_decay = exp_decay(lr_init, lr_fin, steps)

    # Create model and fit it with training dataset.
    # rnn_model = new_rnn_model(lr=lr_init, decay=lr_decay)
    rnn_model = new_rnn_model(lr=lr_init, decay=lr_decay)

    start_time = time.time()

    history =  rnn_model.fit(
            X_train, y_train, epochs=p_epochs, batch_size=p_batch,
            validation_data=(X_valid, y_valid), verbose=1,
    )

    run_time = time.time() - start_time
    print("--- %s seconds ---" % run_time)
    # Track
    track.loc[len(track)]=[p_lr,p_epochs,p_batch,p_drop,history.history['loss'][-1],history.history['val_loss'][-1],run_time] 


    print(history.history.keys())
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

(0.005, 3, 512, 0.1)
Train on 1186028 samples, validate on 296507 samples
Epoch 1/3

In [40]:
track.to_csv('results.txt')

In [41]:
track

Unnamed: 0,p_lr,p_epoch,p_batch,p_drop,loss,val_loss,time
0,0.005,3.0,512.0,0.1,0.166646,0.196117,1364.526331
1,0.01,3.0,512.0,0.1,0.170712,0.193605,1355.465158
2,0.007,3.0,512.0,0.1,0.1685,0.19208,1355.928638


In [None]:
# Try using msle vs mse as metrics

# Dropout = 0.3 , Epoch = 3 , Init = 0.005 => 0.001 , BATCH SIZE = 1536
# Loss = 0.22 ,  Val_loss = 0.1966

# Dropout = 0.3 , Epoch = 5 , Init = 0.005 => 0.001 , BATCH SIZE = 1536
# Loss = 0.1607 ,  Val_loss = 0.1964

# Dropout = 0.5 , Epoch = 3 , Init = 0.005 => 0.001 , BATCH SIZE = 1536
# Loss = 0.2497 ,  Val_loss = 0.1977



In [None]:
preds = rnn_model.predict(X_test)


In [None]:
preds_df = pd.DataFrame(preds)
preds_df.reset_index(inplace=True)
preds_df.columns = ['test_id','price']

In [None]:
preds_df['price'] = preds_df.price.apply(lambda x:np.expm1(x))
preds_df.to_csv('rnn_submit_2.csv',index=False)

In [None]:
# https://github.com/anttttti/Wordbatch

In [None]:
from hyperas.distributions import uniform