In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import wordbatch 
from wordbatch.extractors import WordBag, WordHash
from wordbatch.models import FTRL, FM_FTRL

import time
from datetime import datetime 
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn_pandas import DataFrameMapper, cross_val_score

from nltk.corpus import stopwords
import math
from scipy.sparse import csr_matrix, hstack, coo_matrix

# set seed
np.random.seed(123)
start_time = time.time()

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

# Try wordbatch
# Try RNN
# Look at RMSLE Error Function
# https://www.kaggle.com/valkling/mercari-rnn-2ridge-models-with-notes-0-42755
# Combine both sets and do processing together. 



sample_submission.csv
test.tsv
train.tsv



In [2]:
train_df = pd.read_csv('../input/train.tsv',sep='\t')
test_df = pd.read_csv('../input/test.tsv',sep='\t')

In [3]:
# Use to split the sets later
nrow_test = len(train_df)

In [4]:
full_df = pd.concat([train_df,test_df])

In [5]:
full_df.head(2)

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
0,,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0


In [6]:
# get name and description lengths
def wordCount(text):
    try:
        if text == 'No description yet':
            return 0
        else:
            text = text.lower()
            words = [w for w in text.split(" ")]
            return len(words)
    except: 
        return 0
full_df['desc_len'] = train_df['item_description'].apply(lambda x: wordCount(x))
full_df['name_len'] = train_df['name'].apply(lambda x: wordCount(x))


In [7]:
# split category name into 3 parts
def split_cat(text):
    try: return text.split("/")
    except: return ("No Label", "No Label", "No Label")
    
full_df['subcat_0'], full_df['subcat_1'], full_df['subcat_2'] = \
zip(*full_df['category_name'].apply(lambda x: split_cat(x)))

# Filling missing values
def fill_missing_values(df):
    df.category_name.fillna(value="missing", inplace=True)
    df.brand_name.fillna(value="missing", inplace=True)
    df.item_description.fillna(value="missing", inplace=True)
    df.item_description.replace('No description yet',"missing", inplace=True)
    return df

print("Filling missing data...")
full_df = fill_missing_values(full_df)
print(full_df.category_name[1])

Filling missing data...
1    Electronics/Computers & Tablets/Components & P...
1              Other/Office supplies/Shipping Supplies
Name: category_name, dtype: object


In [8]:
all_brands = set(full_df['brand_name'].values)

# Get missing brand name from name
premissing = len(full_df.loc[full_df['brand_name'] == 'missing'])
def brandfinder(line):
    brand = line[0]
    name = line[1]
    namesplit = name.split(' ')
    if brand == 'missing':
        for x in namesplit:
            if x in all_brands:
                return name
    if name in all_brands:
        return name
    return brand
full_df['brand_name'] = train_df[['brand_name','name']].apply(brandfinder, axis = 1)
found = premissing-len(full_df.loc[train_df['brand_name'] == 'missing'])
print(found)


928207


In [9]:
print('number of brands', len(full_df.brand_name.unique()))
print('number of item condition', len(full_df.item_condition_id.unique()))
print('number of cat1', len(full_df.subcat_0.unique()))
print('number of cat2', len(full_df.subcat_1.unique()))
print('number of cat3', len(full_df.subcat_2.unique()))

number of brands 4823
number of item condition 5
number of cat1 11
number of cat2 114
number of cat3 883


In [10]:
full_df.brand_name.fillna(value="missing", inplace=True)
full_df["target"] = np.log1p(full_df.price)

In [11]:
def normalize_text(text):
    return u" ".join(
        [y for y in  text.lower().strip().split(" ") if len(y) > 1])

In [12]:
# nm_tfidf = TfidfVectorizer(ngram_range=(1, 3),lowercase=True,max_df=0.95,min_df=10,max_features=1000)
# X_name = nm_tfidf.fit_transform(full_df['name'].values)
wb_nm = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],"hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,"idf": None}), procs=1)
wb_nm.dictionary_freeze= True
X_name = wb_nm.fit_transform(full_df['name'])

# desc_tfidf = TfidfVectorizer(ngram_range=(1, 3),lowercase=True,max_df=0.95,min_df=10,max_features=1000)
# X_desc = desc_tfidf.fit_transform(full_df['item_description'].values)
wb_desc = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],"hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,"idf": None}), procs=1)
wb_desc.dictionary_freeze= True
X_desc = wb_desc.fit_transform(full_df['item_description'])



Normalize text
Extract wordbags
Normalize text
Extract wordbags


In [13]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(full_df['brand_name'])

wb = CountVectorizer()
X_category1 = wb.fit_transform(full_df['subcat_0'])
X_category2 = wb.fit_transform(full_df['subcat_1'])
X_category3 = wb.fit_transform(full_df['subcat_2'])
X_others = full_df[['shipping','item_condition_id','desc_len','name_len']].values

X_name.shape,X_desc.shape,X_category1.shape,X_category2.shape,X_category3.shape,X_others.shape
X_others[0],X_category1[0]



(array([1, 3, 0, 7]), <1x14 sparse matrix of type '<class 'numpy.int64'>'
 	with 1 stored elements in Compressed Sparse Row format>)

In [14]:
# merge = np.hstack((X_name,X_desc,X_category1,X_category2,X_category3,X_others))
merge = hstack((X_name,X_desc,X_brand,X_category1,X_category2,X_category3,csr_matrix(X_others))).tocsr()

# only get columns where there is at least 1 element
mask= np.where(merge.getnnz(axis=0) > 1)[0]
merge = merge[:,mask]

X_train = merge[:nrow_test]
X_test = merge[nrow_test:]
# y_train = full_df['target'].values
y_train = np.array(full_df.iloc[:nrow_test,-1])

In [None]:
merge.shape[1]

3302410

In [None]:
model_ftrl = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=merge.shape[1], iters=50, inv_link="identity", threads=8)
model_ftrl.fit(X_train, y_train)
y_pred_ftrl = model_ftrl.predict(X_test)


In [None]:
model_fm_ftrl = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,D_fm=200, e_noise=0.0001, iters=15, inv_link="identity", threads=8)
model_fm_ftrl.fit(X_train, y_train)
y_pred_fm_ftrl = model_fm_ftrl.predict(X_test)

In [None]:
import lightgbm as lgb
# http://lightgbm.readthedocs.io/en/latest/Python-API.html
X_train_lgb, X_valid_lgb, y_train_lgb, y_valid_lgb = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
d_train = lgb.Dataset(X_train_lgb, label=y_train_lgb)
d_valid = lgb.Dataset(X_valid_lgb, label=y_valid_lgb)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression', # 'binary'
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}


clf = lgb.train(params, d_train, valid_sets=d_valid,num_boost_round=500,early_stopping_rounds=50)
y_pred1=clf.predict(X_test)

In [None]:
y_preds = (np.expm1(y_pred_ftrl) + np.expm1(y_pred_fm_ftrl))/2
preds_df = pd.DataFrame(y_preds)
preds_df.reset_index(inplace=True)
preds_df.columns = ['test_id','price']
preds_df.to_csv('ensemble_submit_1.csv',index=False)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
preds_df = pd.DataFrame(y_pred_ftrl)
preds_df.reset_index(inplace=True)
preds_df.columns = ['test_id','price']

preds_df['price'] = preds_df.price.apply(lambda x:np.expm1(x))
preds_df.to_csv('ftrl_submit_1.csv',index=False)


In [None]:
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Try Xgboost , Lightgbm ,wordbatch

In [None]:
# print("Fitting Ridge model on training examples...")
# ridge_model = Ridge(
#     solver='auto', fit_intercept=True, alpha=1.0,
#     max_iter=100, normalize=False, tol=0.05, random_state = 1,
# )
# ridge_modelCV = RidgeCV(
#     fit_intercept=True, alphas=[5.0],
#     normalize=False, cv = 2, scoring='neg_mean_squared_error',
# )
# ridge_model.fit(X_train, Y_train)
# ridge_modelCV.fit(X_train, Y_train)

# Y_dev_preds_ridge = ridge_model.predict(X_dev)
# Y_dev_preds_ridge = Y_dev_preds_ridge.reshape(-1, 1)
# print("RMSL error on dev set:", rmsle(Y_dev, Y_dev_preds_ridge))

# Y_dev_preds_ridgeCV = ridge_modelCV.predict(X_dev)
# Y_dev_preds_ridgeCV = Y_dev_preds_ridgeCV.reshape(-1, 1)
# print("CV RMSL error on dev set:", rmsle(Y_dev, Y_dev_preds_ridgeCV))

In [None]:
# https://github.com/anttttti/Wordbatch