In [1]:

import numpy as np
import pandas as pd 

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
print('loading train')
train = pd.read_csv('input/train.csv')
train_rows = len(train)
print(train_rows)

print('loading test')
df = pd.concat([train, pd.read_csv('input/test.csv')])

loading train
1503424
loading test


In [3]:
categorical = ['item_id', 'user_id', 'region', 'city', 'parent_category_name', 'category_name',  'item_seq_number', 'user_type']
text = ['title', 'description']
target = 'deal_probability'

for c in categorical:
    print(c)
    le = preprocessing.LabelEncoder()
    df[c] = le.fit_transform(df[c])

item_id
user_id
region
city
parent_category_name
category_name
item_seq_number
user_type


In [4]:
vectorizers = []
for c in text:
    print('fitting %s' % c)
    v = TfidfVectorizer(max_features=100000, token_pattern='\w+', ngram_range=(1, 2))
    v.fit(df[c].fillna(''))
    vectorizers.append(v)
print('.')

fitting title
fitting description
.


In [5]:
print('title')
title = vectorizers[0].transform(df.loc[:, 'title'].fillna('').values)

print('desc')
desc = vectorizers[1].transform(df.loc[:, 'description'].fillna('').values)

print('.')

title
desc
.


In [6]:
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split

X = hstack([df[categorical], df[['price']], title, desc]).tocsr()
y = df[target]

X_train, y_train = X[:train_rows, :], y[:train_rows]
X_test = X[train_rows:, :]
print(X_train.shape, X_test.shape)


(1503424, 200009) (508438, 200009)


In [7]:
import lightgbm as lgb
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.3,
    #'feature_fraction': 0.9,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 5,
    'verbose': 0
}

In [8]:
from sklearn.model_selection import KFold, StratifiedKFold

y_pred = np.zeros((len(df) - train_rows, ), dtype=np.float32)

kf = KFold(n_splits=5, random_state=0)
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print('fold', fold+1)
    X_fold, y_fold = X_train[train_idx, :], y_train[train_idx]
    X_valid, y_valid = X_train[val_idx, :], y_train[val_idx]
    
    lgb_train = lgb.Dataset(X_fold, y_fold, categorical_feature=[idx for idx, name in enumerate(categorical)], free_raw_data=False)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    
    print('train.')
    
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=3000,
                    valid_sets=[lgb_train, lgb_valid],
                    early_stopping_rounds=100,
                    verbose_eval=50)
    
    y_pred += gbm.predict(X_test, num_iteration=gbm.best_iteration)    
    fold += 1
    
y_pred /= 5


fold 1
train.




Training until validation scores don't improve for 100 rounds.
[50]	training's rmse: 0.225699	valid_1's rmse: 0.227623
[100]	training's rmse: 0.222419	valid_1's rmse: 0.226209
[150]	training's rmse: 0.219883	valid_1's rmse: 0.225506
[200]	training's rmse: 0.218031	valid_1's rmse: 0.225107
[250]	training's rmse: 0.216388	valid_1's rmse: 0.224856
[300]	training's rmse: 0.214931	valid_1's rmse: 0.224739
[350]	training's rmse: 0.213673	valid_1's rmse: 0.224636
[400]	training's rmse: 0.21242	valid_1's rmse: 0.224571
[450]	training's rmse: 0.211338	valid_1's rmse: 0.22453
[500]	training's rmse: 0.210131	valid_1's rmse: 0.224508
[550]	training's rmse: 0.209122	valid_1's rmse: 0.224498
[600]	training's rmse: 0.208081	valid_1's rmse: 0.224558
Early stopping, best iteration is:
[548]	training's rmse: 0.209155	valid_1's rmse: 0.224496
fold 2
train.
Training until validation scores don't improve for 100 rounds.
[50]	training's rmse: 0.225839	valid_1's rmse: 0.228073
[100]	training's rmse: 0.222422

In [9]:
sample_submission = pd.read_csv('input/sample_submission.csv')
sample_submission['deal_probability'] = y_pred
sample_submission.to_csv('sample.csv', index=False)