In [2]:
import pandas as pd
import numpy as np

from math import sin, cos, sqrt, atan2, radians, degrees, fabs
import string as str
import re
from random import randint

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection 
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import GaussianMixture


import xgboost as xgb
from nltk.stem import PorterStemmer

In [3]:
#reading data
train_df = pd.read_json("train.json")
test_df = pd.read_json("test.json")

#creating test_train and y_train
y_train = train_df.interest_level
test_df.interest_level = -1
train_test = pd.concat([train_df,test_df])

y_train_dummies = pd.get_dummies(y_train, prefix = 'interest')
emp_bayes = pd.read_csv('emp_bayes.csv').set_index(train_test.index)

In [4]:
features_to_use= []

#clipping outliers
train_test['bathrooms'] = train_test.bathrooms.clip_upper(5)
train_test['bedrooms'] = train_test.bedrooms.clip_upper(5)
train_test['price'] = train_test.price.clip_upper(20000)

features_to_use.extend(['bathrooms','bedrooms','price'])

# Composite features based on: 
# https://www.kaggle.com/arnaldcat/two-sigma-connect-rental-listing-inquiries/a-proxy-for-sqft-and-the-interest-on-1-2-baths

train_test['num_priceXroom'] = (train_test.price / (1 + train_test.bedrooms.clip(1, 4) + 0.5*train_test.bathrooms.clip(0, 2))).values
features_to_use.append('num_priceXroom')

In [5]:
#empirical bayes
train_test = pd.merge(train_test,emp_bayes, left_index=True, right_index=True)
features_to_use.extend(emp_bayes.columns)

In [6]:
# count of photos 
train_test["num_photos"] = train_test["photos"].apply(len)
features_to_use.append('num_photos')

# is bulding id present or not
train_test['building_id_present'] = train_test['building_id'].apply(lambda x: 0 if x == '0' else 1)
features_to_use.append('building_id_present')

# #stores which have listing higher than 7235000 always have low interest
train_test['bad_listing'] = train_test['listing_id'].apply(lambda x: 1 if x>7235000 else 0)
features_to_use.append('bad_listing')


In [7]:
#importing clean features
features_dup = pd.read_csv("feature_duplicate.csv")

#creating a dictinary of deduplicated features
features = train_test[["features"]].apply(lambda _: [list(map(str.strip, map(str.lower, x))) for x in _])
features_dict = features_dup.set_index('original_feature')['unique_feature'].to_dict()

def features_map_func(x):
    temp_list = []
    for i in x:
        if i in features_dict.keys():
            temp_list.append(features_dict[i])
    return temp_list

#cleaning up features
train_test['new_features'] = features.features.apply(lambda x : features_map_func(x))
train_test['new_features'] = train_test["new_features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))

#count vectorizing features
count_vec = CountVectorizer(stop_words='english', max_features=100)

train_test_count_vec_feat = pd.DataFrame(count_vec.fit_transform(train_test["new_features"]).todense(), columns = [s + '_feat' for s in count_vec.vocabulary_.keys()] ).set_index(train_test.index)
train_test = train_test.merge(train_test_count_vec_feat,how='left', left_index = True, right_index = True)

features_to_use.extend([s + '_feat' for s in count_vec.vocabulary_.keys()] )



In [8]:
#clustering neighbourhood

lat_long = train_test[train_test.longitude> -74.05][train_test.longitude< -73.875][train_test.latitude> 40.63][train_test.latitude< 40.87]
cluster = lat_long[['latitude','longitude']]

model_gm = GaussianMixture(n_components=40, covariance_type='full',tol = 0.01, max_iter=5000, random_state=7, verbose=0)
pred_gm = pd.DataFrame(model_gm.fit(cluster).predict(cluster)).set_index(cluster.index)
pred_gm.columns = ['pred_gm']


train_test = pd.merge(train_test, pred_gm, how = 'left', left_index=True, right_index=True)
train_test.pred_gm[train_test.pred_gm.isnull()] = -1

dummy_neighbourhood = pd.get_dummies(train_test.pred_gm, prefix = 'dummy_nb_')

train_test = train_test.merge(dummy_neighbourhood, how='left', left_index = True, right_index = True)

features_to_use.extend(dummy_neighbourhood.columns)

  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
### Calculate the distance of all latitude & longitude from the city center 

# approximate radius of earth in km
R = 6373.0

location_dict = {
'manhatten_loc' : [40.7527, -73.9943],
'brooklyn_loc' : [45.0761,-73.9442],
'bronx_loc' : [40.8448,-73.8648],
'queens_loc' : [40.7282,-73.7949],
'staten_loc' : [40.5795,-74.1502]}

for location in location_dict.keys():

    lat1 = train_test['latitude'].apply(radians)
    lon1 = train_test['longitude'].apply(radians)
    lat2 = radians(location_dict[location][0])
    lon2 = radians(location_dict[location][1])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    def power(x):
        return x**2

    a = (dlat/2).apply(sin).apply(power) + lat1.apply(cos) * cos(lat2) * (dlon/2).apply(sin).apply(power)
    c = 2 * a.apply(sqrt).apply(sin)

    ### Add a new column called distance
    train_test['distance_' + location] = R * c
    features_to_use.append('distance_' + location)
    
    x = lat1.apply(cos)*sin(lat2) - lat1.apply(sin) * cos(lat2) * (lon2 - lon1).apply(cos)
    y = (lon2 - lon1).apply(sin)* cos(lat2)

    ### Create a new colum as degrees
    train_test['degrees_' + location] = (np.arctan2(x,y)).apply(degrees).apply(fabs)

In [10]:
#unique buildings managers and addresses

def find_objects_with_only_one_record(feature_name):
    temp = train_test[feature_name].reset_index()
    temp = temp.groupby(feature_name, as_index = False).count()
    return temp[temp['index'] == 1]

managers_with_one_lot = find_objects_with_only_one_record('manager_id')
buildings_with_one_lot = find_objects_with_only_one_record('building_id')
addresses_with_one_lot = find_objects_with_only_one_record('display_address')

train_test['manager_id_unique'] = 0
train_test['building_id_unique'] = 0
train_test['display_address_unique'] = 0

train_test.loc[train_test['manager_id'].isin(managers_with_one_lot['manager_id'].ravel()), 'manager_id_unique'] = 1

train_test.loc[train_test['building_id'].isin(buildings_with_one_lot['building_id'].ravel()), 'building_id_unique'] = 1
train_test['building_id_unique'] =  train_test[['building_id','building_id_unique']].apply(lambda x : 1 if x[0] == '0' else x[1], axis=1)
train_test.loc[train_test['display_address'].isin(addresses_with_one_lot['display_address'].ravel()),'display_address_unique'] = 1
categorical = ['building_id_unique', 'manager_id_unique','display_address_unique']

features_to_use.extend(['building_id_unique', 'manager_id_unique','display_address_unique'])

In [11]:
# replacing ids by counts

categorical = ['building_id', 'manager_id','display_address']
                   
for f in categorical:
    encoder = LabelEncoder()
    encoder.fit(list(train_test[f])) 
    train_test[f] = encoder.transform(train_test[f].ravel())
    
temp = train_test.manager_id.value_counts()
train_test['manager_id_count'] = train_test.manager_id.apply(lambda x: temp[x])

temp = train_test.building_id.value_counts()
train_test['building_id_count'] = train_test.building_id.apply(lambda x: temp[x])

temp = train_test.display_address.value_counts()
train_test['display_address_count'] = train_test.display_address.apply(lambda x: temp[x])


train_test.building_id_count = train_test[['building_id_count', 'building_id_unique']].apply(lambda x : 1 if x[1] == 1 else x[0], axis=1)
train_test.manager_id_count = train_test[['manager_id_count', 'manager_id_unique']].apply(lambda x : 1 if x[1] == 1 else x[0], axis=1)
train_test.display_address_count = train_test[['display_address_count', 'display_address_unique']].apply(lambda x : 1 if x[1] == 1 else x[0], axis=1)

features_to_use.extend(['manager_id_count','building_id_count','display_address_count'])

In [12]:
features_to_use.extend(['latitude','longitude','listing_id'])

In [13]:
#median price grouped by managers and neighbourhoods and bedrooms

train_test['median_price_groupby_manager'] = train_test.groupby(['manager_id','bedrooms'])['price'].transform('median') 
features_to_use.append('median_price_groupby_manager')

train_test['median_price_groupby_neighbourhood'] = train_test.groupby(['pred_gm','bedrooms'])['price'].transform('median') - train_test.price
features_to_use.append('median_price_groupby_neighbourhood')

In [14]:
# extract  features like month, day, hour from date columns and categorizing them, year not taken as every obs is from 2016
train_test["created"] = pd.to_datetime(train_test["created"])

train_test['month'] =train_test["created"].dt.month
train_test['day_of_week'] =train_test["created"].dt.weekday
train_test['day_of_month'] =train_test["created"].dt.day
train_test['hour_of_day'] =train_test["created"].dt.hour

features_to_use.extend(['month','day_of_week','day_of_month','hour_of_day'])

In [15]:
import random

train_df = train_test.ix[train_df.index]
test_df = train_test.ix[test_df.index]

index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c



a=[]
b=[]
c=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]
for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c

features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')

In [16]:
train_test = pd.concat([train_df,test_df])


In [17]:
# features_to_model = forest_features
features_to_model = features_to_use

In [18]:
target_num_map = {'high':0, 'medium':1, 'low':2}
y_train = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

new_feature = pd.read_csv('listing_image_time.csv')

new_feature['listing_id'] = new_feature['Listing_Id']

new_feature.ix[train_df.index]

all_data =  pd.merge(train_test, new_feature, on='listing_id', how='left')

features_to_use.append('time_stamp')

features_to_model = features_to_use

all_data.set_index(train_test.index, inplace = True)

X_train = all_data.ix[train_df.index][features_to_model]
X_test = all_data.ix[test_df.index][features_to_model]



In [19]:
X_train.head()

Unnamed: 0,bathrooms,bedrooms,price,num_priceXroom,building_id_mean_medium,building_id_mean_high,manager_id_mean_medium,manager_id_mean_high,num_photos,building_id_present,...,median_price_groupby_manager,median_price_groupby_neighbourhood,month,day_of_week,day_of_month,hour_of_day,manager_level_low,manager_level_medium,manager_level_high,time_stamp
10,1.5,3,3000,631.578947,0.217575,0.0743694,0.283585,9.190504000000001e-29,5,1,...,2800.0,300.0,6,4,24,7,0.753425,0.246575,0.0,1478091590
10000,1.0,2,5465,1561.428571,0.110642,1.761506e-07,0.012447,2.088626e-34,11,1,...,5445.0,-1290.0,6,6,12,12,0.985915,0.014085,0.0,1478129766
100004,1.0,1,2850,1140.0,0.467347,0.09291841,0.357575,0.03169581,8,1,...,2862.5,775.0,4,6,17,3,0.596154,0.375,0.028846,1478714436
100007,1.0,1,3275,1310.0,0.094998,0.01187783,0.152256,0.08444808,3,1,...,3195.0,-200.0,4,0,18,2,0.804054,0.135135,0.060811,1478714444
100013,1.0,4,3350,609.090909,0.058909,0.02270975,0.000564,0.000192214,3,0,...,5000.0,440.0,4,3,28,1,1.0,0.0,0.0,1478714464


In [343]:
features_to_use

['bathrooms',
 'bedrooms',
 'price',
 'num_priceXroom',
 'building_id_mean_medium',
 'building_id_mean_high',
 'manager_id_mean_medium',
 'manager_id_mean_high',
 'num_photos',
 'building_id_present',
 'bad_listing',
 u'exclusive_feat',
 u'outdoor_feat',
 u'furnished_feat',
 u'walk_in_closet_feat',
 u'ft_doorman_feat',
 u'terrace_feat',
 u'newly_renovated_feat',
 u'duplex_feat',
 u'dryer_in_unit_feat',
 u'green_building_feat',
 u'private_backyard_feat',
 u'multi_level_feat',
 u'garden_feat',
 u'rise_feat',
 u'park_view_feat',
 u'fireplace_feat',
 u'eat_in_kitchen_feat',
 u'dish_washer_feat',
 u'granite_counter_tops_feat',
 u'stainless_steel_feat',
 u'site_laundry_feat',
 u'garage_feat',
 u'common_feat',
 u'fitness_feat',
 u'dining_room_feat',
 u'view_feat',
 u'hardwood_feat',
 u'fitness_center_feat',
 u'space_feat',
 u'microwave_feat',
 u'valet_feat',
 u'pets_feat',
 u'short_term_allowed_feat',
 u'new_construction_feat',
 u'_photos_feat',
 u'reduced_fee_feat',
 u'common_parking_feat',


In [19]:
# XGBoost
X_dev, X_val, y_dev, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.2, random_state=3 )

In [77]:
param = {}
param['objective'] = 'multi:softprob'
param['eta'] = .02
param['gamma'] = 0.8
param['max_depth'] = 8
param['silent'] = 1
param['num_class'] = 3
param['eval_metric'] = "mlogloss"
param['reg_alpha'] = 2.0
param['min_child_weight'] = 15
param['subsample'] = 0.99
param['colsample_bytree'] = 0.45
param['seed'] = 3
param['nthread'] = 4
param['n_jobs'] = -1
num_rounds = 5000

In [78]:
plst = list(param.items())

xgtrain = xgb.DMatrix(X_dev, label=y_dev)
xgtest = xgb.DMatrix(X_val, label=y_val)
watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50,verbose_eval = 100)
#0.514 with eta 0.02 max_depth 4 min_child 1 lb 0.53 rounds 3700
#current benchmark with eta 0.5 = 0.5152
# test-mlogloss:0.513764
# train-mlogloss:0.422563	test-mlogloss:0.5131
# train-mlogloss:0.402687	test-mlogloss:0.512209
# train-mlogloss:0.365848	test-mlogloss:0.511613
# train-mlogloss:0.360063	test-mlogloss:0.511188
# train-mlogloss:0.315602	test-mlogloss:0.51029

[0]	train-mlogloss:1.08457	test-mlogloss:1.08487
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[100]	train-mlogloss:0.586085	test-mlogloss:0.614344
[200]	train-mlogloss:0.502683	test-mlogloss:0.551517
[300]	train-mlogloss:0.466779	test-mlogloss:0.533091
[400]	train-mlogloss:0.441403	test-mlogloss:0.525008
[500]	train-mlogloss:0.421592	test-mlogloss:0.520142
[600]	train-mlogloss:0.403526	test-mlogloss:0.517117
[700]	train-mlogloss:0.388421	test-mlogloss:0.515364
[800]	train-mlogloss:0.374782	test-mlogloss:0.513834
[900]	train-mlogloss:0.36222	test-mlogloss:0.512846
[1000]	train-mlogloss:0.351079	test-mlogloss:0.512033
[1100]	train-mlogloss:0.340178	test-mlogloss:0.51133
[1200]	train-mlogloss:0.330718	test-mlogloss:0.510794
[1300]	train-mlogloss:0.321947	test-mlogloss:0.510422
[1400]	train-mlogloss:0.31362	test-mlogloss:0.510337
Stopping. Best iteration:
[1377]	train-mlogloss:0.315602

In [79]:
num_rounds = 1380
xgtrain = xgb.DMatrix(X_train, label=y_train)
watchlist = [ (xgtrain,'train') ]
model = xgb.train(plst, xgtrain, num_rounds, watchlist,verbose_eval = 100)

[0]	train-mlogloss:1.08468
[100]	train-mlogloss:0.587473
[200]	train-mlogloss:0.505938
[300]	train-mlogloss:0.47181
[400]	train-mlogloss:0.448499
[500]	train-mlogloss:0.429687
[600]	train-mlogloss:0.413446
[700]	train-mlogloss:0.399326
[800]	train-mlogloss:0.386669
[900]	train-mlogloss:0.375433
[1000]	train-mlogloss:0.364755
[1100]	train-mlogloss:0.354768
[1200]	train-mlogloss:0.345388
[1300]	train-mlogloss:0.336314
[1379]	train-mlogloss:0.329919


In [38]:
# xgboost submission
forest_xgb = model.predict(xgb.DMatrix(X_test))
submission = pd.DataFrame(forest_xgb, columns = ['high','medium','low']).set_index(test_df.index)
submission['listing_id'] = test_df.listing_id.values
submission.to_csv('model_9_xgb_reg2_mc15_eta0.02_1380_ss0.99.csv', index= False)

In [None]:
param = {}
param['objective'] = 'multi:softprob'
param['eta'] = .02
param['gamma'] = 0.8
param['max_depth'] = 6
param['silent'] = 1
param['num_class'] = 3
param['eval_metric'] = "mlogloss"
param['reg_alpha'] = 2
param['min_child_weight'] = 10
param['subsample'] = 0.99
param['colsample_bytree'] = 0.5
param['seed'] = 3
param['nthread'] = 4
param['n_jobs'] = -1
num_rounds = 5000

# param = {}
# param['objective'] = 'multi:softprob'
# param['eta'] = .02
# param['gamma'] = 0.8
# param['max_depth'] = 8
# param['silent'] = 1
# param['num_class'] = 3
# param['eval_metric'] = "mlogloss"
# param['reg_alpha'] = 2.0
# param['min_child_weight'] = 15
# param['subsample'] = 0.99
# param['colsample_bytree'] = 0.45
# param['seed'] = 3
# param['nthread'] = 4
# param['n_jobs'] = -1
# num_rounds = 5000

In [27]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# x_train = np.array(X_train)
# x_test = np.array(X_test)

X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_train = X_train.fillna(0)

X_test = X_test.replace([np.inf, -np.inf], np.nan)
X_test = X_test.fillna(0)

x_train = scaler.fit_transform(X_train)
x_test = scaler.transform(X_test)

print x_train.shape
print x_test.shape

(49352, 176)
(74659, 176)


In [29]:
tsne_train = np.loadtxt("stnet/tr_tsne.csv", delimiter=",")
tsne_test = np.loadtxt("stnet/test_tsne.csv", delimiter=",")

# print x_train.shape
# print x_test.shape

x_train = np.column_stack((x_train,tsne_train))
x_test = np.column_stack((x_test,tsne_test))

print x_train.shape
print x_test.shape


(49352, 186)
(74659, 186)


In [32]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=4000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = .05
    param['gamma'] = 0.0
    param['max_depth'] = 7
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['reg_alpha'] = 2.0
    param['reg_lambda'] = 1.0
    param['min_child_weight'] = 10
    param['subsample'] = 0.99
    param['colsample_bytree'] = 0.55
    param['seed'] = 3
#     param['nthread'] = 4
    param['n_jobs'] = -1
#     num_rounds = 5000

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval = 1000)
    else:
        xgtest = xgb.DMatrix(test_X)
        watchlist = [ (xgtrain,'train') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, verbose_eval = 100)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [33]:
from sklearn import model_selection
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

# x_train = np.array(X_train)
# x_test = np.array(X_test)

train_stacker=[ [0.0 for s in range(3)]  for k in range (0,(x_train.shape[0])) ]
test_stacker=[[0.0 for s in range(3)]   for k in range (0,(x_test.shape[0]))]
X = np.array(X_train)
cv_scores = []
oof_preds = []
# StratifiedKFold
# kf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=2016)
# for dev_index, val_index in kf.split(range(x_train.shape[0]),y_train):
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(x_train.shape[0])):
        dev_X, val_X = x_train[dev_index,:], x_train[val_index,:]
        dev_y, val_y = y_train[dev_index], y_train[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
#         oof_preds.append(preds)
#         predictions = bst.predict(X1cv)     
#         predictions = preds.reshape( val_X.shape[0], 3)
        print(cv_scores)
        break
#         no=0
#         for real_index in val_index:
#             for d in range (0,3):
#                 train_stacker[real_index][d]=(predictions[no][d])
#             no+=1
#         X = np.column_stack((X,train_stacker))

# [2049]	train-mlogloss:0.354939	test-mlogloss:0.515924  
# Stopping. Best iteration:
# [3543]	train-mlogloss:0.396121	test-mlogloss:0.512792


# [0.5162835560663217]
# [0.51378179879688479]
#[0.51489777736957554]
# [0.51494246542330147]

# [0.51531373368992195]

[0]	train-mlogloss:1.06307	test-mlogloss:1.06431
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[432]	train-mlogloss:0.325519	test-mlogloss:0.520164

[0.5203930115408385]


In [34]:
from sklearn import model_selection
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

# x_train = np.array(X_train)
# x_test = np.array(X_test)
train_stacker=[ [0.0 for s in range(3)]  for k in range (0,(x_train.shape[0])) ]
test_stacker=[[0.0 for s in range(3)]   for k in range (0,(x_test.shape[0]))]
X = np.array(X_train)
cv_scores = []
oof_preds = []
# StratifiedKFold
# kf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=2016)
# for dev_index, val_index in kf.split(range(x_train.shape[0]),y_train):
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(x_train.shape[0])):
        dev_X, val_X = x_train[dev_index,:], x_train[val_index,:]
        dev_y, val_y = y_train[dev_index], y_train[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
#         oof_preds.append(preds)
#         predictions = bst.predict(X1cv)     
        predictions = preds.reshape( val_X.shape[0], 3)
        print(cv_scores)
        no=0
        for real_index in val_index:
            for d in range (0,3):
                train_stacker[real_index][d]=(predictions[no][d])
            no+=1
#         X = np.column_stack((X,train_stacker))

# [2049]	train-mlogloss:0.354939	test-mlogloss:0.515924  
# Stopping. Best iteration:
# [3543]	train-mlogloss:0.396121	test-mlogloss:0.512792

# [0.51282974902487233]

[0]	train-mlogloss:1.06307	test-mlogloss:1.06431
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[432]	train-mlogloss:0.325519	test-mlogloss:0.520164

[0.5203930115408385]
[0]	train-mlogloss:1.06313	test-mlogloss:1.06427
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[539]	train-mlogloss:0.297417	test-mlogloss:0.515194

[0.5203930115408385, 0.51537463923821591]
[0]	train-mlogloss:1.06331	test-mlogloss:1.06444
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[411]	train-mlogloss:0.333228	test-mlogloss:0.521438

[0.5203930115408385, 0.51537463923821591, 0.52156811150757121]
[0]	train-mlogloss:1.06336	test-ml

In [35]:
train_stacker

np.savetxt("stnet/train_stacker_xgb_tsne.csv", train_stacker, delimiter=",", fmt='%.6f')


In [363]:
np.savetxt("stnet/train_data.csv", x_train, delimiter=",", fmt='%.6f')
np.savetxt("stnet/test_data.csv", x_test, delimiter=",", fmt='%.6f')


In [332]:
# t_file = pd.read_csv('model_xgb_6_md6_reg2_mc10_eta0.02_1850.csv')

# t_file.head()

# test_stacker = np.array(t_file)

np.savetxt("stnet/test_stacker_xgb_1.csv", test_stacker[:,0:3], delimiter=",", fmt='%.6f')


In [339]:
# X_train.head()

# train_ts = np.array(X_train)

# train_ts[:,175:176]

# test_ts = np.array(X_test)

# np.savetxt("stnet/train_nf.csv", train_ts[:,175:176], delimiter=",", fmt='%.6f')
# np.savetxt("stnet/test_nf.csv", test_ts[:,175:176], delimiter=",", fmt='%.6f')

F = np.loadtxt("stnet/train_nf.csv", delimiter=",")
F_ts = np.loadtxt("stnet/test_nf.csv", delimiter=",")

print F.shape
print F_ts.shape

(49352,)
(74659,)


In [36]:
preds, model = runXGB(X_train, y_train, X_test, num_rounds=432)
np.savetxt("stnet/test_stacker_xgb_tsne.csv",preds, delimiter=",", fmt='%.6f')

out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("model_xgb_tsne.csv", index=False)

# forest_xgb = model.predict(xgb.DMatrix(X_test))
# submission = pd.DataFrame(preds, columns = ['high','medium','low']).set_index(test_df.index)
# submission['listing_id'] = test_df.listing_id.values
# submission.to_csv('model_10_xgb_reg2_mc10_eta0.02_2050.csv', index= False)

[0]	train-mlogloss:1.06301
[100]	train-mlogloss:0.488474
[200]	train-mlogloss:0.435992
[300]	train-mlogloss:0.400098
[400]	train-mlogloss:0.371235
[431]	train-mlogloss:0.363607


In [244]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# x_train = np.array(X_train)
# x_test = np.array(X_test)

X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_train = X_train.fillna(0)

X_test = X_test.replace([np.inf, -np.inf], np.nan)
X_test = X_test.fillna(0)

x_train = scaler.fit_transform(X_train)
x_test = scaler.transform(X_test)

print x_train.shape
print x_test.shape

(49352, 175)
(74659, 175)


In [126]:
# X_ts = np.array(X_test)

X_ts = np.column_stack((X_ts,F_ts[:,176:179]))

# X_ts.shape

X = np.column_stack((np.array(y_train),X,train_stacker))

X_ts = np.column_stack((np.array(X_test['listing_id']),X_ts))
                       
print X.shape
print X_ts.shape                       
                

(49352, 179)
(74659, 179)


In [128]:
train_file="stnet/train6.csv"
test_file="stnet/test6.csv"


print ("exporting files")
np.savetxt(train_file, X, delimiter=",", fmt='%.5f')
np.savetxt(test_file, X_ts, delimiter=",", fmt='%.5f')    

exporting files


In [191]:
# java -Xmx4048m -jar StackNet.jar train train_file=train5.csv test_file=test5.csv params=paramssimplev1.txt pred_fil
# e=sigma_stack_pred_8.csv test_target=true verbose=true Threads=6 stackdata=false folds=5 seed=1 metric=logloss
x_train

array([[  1.50000000e+00,   3.00000000e+00,   3.00000000e+03, ...,
          7.71428571e-01,   2.28571429e-01,   0.00000000e+00],
       [  1.00000000e+00,   2.00000000e+00,   5.46500000e+03, ...,
          9.82758621e-01,   1.72413793e-02,   0.00000000e+00],
       [  1.00000000e+00,   1.00000000e+00,   2.85000000e+03, ...,
          5.92592593e-01,   3.42592593e-01,   6.48148148e-02],
       ..., 
       [  1.00000000e+00,   1.00000000e+00,   2.59500000e+03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.00000000e+00,   0.00000000e+00,   3.35000000e+03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.00000000e+00,   2.00000000e+00,   2.20000000e+03, ...,
          9.33333333e-01,   6.66666667e-02,   0.00000000e+00]])

In [158]:
# F_ts = np.loadtxt("stnet/test5.csv", delimiter=",")
# F = np.loadtxt("stacknet/train_stacknet.csv", delimiter=",")
# F_test = np.loadtxt("stacknet/test_stacknet.csv", delimiter=",")

# load results of one (or more) of your models - here also 
# out-of-fold predictions for the train set are needed
# pred_test = np.loadtxt("meta_predictions/preds_test.txt")
# pred_train = np.loadtxt("meta_predictions/preds_train.txt")
# F_ts[:,176:179]
# # # combine them with the StackNet input
# F_test = np.column_stack((F_test, F_ts[:,176:179]))
# F = np.column_stack((F, train_stacker))
# print F.shape
# print F_test.shape
# save into new files to be fed into StackNet

# np.savetxt("stacknet/train_stacknet_with_meta.csv", F, delimiter=",", fmt='%.6f')
# np.savetxt("stacknet/test_stacknet_with_meta.csv", F_test, delimiter=",", fmt='%.6f')

# F = np.loadtxt("stacknet/train_stacknet_with_meta.csv", delimiter=",")
# F_ts = np.loadtxt("stacknet/test_stacknet_with_meta.csv", delimiter=",")

# G = scaler.fit_transform(X_train)
# G_ts = scaler.transform(X_test)

# F = np.column_stack((F, G))
# F_ts = np.column_stack((F_ts,G_ts))

# print F.shape
# print F_ts.shape

F = np.loadtxt("stnet/train_with_add_data.csv", delimiter=",")
F_ts = np.loadtxt("stnet/test_with_add_data.csv", delimiter=",")

F = np.column_stack((F, train_stacker))
F_ts = np.column_stack((F_ts, preds_better[:,0:3]))

np.savetxt("stnet/train_with_add_data_0.529.csv", F, delimiter=",", fmt='%.6f')
np.savetxt("stnet/test_with_add_data_0.529.csv", F_ts, delimiter=",", fmt='%.6f')


In [160]:
# np.savetxt("stnet/train_stacker_0.5291.csv", train_stacker, delimiter=",", fmt='%.6f')
# preds_new = pd.read_csv('model_15_xgb_md8_reg2_mc10_eta0.02_1460.csv')

# preds_better = np.array(preds_new)

np.savetxt("stnet/test_stacker_0.5291.csv", preds_better[:,0:3], delimiter=",", fmt='%.6f')


In [43]:
X = np.loadtxt("stnet/train_with_add_data_0.529.csv", delimiter=",")
X_ts = np.loadtxt("stnet/test_with_add_data_0.529.csv", delimiter=",")

X_nn = np.loadtxt("stnet/train_stacker_reg_nn.csv", delimiter=",")
X_ts_nn = np.loadtxt("stnet/test_stacker_reg_nn.csv", delimiter=",")

X_rf = np.loadtxt("stnet/train_stacker_rf.csv",  delimiter=",")
X_ts_rf = np.loadtxt("stnet/test_stacker_rf.csv", delimiter=",")

X_lgr = np.loadtxt("stnet/train_stacker_lgr.csv", delimiter=",")
X_ts_lgr = np.loadtxt("stnet/test_stacker_lgr.csv", delimiter=",")

F = np.column_stack((X, X_nn, X_rf, X_lgr))
F_ts = np.column_stack((X_ts, X_ts_nn, X_ts_rf, X_ts_lgr))

np.savetxt("stnet/train_meta_xgb3_rf_nn_lgr.csv", F, delimiter=",", fmt='%.6f')
np.savetxt("stnet/test_meta_xgb3_rf_nn_lgr.csv", F_ts, delimiter=",", fmt='%.6f')


In [75]:
G = np.loadtxt("stnet/train_meta_xgb3_rf_nn_lgr.csv", delimiter=",")
G_ts = np.loadtxt("stnet/test_meta_xgb3_rf_nn_lgr.csv", delimiter=",")

lgrtf = np.loadtxt("stnet/train_stacker_lgr_tf.csv", delimiter=",")
lgrtf_ts = np.loadtxt("stnet/test_stacker_lgr_tf.csv", delimiter=",")

m = np.column_stack((G,lgrtf))
m_ts = np.column_stack((G_ts,lgrtf_ts))

np.savetxt("stnet/train_add_3xgb_nn_rf_2lr.csv", m, delimiter=",", fmt='%.6f')
np.savetxt("stnet/test_add_3xgb_nn_rf_2lr.csv", m_ts, delimiter=",", fmt='%.6f')


In [241]:
G = np.loadtxt("new_st/train_add_3xgb_nn_rf_2lr_7.csv", delimiter=",")
G_ts = np.loadtxt("new_st/test_add_3xgb_nn_rf_2lr_7.csv", delimiter=",")

# gbm = np.loadtxt("stnet/train_stacker_gbm.csv", delimiter=",")
# gbm_ts = np.loadtxt("stnet/test_stacker_gbm.csv", delimiter=",")

# knn = np.loadtxt("stnet/train_stacker_knn.csv", delimiter=",")
# knn_ts = np.loadtxt("stnet/test_stacker_knn.csv", delimiter=",")

# sgd = np.loadtxt("stnet/train_stacker_sgd.csv", delimiter=",")
# sgd_ts = np.loadtxt("stnet/test_stacker_sgd.csv", delimiter=",")

# sgdtf = np.loadtxt("stnet/train_stacker_sgd_tf.csv", delimiter=",")
# sgdtf_ts = np.loadtxt("stnet/test_stacker_sgd_tf.csv", delimiter=",")

# lda = np.loadtxt("stnet/train_stacker_lda.csv", delimiter=",")
# lda_ts = np.loadtxt("stnet/test_stacker_lda.csv", delimiter=",")

# knn_meta = np.loadtxt("stnet/train_stacker_knn_meta_1k.csv", delimiter=",")
# knn_meta_ts = np.loadtxt("stnet/test_stacker_knn_meta_1k.csv", delimiter=",")

# et = np.loadtxt("stnet/train_stacker_et_meta_md10.csv", delimiter=",")
# et_ts = np.loadtxt("stnet/test_stacker_et_meta_md10.csv", delimiter=",")

knn_meta1 = np.loadtxt("stnet/train_stacker_knn_meta_0.5k.csv", delimiter=",")
knn_meta1_ts = np.loadtxt("stnet/test_stacker_knn_meta_0.5k.csv", delimiter=",")

knn_meta2 = np.loadtxt("stnet/train_stacker_knn_meta_0.25k.csv", delimiter=",")
knn_meta2_ts = np.loadtxt("stnet/test_stacker_knn_meta_0.25k.csv", delimiter=",")

et1 = np.loadtxt("stnet/train_stacker_et_meta_md5.csv", delimiter=",")
et1_ts = np.loadtxt("stnet/test_stacker_et_meta_md5.csv", delimiter=",")

et2 = np.loadtxt("stnet/train_stacker_et_meta_md3.csv", delimiter=",")
et2_ts = np.loadtxt("stnet/test_stacker_et_meta_md3.csv", delimiter=",")


m = np.column_stack((G,knn_meta1,et1,knn_meta2,et2))
m_ts = np.column_stack((G_ts,knn_meta1_ts,et1_ts,knn_meta2_ts,et2_ts))

print m.shape
print m_ts.shape

# np.savetxt("stnet/train_add_3xgb_nn_rf_2lr_5.csv", m, delimiter=",", fmt='%.6f')
# np.savetxt("stnet/test_add_3xgb_nn_rf_2lr_5.csv", m_ts, delimiter=",", fmt='%.6f')

np.savetxt("new_st/train_add_3xgb_nn_rf_2lr_11.csv", m, delimiter=",", fmt='%.6f')
np.savetxt("new_st/test_add_3xgb_nn_rf_2lr_11.csv", m_ts, delimiter=",", fmt='%.6f')


(49352, 462)
(74659, 462)


In [196]:
np.savetxt("stacknet/train_add_3xgb_nn_rf_2lr_5.csv", m, delimiter=",", fmt='%.6f')
np.savetxt("stacknet/test_add_3xgb_nn_rf_2lr_5.csv", m_ts, delimiter=",", fmt='%.6f')


In [197]:
print m.shape
print m_ts.shape
# java -Xmx6048m -jar StackNet.jar train train_file=train_nf1.csv test_file=test_nf1.csv params=paramssimplev1.txt pred_file=sigma_stack_pred_nf1.csv test_target=true verbose=true Threads=9 stackdata=true folds=5 seed=1 metric=logloss

(49352, 444)
(74659, 444)


In [216]:
# pd.DataFrame(m[:,414:445]).head()
# pd.DataFrame(m[:,233:239]).head()

# df_ts = np.column_stack((m_ts[:,233:239],m_ts[:,414:445]))

# print df.shape
# print df_ts.shape

np.savetxt("stnet/meta_dftrain.csv", df, delimiter=",", fmt='%.6f')
np.savetxt("stnet/meta_dftest.csv", df_ts, delimiter=",", fmt='%.6f')


# d_train = np.loadtxt("stnet/meta_dftrain.csv", delimiter=",")
# d_test = np.loadtxt("stnet/meta_dftest.csv", delimiter=",")

# print d_train.shape
# print d_test.shape

In [73]:
# G = np.loadtxt("stacknet/train_stacknet_with_meta.csv", delimiter=",")
# G_ts = np.loadtxt("stacknet/test_stacknet_with_meta.csv", delimiter=",")

# lgrtf = np.loadtxt("stnet/train_stacker_lgr_tf.csv", delimiter=",")
# lgrtf_ts = np.loadtxt("stnet/test_stacker_lgr_tf.csv", delimiter=",")


# l = F[:,414:427]
# p = G[:,233:239]
# m = np.column_stack((p,l,lgrtf))

# l_ts = F_ts[:,414:427]
# p_ts = G_ts[:,233:239]
# m_ts = np.column_stack((p_ts,l_ts,lgrtf_ts))

# print m.shape
# print m_ts.shape

# np.savetxt("stnet/pred_train_3xgb_nn_rf_2lr.csv", m, delimiter=",", fmt='%.6f')
# np.savetxt("stnet/pred_test_3xgb_nn_rf_2lr.csv", m_ts, delimiter=",", fmt='%.6f')
# X = np.column_stack((np.array(y_train),X,train_stacker))
# X_ts = np.column_stack((np.array(X_test['listing_id']),X_ts))

x = np.column_stack((np.array(y_train),x,m))
x_ts = np.column_stack((np.array(X_test['listing_id']),x_ts,m_ts))

print x.shape
print x_ts.shape

np.savetxt("stacknet/train_norm_3xgb_nn_rf_2lr.csv", x, delimiter=",", fmt='%.6f')
np.savetxt("stacknet/test_norm_3xgb_nn_rf_2lr.csv", x_ts, delimiter=",", fmt='%.6f')

# p = pd.DataFrame(G[:,233:239])
# p.head()
# l = pd.DataFrame(F[:,414:427])
# l.head()

(49352, 200)
(74659, 200)


In [71]:
from scipy import sparse

sx = sparse.csr_matrix(x)   # Here's the initialization of the sparse matrix.
sx_test = sparse.csr_matrix(x_ts)

x = sx.toarray()
x_ts = sx_test.toarray()

np.savetxt("stacknet/train_norm_3xgb_nn_rf_2lr.csv", x, delimiter=",", fmt='%.6f')
np.savetxt("stacknet/test_norm_3xgb_nn_rf_2lr.csv", x_ts, delimiter=",", fmt='%.6f')


In [44]:
print F.shape
print F_ts.shape

java -Xmx6048m -jar StackNet.jar train train_file=train_add_3xgb_nn_rf_2lr.csv test_file=test_add_3xgb_nn_rf_2lr.csv params=paramssimplev1.txt pred_file=sigma_stack_pred_meta7_add.csv test_target=true verbose=true Threads=9 stackdata=true folds=5 seed=1 metric=logloss

(49352, 426)
(74659, 426)


In [267]:
from sklearn.preprocessing import PolynomialFeatures    
from sklearn.preprocessing import FunctionTransformer

transformer = FunctionTransformer(np.log1p)    
square = FunctionTransformer(np.square)    

#poly = PolynomialFeatures(degree=2)
x_train[np.isnan(x_train)] = 0
x_test[np.isnan(x_test)] = 0

x_train[np.isinf(x_train)] = 0
x_test[np.isinf(x_test)] = 0

# x_train = np.column_stack((x_train,square.transform(x_train),transformer.transform(x_train)))
# x_test = np.column_stack((x_test,square.transform(x_test),transformer.transform(x_test)))

x_train = np.column_stack((square.transform(x_train),transformer.transform(x_train)))
x_test = np.column_stack((square.transform(x_test),transformer.transform(x_test)))

x_train[np.isnan(x_train)] = 0
x_test[np.isnan(x_test)] = 0

x_train[np.isinf(x_train)] = 0
x_test[np.isinf(x_test)] = 0

for i in range(x_train.shape[1]):
    x_test[:, i] = (x_test[:, i] - np.mean(x_train[:, i]))/np.std(x_train[:, i])
    x_train[:, i] = (x_train[:, i] - np.mean(x_train[:, i]))/np.std(x_train[:, i])

x_train[np.isnan(x_train)] = 0
x_test[np.isnan(x_test)] = 0

x_train[np.isinf(x_train)] = 0
x_test[np.isinf(x_test)] = 0

print x_train.shape
print x_test.shape




(49352, 350)
(74659, 350)


In [266]:
x_train = np.array(X_train)
x_test = np.array(X_test)

In [255]:
from sklearn.ensemble import GradientBoostingClassifier,ExtraTreesClassifier 
from sklearn.linear_model import Perceptron,SGDClassifier,RidgeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis,LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC,SVC
from sklearn.gaussian_process import GaussianProcessClassifier

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split
from pyfm import pylibfm

from sklearn.datasets import make_classification

# X, y = make_classification(n_samples=1000,n_features=100, n_clusters_per_class=1)
# data = [ {v: k for k, v in dict(zip(i, range(len(i)))).items()}  for i in X]

# X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.1, random_state=42)

v = DictVectorizer()
x_train = v.fit_transform(x_train)
x_test = v.transform(x_test)

forest = pylibfm.FM(num_factors=50, num_iter=10, verbose=True, task="classification", initial_learning_rate=0.0001, 
                learning_rate_schedule="optimal")

forest.fit(x_train,y_train)

# forest = GaussianProcessClassifier(kernel=None, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=0, 
#                                     max_iter_predict=10, warm_start=False, copy_X_train=True, random_state=None, 
#                                    multi_class='one_vs_rest', n_jobs=1)


# cv_scores = []
# oof_preds = []

# # StratifiedKFold
# # kf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=2016)
# # for dev_index, val_index in kf.split(range(x_train.shape[0]),y_train):

# kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
# for dev_index, val_index in kf.split(range(x_train.shape[0])):
#         dev_X, val_X = x_train[dev_index,:], x_train[val_index,:]
#         dev_y, val_y = y_train[dev_index], y_train[val_index]
#         forest.fit(dev_X, dev_y)
#         forest_predict = forest.predict_proba(val_X)
#         cv_scores.append(log_loss(val_y, forest_predict))
# #         oof_preds.append(preds)
# #         predictions = bst.predict(X1cv)     
# #         predictions = preds.reshape( val_X.shape[0], 3)
#         print(cv_scores)
#         break



AttributeError: 'numpy.ndarray' object has no attribute 'iteritems'

In [256]:
# forest = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=10,
#                                 max_features= 0.2, min_samples_split = 20 ,bootstrap=False,n_jobs=-1, 
#                                 oob_score = False, random_state=3, verbose=1)
x_train = np.array(X_train)
x_test = np.array(X_test)

for i in range(x_train.shape[1]):
    x_test[:, i] = (x_test[:, i] - np.mean(x_train[:, i]))/np.std(x_train[:, i])
    x_train[:, i] = (x_train[:, i] - np.mean(x_train[:, i]))/np.std(x_train[:, i])

from sklearn.preprocessing import PolynomialFeatures    
from sklearn.preprocessing import FunctionTransformer

transformer = FunctionTransformer(np.log1p)    

poly = PolynomialFeatures(degree=2)

x_train = np.column_stack((x_train,poly.fit_transform(x_train),transformer.transform(x_train)))
x_test = np.column_stack((x_test,poly.fit_transform(x_test),transformer.transform(x_test)))

x_train[np.isnan(x_train)] = 0
x_test[np.isnan(x_test)] = 0

x_train[np.isinf(x_train)] = 0
x_test[np.isinf(x_test)] = 0

print x_train.shape
print x_test.shape
# from sklearn.ensemble import ExtraTreesClassifier,AdaBoostClassifier,IsolationForest
# from sklearn.linear_model import ARDRegression,LogisticRegression,BayesianRidge
# from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# from sklearn.metrics import log_loss

# # BayesianRidge(n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, 
# #               lambda_1=1e-06, lambda_2=1e-06, compute_score=False, fit_intercept=True, 
# #               normalize=False, copy_X=True, verbose=False)

# forest = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.02, fit_intercept=True, 
#                             intercept_scaling=1, class_weight=None, random_state=None, solver='newton-cg', 
#                             max_iter=100, multi_class='multinomial', verbose=1, warm_start=False, n_jobs=1)

# cv_scores = []
# oof_preds = []
# # StratifiedKFold
# # kf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=2016)
# # for dev_index, val_index in kf.split(range(x_train.shape[0]),y_train):
# kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
# for dev_index, val_index in kf.split(range(x_train.shape[0])):
#         dev_X, val_X = x_train[dev_index,:], x_train[val_index,:]
#         dev_y, val_y = y_train[dev_index], y_train[val_index]
#         forest.fit(dev_X, dev_y)
#         forest_predict = forest.predict_proba(val_X)
#         cv_scores.append(log_loss(val_y, forest_predict))
# #         oof_preds.append(preds)
# #         predictions = bst.predict(X1cv)     
# #         predictions = preds.reshape( val_X.shape[0], 3)
#         print(cv_scores)
#         break


(49352, 15926)
(74659, 15926)


(49352, 16466)

In [163]:
#random forest

forest_predict = forest.predict_proba(X_val)
log_loss(y_val,forest_predict)
#benchmark 0.541

NameError: name 'X_dev' is not defined

In [93]:
#forest submission
forest_predict = forest.fit(X_train, y_train).predict_proba(X_test)
submission = pd.DataFrame(forest_predict, columns = ['high','medium','low']).set_index(test_df.index)
submission['listing_id'] = test_df.listing_id.values
submission.to_csv('model_1_forest.csv', index= False)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   14.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:  1.5min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    3.4s finished


In [40]:
st = pd.read_csv('stnet/test5.csv')

st.head()

Unnamed: 0,7142618.00000,1.00000,1.00000.1,2950.00000,1180.00000,0.32093,0.06863,0.33255,0.20816,8.00000,...,6.00000,5.00000,11.00000,5.00000.1,0.45833,0.33333,0.20833,0.13348,0.45643,0.41010
0,7210040.0,1.0,2.0,2850.0,814.28571,0.06051,0.02353,0.00411,0.0014,3.0,...,6.0,4.0,24.0,6.0,1.0,0.0,0.0,0.00887,0.02833,0.9628
1,7103890.0,1.0,1.0,3758.0,1503.2,0.27927,0.06854,0.24077,0.07628,6.0,...,6.0,4.0,3.0,4.0,0.0,1.0,0.0,0.05618,0.46727,0.47655
2,7143442.0,1.0,2.0,3300.0,942.85714,0.22797,0.07753,0.39354,0.35979,6.0,...,6.0,5.0,11.0,6.0,0.2459,0.39344,0.36066,0.15609,0.52356,0.32035
3,6860601.0,2.0,2.0,4900.0,1225.0,0.34767,0.05634,0.32074,0.0831,7.0,...,4.0,1.0,12.0,5.0,0.59722,0.31944,0.08333,0.07302,0.25761,0.66937
4,6840081.0,3.0,3.0,9000.0,1800.0,0.11321,0.03883,0.01119,0.0,8.0,...,4.0,3.0,7.0,6.0,0.98876,0.01124,0.0,0.00024,0.00796,0.9918


In [67]:
# test = pd.read_csv('stacknet/train_stacknet.csv')
X_train.head()

Unnamed: 0,bathrooms,bedrooms,price,num_priceXroom,building_id_mean_medium,building_id_mean_high,manager_id_mean_medium,manager_id_mean_high,num_photos,building_id_present,...,listing_id,median_price_groupby_manager,median_price_groupby_neighbourhood,month,day_of_week,day_of_month,hour_of_day,manager_level_low,manager_level_medium,manager_level_high
10,1.5,3,3000,631.578947,0.217575,0.0743694,0.283585,9.190504000000001e-29,5,1,...,7211212,2800.0,300.0,6,4,24,7,0.8,0.2,0.0
10000,1.0,2,5465,1561.428571,0.110642,1.761506e-07,0.012447,2.088626e-34,11,1,...,7150865,5445.0,-1290.0,6,6,12,12,0.985507,0.014493,0.0
100004,1.0,1,2850,1140.0,0.467347,0.09291841,0.357575,0.03169581,8,1,...,6887163,2862.5,775.0,4,6,17,3,0.560748,0.383178,0.056075
100007,1.0,1,3275,1310.0,0.094998,0.01187783,0.152256,0.08444808,3,1,...,6888711,3195.0,-200.0,4,0,18,2,0.777778,0.143791,0.078431
100013,1.0,4,3350,609.090909,0.058909,0.02270975,0.000564,0.000192214,3,0,...,6934781,5000.0,440.0,4,3,28,1,1.0,0.0,0.0


In [80]:
pred_tr = model.predict(xgb.DMatrix(X_train))
pred_ts = model.predict(xgb.DMatrix(X_test))

In [68]:
# new_df = X_train.copy()

# new_df['target'] = y_train

# new_df.head()

cols = list(new_df.columns.values) #Make a list of all of the columns in the df
cols.pop(cols.index('target')) #Remove b from list
# cols.pop(cols.index('x')) #Remove x from list
new_df = new_df[['target']+cols]
new_df.head()

Unnamed: 0,target,bathrooms,bedrooms,price,num_priceXroom,building_id_mean_medium,building_id_mean_high,manager_id_mean_medium,manager_id_mean_high,num_photos,...,listing_id,median_price_groupby_manager,median_price_groupby_neighbourhood,month,day_of_week,day_of_month,hour_of_day,manager_level_low,manager_level_medium,manager_level_high
10,1,1.5,3,3000,631.578947,0.217575,0.0743694,0.283585,9.190504000000001e-29,5,...,7211212,2800.0,300.0,6,4,24,7,0.8,0.2,0.0
10000,2,1.0,2,5465,1561.428571,0.110642,1.761506e-07,0.012447,2.088626e-34,11,...,7150865,5445.0,-1290.0,6,6,12,12,0.985507,0.014493,0.0
100004,0,1.0,1,2850,1140.0,0.467347,0.09291841,0.357575,0.03169581,8,...,6887163,2862.5,775.0,4,6,17,3,0.560748,0.383178,0.056075
100007,2,1.0,1,3275,1310.0,0.094998,0.01187783,0.152256,0.08444808,3,...,6888711,3195.0,-200.0,4,0,18,2,0.777778,0.143791,0.078431
100013,2,1.0,4,3350,609.090909,0.058909,0.02270975,0.000564,0.000192214,3,...,6934781,5000.0,440.0,4,3,28,1,1.0,0.0,0.0


In [71]:
ntest_df = X_test.copy()

ntest_df['id'] = test_df['listing_id']

ntest_df.head()

cols = list(ntest_df.columns.values) #Make a list of all of the columns in the df
cols.pop(cols.index('id')) #Remove b from list
# cols.pop(cols.index('x')) #Remove x from list
ntest_df = ntest_df[['id']+cols]
ntest_df.head()

Unnamed: 0,id,bathrooms,bedrooms,price,num_priceXroom,building_id_mean_medium,building_id_mean_high,manager_id_mean_medium,manager_id_mean_high,num_photos,...,listing_id,median_price_groupby_manager,median_price_groupby_neighbourhood,month,day_of_week,day_of_month,hour_of_day,manager_level_low,manager_level_medium,manager_level_high
0,7142618,1.0,1,2950,1180.0,0.320934,0.068633,0.332546,0.208158,8,...,7142618,2650.0,400.0,6,5,11,5,0.458333,0.333333,0.208333
1,7210040,1.0,2,2850,814.285714,0.060514,0.023531,0.004108,0.001402,3,...,7210040,2850.0,1100.0,6,4,24,6,1.0,0.0,0.0
100,7103890,1.0,1,3758,1503.2,0.279269,0.068537,0.24077,0.076282,6,...,7103890,4129.0,292.0,6,4,3,4,0.0,1.0,0.0
1000,7143442,1.0,2,3300,942.857143,0.227968,0.077526,0.393544,0.35979,6,...,7143442,3200.0,131.5,6,5,11,6,0.245902,0.393443,0.360656
100000,6860601,2.0,2,4900,1225.0,0.347673,0.05634,0.320742,0.083104,7,...,6860601,4200.0,-1305.0,4,1,12,5,0.597222,0.319444,0.083333


In [91]:
# new_df.to_csv('stnet/train.csv')
# ntest_df.to_csv('stnet/test.csv')
from scipy import sparse

train_file="stnet/train2.csv"
test_file="stnet/test2.csv"

# train_X = X_train.tocsr() 
# test_X = X_test.tocsr() 

train = np.array(new_df)
test = np.array(ntest_df) 

# X = np.column_stack((train,pred_tr))
#         # stack id to test
# X_ts = np.column_stack((test,pred_ts))        


print ("exporting files")
np.savetxt(train_file, train, delimiter=",", fmt='%.5f')
np.savetxt(test_file, test, delimiter=",", fmt='%.5f')        

# scipy.sparse.csr_matrix(df.values)
# You might need to take the transpose first, like df.values.T

exporting files


In [90]:
X.shape

(49352, 179)

In [None]:
print ("merging columns")   
        #stack xgboost predictions
X=np.column_stack((X,train_stacker))
        # stack id to test
X_test=np.column_stack((X_test,test_stacker))        
        
        # stack target to train
X=np.column_stack((y,X))
        # stack id to test
X_test=np.column_stack((ids,X_test))
        
        #export to txt files (, del.)


In [102]:
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import KFold

__author__ = 'dpace'


class OOFCallback:
    def  __init__(self, oof_preds_dict, maximize=True):
        """
        :param dict oof_preds_dict: Should be an empty dict which can later be
            retrieved.
        :param bool maximize: If True, higher metric scores treated as better.
        """
        self.best_eval_metric = None
        self.oof_preds_dict = oof_preds_dict
        self.maximize=maximize

    def __call__(self, cbenv):
        current_val_score = cbenv.evaluation_result_list[1][1]
        if self.best_eval_metric is None:
            self.best_eval_metric = current_val_score
        if self.maximize:
            if current_val_score >= self.best_eval_metric:
                self.best_eval_metric = current_val_score
                self._compute_oof_preds(cbenv.cvfolds)
        elif current_val_score <= self.best_eval_metric:
                self.best_eval_metric = current_val_score
                self._compute_oof_preds(cbenv.cvfolds)

    def _compute_oof_preds(self, cvfolds):
        for i, fold in enumerate(cvfolds):
            self.oof_preds_dict[i] = fold.bst.predict(fold.dtest)

    

param = {}
param['objective'] = 'multi:softprob'
param['eta'] = .02
param['gamma'] = 0.0
param['max_depth'] = 6
param['silent'] = 1
param['num_class'] = 3
param['eval_metric'] = "mlogloss"
param['reg_alpha'] = 2
param['min_child_weight'] = 10
param['subsample'] = 0.75
param['colsample_bytree'] = 0.5
param['seed'] = 3
param['nthread'] = 4
param['n_jobs'] = -1
# num_rounds = 5000    
    # xgb params

    # Data structure in which to save out-of-folds preds
oof_preds_dict = {}

    # Running cv
cv_res = xgb.cv(params=param,
                dtrain=xgtrain,
                num_boost_round=5000,
                folds = 10,
                early_stopping_rounds=50,
                verbose_eval=100,
                callbacks=[OOFCallback(oof_preds_dict)])

print("\nOut-of-folds predictions at best iteration: \n\n", oof_preds_dict)

KeyboardInterrupt: 

In [99]:
oof_preds_dict

AttributeError: 'dict' object has no attribute 'shape'

In [None]:
def append_meta_features():
# load StackNet input files
X = np.loadtxt("train_stacknet.csv", delimiter=",")`
X_test = np.loadtxt("test_stacknet.csv", delimiter=",")

# load results of one (or more) of your models - here also 
# out-of-fold predictions for the train set are needed
pred_test = np.loadtxt("meta_predictions/preds_test.txt")
pred_train = np.loadtxt("meta_predictions/preds_train.txt")

# combine them with the StackNet input
X_test = np.column_stack((X_test, pred_test))
X = np.column_stack((X, pred_train))

# save into new files to be fed into StackNet
np.savetxt("train_stacknet_with_meta.csv", X, delimiter=",", fmt='%.6f')
np.savetxt("test_stacknet_with_meta.csv", X_test, delimiter=",", fmt='%.6f')

In [None]:
#multiple xgboost models with different seeds

print("Start fitting...")

xgbclf=[]
rounds=10

for i in range(0,rounds):
    print("training %s"%(i))
    param['seed'] = randint(0,10000)
    num_rounds = 3700

    xgtrain = xgb.DMatrix(X_train, label=y_train)
    watchlist = [ (xgtrain,'train') ]
    plst= list(param.items())
    model = xgb.train(plst, xgtrain, num_rounds, watchlist)
    xgbclf.append(model)
    
print("Fitted")

Start fitting...
training 0
[0]	train-mlogloss:1.08598
[1]	train-mlogloss:1.07326
[2]	train-mlogloss:1.06097
[3]	train-mlogloss:1.04901
[4]	train-mlogloss:1.03742
[5]	train-mlogloss:1.0263
[6]	train-mlogloss:1.0155
[7]	train-mlogloss:1.00489
[8]	train-mlogloss:0.994561
[9]	train-mlogloss:0.984789
[10]	train-mlogloss:0.975207
[11]	train-mlogloss:0.965806
[12]	train-mlogloss:0.956604
[13]	train-mlogloss:0.947781
[14]	train-mlogloss:0.939169
[15]	train-mlogloss:0.930738
[16]	train-mlogloss:0.922704
[17]	train-mlogloss:0.914803
[18]	train-mlogloss:0.907279
[19]	train-mlogloss:0.899824
[20]	train-mlogloss:0.892694
[21]	train-mlogloss:0.885631
[22]	train-mlogloss:0.878687
[23]	train-mlogloss:0.871953
[24]	train-mlogloss:0.865478
[25]	train-mlogloss:0.859057
[26]	train-mlogloss:0.852902
[27]	train-mlogloss:0.846865
[28]	train-mlogloss:0.840968
[29]	train-mlogloss:0.835328
[30]	train-mlogloss:0.829831
[31]	train-mlogloss:0.824402
[32]	train-mlogloss:0.819231
[33]	train-mlogloss:0.813976
[34]	t

In [None]:
def prepare_submission():
    xgtest = xgb.DMatrix(X_test)
    for i in range(rounds):
        model=xgbclf[i]
        if i==0:
            preds = model.predict(xgtest)
        else:
            preds += model.predict(xgtest)
    preds /= rounds
    return preds

submit = prepare_submission()

In [None]:
submission = pd.DataFrame(submit, columns = ['high','medium','low']).set_index(test_df.index)
submission['listing_id'] = test_df.listing_id.values
submission.to_csv('rogue_one_10seed_submission.csv', index= False)
#0.5326 LB

In [100]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd
from collections import defaultdict
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
from scipy.sparse import csr_matrix
import sys
reload(sys)
sys.setdefaultencoding('utf8')

def main():
    

        #training and test files, created using SRK's python script
        train_file="train_stnet.csv"
        test_file="test_stnet.csv"
        
        ######### Load files ############

#         X,X_test,y,ids=load_data_sparse (data_path="input/")# you might need to change that to whatever folder the json files are in
#         ids= np.array([int(k)+68119576 for k in ids ]) # we add the id value we removed before for scaling reasons.
#         print(X.shape, X_test.shape) 
        
        #create to numpy arrays (dense format)        
        X    = np.array(X_train)
        X_ts = np.array(X_test)  
        
        print ("scalling") 
        #scale the data
        stda=StandardScaler()  
        X_test=stda.fit_transform (X_test)          
        X=stda.transform(X)

        
        CO=[0,14,21] # columns to create averages on
        
        #Create Arrays for meta
        train_stacker=[ [0.0 for s in range(3)]  for k in range (0,(X.shape[0])) ]
        test_stacker=[[0.0 for s in range(3)]   for k in range (0,(X_ts.shape[0]))]
        
        number_of_folds=5 # number of folds to use
        print("kfolder")
        #cerate 5 fold object
        mean_logloss = 0.0
        kfolder=StratifiedKFold(y_train, n_folds=number_of_folds,shuffle=True, random_state=15)   

        #xgboost_params
        param = {}
        param['objective'] = 'multi:softprob'
        param['eta'] = .02
        param['gamma'] = 0.0
        param['max_depth'] = 6
        param['silent'] = 1
        param['num_class'] = 3
        param['eval_metric'] = "mlogloss"
        param['reg_alpha'] = 2
        param['min_child_weight'] = 10
        param['subsample'] = 0.75
        param['colsample_bytree'] = 0.5
        param['seed'] = 3
        param['nthread'] = 4
        param['n_jobs'] = -1
# num_rounds = 5000 

        
        i=0 # iterator counter
        print ("starting cross validation with %d kfolds " % (number_of_folds))
        for train_index, test_index in kfolder:
                # creaning and validation sets
                X_train, X_cv = X[train_index], X[test_index]
                y_trn, y_cv = np.array(y_train)[train_index], np.array(y_train)[test_index]
                #create past averages for some fetaures
#                 W_train,W_cv=convert_to_avg(X_train,y_tr, X_cv, seed=1, cvals=5, roundings=2, columns=CO)
                W_train = X_train
                W_cv = X_cv 
                print (" train size: %d. test size: %d, cols: %d " % ((W_train.shape[0]) ,(W_cv.shape[0]) ,(W_train.shape[1]) ))
                #training
                X1=xgb.DMatrix(csr_matrix(W_train), label=np.array(y_tr),missing =-999.0)
                X1cv=xgb.DMatrix(csr_matrix(W_cv), missing =-999.0)
                bst = xgb.train(param.items(), X1, 1000) 
                #predictions
                predictions = bst.predict(X1cv)     
                preds=predictions.reshape( W_cv.shape[0], 3)

                #scalepreds(preds)     
                logs = log_loss(y_cv,preds)
                print "size train: %d size cv: %d loglikelihood (fold %d/%d): %f" % ((W_train.shape[0]), (W_cv.shape[0]), i + 1, number_of_folds, logs)
             
                mean_logloss += logs
                #save the results
                no=0
                for real_index in test_index:
                    for d in range (0,3):
                        train_stacker[real_index][d]=(preds[no][d])
                    no+=1
                i+=1
        mean_logloss/=number_of_folds
        print (" Average Lolikelihood: %f" % (mean_logloss) )
                
        #calculating averages for the train data
#         W,W_test=convert_to_avg(X,y, X_test, seed=1, cvals=5, roundings=2, columns=CO)
        W= X
        W_test= X_ts          
        #X_test=np.column_stack((X_test,woe_cv))      
        print (" making test predictions ")
        
        X1=xgb.DMatrix(csr_matrix(W), label=np.array(y_train) , missing =-999.0)
        X1cv=xgb.DMatrix(csr_matrix(W_test), missing =-999.0)
        bst = xgb.train(param.items(), X1, 1000) 
        predictions = bst.predict(X1cv)     
        preds=predictions.reshape( W_test.shape[0], 3)        
       
        for pr in range (0,len(preds)):  
                for d in range (0,3):            
                    test_stacker[pr][d]=(preds[pr][d]) 
        
        
        
        print ("merging columns")   
        #stack xgboost predictions
        X=np.column_stack((X,train_stacker))
        # stack id to test
        X_ts=np.column_stack((X_ts,test_stacker))        
        
#         # stack target to train
#         X=np.column_stack((y_train,X))
#         # stack id to test
#         X_test=np.column_stack((ids,X_test))
        
#         #export to txt files (, del.)
#         print ("exporting files")
#         np.savetxt(train_file, X, delimiter=",", fmt='%.5f')
#         np.savetxt(test_file, X_test, delimiter=",", fmt='%.5f')        

#         print("Write results...")
#         output_file = "submission_"+str( (mean_logloss ))+".csv"
#         print("Writing submission to %s" % output_file)
#         f = open(output_file, "w")   
#         f.write("listing_id,high,medium,low\n")# the header   
#         for g in range(0, len(test_stacker))  :
#           f.write("%s" % (ids[g]))
#           for prediction in test_stacker[g]:
#              f.write(",%f" % (prediction))    
#           f.write("\n")
#         f.close()
#         print("Done.")
                             
                  

       



if __name__=="__main__":
  main()


UnboundLocalError: local variable 'X_train' referenced before assignment

In [273]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# x_train = np.array(X_train)
# x_test = np.array(X_test)

X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_train = X_train.fillna(0)

X_test = X_test.replace([np.inf, -np.inf], np.nan)
X_test = X_test.fillna(0)

x_train = scaler.fit_transform(X_train)
x_test = scaler.transform(X_test)

print x_train.shape
print x_test.shape

(49352, 175)
(74659, 175)


In [278]:
from sklearn.manifold import TSNE,MDS

dim = TSNE(n_components= 10, perplexity=30.0, early_exaggeration=4.0, learning_rate=1000.0, n_iter=1000, 
                      n_iter_without_progress=30, min_grad_norm=1e-07, metric='euclidean', init='random', verbose=0, 
                      random_state=None, method='barnes_hut', angle=0.5)


In [282]:
# tr_tsne = dim.fit_transform(x_train)

test_tsne = dim.fit_transform(x_test)

KeyboardInterrupt: 

In [None]:
mds = MDS(n_components=2, metric=True, n_init=4, max_iter=300, verbose=0, eps=0.001, n_jobs=1, random_state=None,
          dissimilarity='euclidean')