In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from catboost import CatBoostRegressor, Pool

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
meal = pd.read_csv('meal_info.csv')
center = pd.read_csv('fulfilment_center_info.csv')
sample = pd.read_csv('sample_submission.csv')

In [3]:
train = train.merge(meal, on='meal_id', how='left')
train = train.merge(center, on='center_id', how='left')

In [4]:
test['num_orders'] = 0
test = test.merge(meal, on='meal_id', how='left')
test = test.merge(center, on='center_id', how='left')

In [5]:
train.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,category,cuisine,city_code,region_code,center_type,op_area
0,1379560,1,55,1885,136.83,152.29,0,0,177,Beverages,Thai,647,56,TYPE_C,2.0
1,1466964,1,55,1993,136.83,135.83,0,0,270,Beverages,Thai,647,56,TYPE_C,2.0
2,1346989,1,55,2539,134.86,135.86,0,0,189,Beverages,Thai,647,56,TYPE_C,2.0
3,1338232,1,55,2139,339.5,437.53,0,0,54,Beverages,Indian,647,56,TYPE_C,2.0
4,1448490,1,55,2631,243.5,242.5,0,0,40,Beverages,Indian,647,56,TYPE_C,2.0


In [6]:
train['center_meal_id'] = train['center_id'].map(str) + '_' + train['meal_id'].map(str)
test['center_meal_id'] = test['center_id'].map(str) + '_' + test['meal_id'].map(str)

In [7]:
train_test = pd.concat([train, test])

In [8]:
train_test['base-checkout'] = train_test['base_price'] - train_test['checkout_price']
train_test['checkout_by_base'] = train_test['checkout_price'] / train_test['base_price']
train_test['diff_by_base'] = train_test['base-checkout'] / train_test['base_price']

In [9]:
num_predictors = ['week', 'checkout_price', 'base_price', 'op_area', 'base-checkout', 'checkout_by_base', 'diff_by_base']
cat_predictors = ['center_id', 'meal_id', 'emailer_for_promotion', 'homepage_featured', 'category', 'cuisine', 'city_code', 'region_code', 'center_type', 'center_meal_id']
predictors = cat_predictors + num_predictors 

In [10]:
train = train_test[:train.shape[0]]
test = train_test[train.shape[0]:]

In [11]:
df_train = train#[predictors]#.values
Y = np.log(train['num_orders'])#.values
center_meal_id = train['center_meal_id'].values
df_test = test#[predictors]#.values

In [12]:
for cat_column in ['center_meal_id', 'center_id', 'meal_id', 'cuisine', 'category', 'city_code', 'region_code', 'center_type']:
    temp = pd.DataFrame()
    temp[[cat_column, 'mean_' + cat_column]] = df_train.groupby(cat_column)['num_orders'].mean().reset_index()
    df_test = df_test.merge(temp, on=cat_column, how='left')
    if 'mean_' + cat_column not in predictors:
        predictors.append('mean_' + cat_column)

    temp = pd.DataFrame()
    temp[[cat_column, 'median_' + cat_column]] = df_train.groupby(cat_column)['num_orders'].median().reset_index()
    df_test = df_test.merge(temp, on=cat_column, how='left')
    if 'median_' + cat_column not in predictors:
        predictors.append('median_' + cat_column)

    temp = pd.DataFrame()
    temp[[cat_column, 'std_' + cat_column]] = df_train.groupby(cat_column)['num_orders'].std().reset_index()
    df_test = df_test.merge(temp, on=cat_column, how='left')
    if 'std_' + cat_column not in predictors:
        predictors.append('std_' + cat_column)

In [13]:
df_test.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,category,...,std_category,mean_city_code,median_city_code,std_city_code,mean_region_code,median_region_code,std_region_code,mean_center_type,median_center_type,std_center_type
0,1028232,146,55,1885,158.11,159.11,0,0,0,Beverages,...,375.280914,276.243387,123,449.373759,316.481849,161,477.08328,206.670097,107,326.7161
1,1127204,146,55,1993,160.11,159.11,0,0,0,Beverages,...,375.280914,276.243387,123,449.373759,316.481849,161,477.08328,206.670097,107,326.7161
2,1212707,146,55,2539,157.14,159.14,0,0,0,Beverages,...,375.280914,276.243387,123,449.373759,316.481849,161,477.08328,206.670097,107,326.7161
3,1082698,146,55,2631,162.02,162.02,0,0,0,Beverages,...,375.280914,276.243387,123,449.373759,316.481849,161,477.08328,206.670097,107,326.7161
4,1400926,146,55,1248,163.93,163.93,0,0,0,Beverages,...,375.280914,276.243387,123,449.373759,316.481849,161,477.08328,206.670097,107,326.7161


In [15]:
cat_preds = np.zeros(df_test.shape[0])
#rf_preds = np.zeros(df_test.shape[0])
#xgb_preds = np.zeros(df_test.shape[0])
oof_train_cat = pd.DataFrame({'id': train['id'].values, 'num_orders':train['num_orders'].values})
clf = 0
seeds = [69]
for seed in seeds:
    df_train = train#[predictors]#.values
    Y = np.log(train['num_orders'])#.values
    center_meal_id = train['center_meal_id'].values
    #df_test = test#[predictors]#.values
    
    best=[]
    score = []

    kfolds = 10
    nrounds = 10000
    skf = StratifiedKFold(n_splits=kfolds, random_state=123, shuffle=True)
    i=0

    #clf = CatBoostRegressor(learning_rate=0.1, iterations=1500, random_seed=seed, depth=5, eval_metric='RMSE', verbose=True)
    #rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=69, verbose=True, min_samples_leaf=10)
    print('seed:', seed)
    for train_index, test_index in skf.split(df_train, center_meal_id):
        print('Fold {0}'.format(i + 1))
        break
        X_train, X_val = df_train.loc[train_index], df_train.loc[test_index]
        y_train, y_val = Y.loc[train_index], Y.loc[test_index]

        #rf.fit(X_train[predictors], y_train)
        
        for cat_column in ['center_meal_id', 'center_id', 'meal_id', 'cuisine', 'category', 'city_code', 'region_code', 'center_type']:
            temp = pd.DataFrame()
            temp[[cat_column, 'mean_' + cat_column]] = X_train.groupby(cat_column)['num_orders'].mean().reset_index()
            X_train = X_train.merge(temp, on=cat_column, how='left')
            X_val = X_val.merge(temp, on=cat_column, how='left')
            if 'mean_' + cat_column not in predictors:
                predictors.append('mean_' + cat_column)
                
            temp = pd.DataFrame()
            temp[[cat_column, 'median_' + cat_column]] = X_train.groupby(cat_column)['num_orders'].median().reset_index()
            X_train = X_train.merge(temp, on=cat_column, how='left')
            X_val = X_val.merge(temp, on=cat_column, how='left')
            if 'median_' + cat_column not in predictors:
                predictors.append('median_' + cat_column)
                
            temp = pd.DataFrame()
            temp[[cat_column, 'std_' + cat_column]] = X_train.groupby(cat_column)['num_orders'].std().reset_index()
            X_train = X_train.merge(temp, on=cat_column, how='left')
            X_val = X_val.merge(temp, on=cat_column, how='left')
            if 'std_' + cat_column not in predictors:
                predictors.append('std_' + cat_column)
        
                
        #print X_train.head()
        #break
        
        X_train.fillna(0, inplace=True)
        X_val.fillna(0, inplace=True)
        
        #X_val = X_val.values
        #y_val = y_val.values
        
        clf = CatBoostRegressor(learning_rate=0.1, iterations=2000, random_seed=seed, depth=5, eval_metric='RMSE', verbose=100)
        clf.fit(X_train[predictors], y_train, eval_set=(X_val[predictors], y_val), use_best_model=True, cat_features=list(range(len(cat_predictors))), verbose=100, early_stopping_rounds=50)
        #print 'best iteration:', clf.tree_count_
        best.append(clf.tree_count_)
        
        train_pool = Pool(X_train[predictors], y_train, cat_features=list(range(len(cat_predictors))))
        feature_importances = clf.get_feature_importance(train_pool)
        feature_names = predictors
        for scoree, name in sorted(zip(feature_importances, feature_names), reverse=True):
            print('{}: {}'.format(name, scoree))
        #X_train = pd.concat([X_train, pos_rows[predictors]])
        #y_train = pd.concat([y_train, pos_rows['Purchase']])

        #idx = np.arange(len(X_train))
        #np.random.shuffle(idx)
        #X_train = X_train.iloc[idx]
        #y_train = y_train.iloc[idx]

        #dtrain = xgb.DMatrix(X_train,y_train)
        #dval = xgb.DMatrix(X_val,y_val)
        #watchlist = [(dtrain, 'train'), (dval, 'eval')]

        #gbdt = xgb.train(xgb_params, dtrain, nrounds, watchlist, verbose_eval=1, early_stopping_rounds=50)  
        #bst=gbdt.best_ntree_limit
        #pred = gbdt.predict(dval)
        pred = clf.predict(X_val[predictors])
    
        cat_preds += clf.predict(df_test[predictors])
        #pred=rf.predict_proba(X_val[predictors])[:,1]

        #preds += gbdt.predict(xgb.DMatrix(df_test), ntree_limit=bst)/5.0
        #print pred

        oof_train_cat.loc[test_index, 'num_orders'] += pred

        scr = np.sqrt(mean_squared_error(list(y_val), list(pred))) 
        print('score: ', scr)

        #best.append(bst)
        score.append(scr)

        i+=1
       
    #print('Mean RMSE for seed: ', seed, np.mean(score))
    
    for cat_column in ['center_meal_id', 'center_id', 'meal_id', 'cuisine', 'category', 'city_code', 'region_code', 'center_type']:
        temp = pd.DataFrame()
        temp[[cat_column, 'mean_' + cat_column]] = df_train.groupby(cat_column)['num_orders'].mean().reset_index()
        df_train = df_train.merge(temp, on=cat_column, how='left')
        if 'mean_' + cat_column not in predictors:
            predictors.append('mean_' + cat_column)

        temp = pd.DataFrame()
        temp[[cat_column, 'median_' + cat_column]] = df_train.groupby(cat_column)['num_orders'].median().reset_index()
        df_train = df_train.merge(temp, on=cat_column, how='left')
        if 'median_' + cat_column not in predictors:
            predictors.append('median_' + cat_column)

        temp = pd.DataFrame()
        temp[[cat_column, 'std_' + cat_column]] = df_train.groupby(cat_column)['num_orders'].std().reset_index()
        df_train = df_train.merge(temp, on=cat_column, how='left')
        if 'std_' + cat_column not in predictors:
            predictors.append('std_' + cat_column)
            
    #print('mean number of best iterations:', int(np.average(best)))
    clf = CatBoostRegressor(learning_rate=0.1, iterations=4000, random_seed=seed, depth=5, eval_metric='RMSE', verbose=100)
    clf.fit(df_train[predictors], Y, cat_features=list(range(len(cat_predictors))))
    cat_preds += clf.predict(df_test[predictors])
    #cat_preds /= (1.0*kfolds)
    
    #rf.fit(df_train[predictors], Y)
    #rf_preds += rf.predict_proba(df_test[predictors])[:,1]
    
    
submit=pd.DataFrame()
submit['id'] = test['id'].values
submit['num_orders'] = cat_preds/(1.0*len(seeds))
submit['num_orders'] = np.exp(submit['num_orders'])
submit.to_csv('cat_submit.csv', index=False)
    
oof_train_cat['num_orders'] /= (1.0*len(seeds))
oof_train_cat['num_orders'] = np.exp(clf.predict(df_train[predictors]))
oof_train_cat.to_csv('oof_train_cat.csv', index=False)

seed: 69




Fold 1
0:	learn: 4.5294763	total: 826ms	remaining: 55m 4s
100:	learn: 0.5100533	total: 1m	remaining: 38m 55s
200:	learn: 0.4966598	total: 2m	remaining: 37m 50s
300:	learn: 0.4888940	total: 2m 58s	remaining: 36m 34s
400:	learn: 0.4839423	total: 3m 55s	remaining: 35m 16s
500:	learn: 0.4799417	total: 4m 54s	remaining: 34m 14s
600:	learn: 0.4767263	total: 5m 51s	remaining: 33m 7s
700:	learn: 0.4741379	total: 6m 48s	remaining: 32m 3s
800:	learn: 0.4720718	total: 7m 46s	remaining: 31m 2s
900:	learn: 0.4701431	total: 8m 43s	remaining: 30m 1s
1000:	learn: 0.4683852	total: 9m 40s	remaining: 28m 59s
1100:	learn: 0.4666915	total: 10m 41s	remaining: 28m 10s
1200:	learn: 0.4653332	total: 12m 7s	remaining: 28m 15s
1300:	learn: 0.4639886	total: 13m 33s	remaining: 28m 7s
1400:	learn: 0.4625919	total: 15m	remaining: 27m 51s
1500:	learn: 0.4615731	total: 16m 26s	remaining: 27m 23s
1600:	learn: 0.4604155	total: 17m 58s	remaining: 26m 55s
1700:	learn: 0.4594373	total: 19m 27s	remaining: 26m 17s
1800:	lear