In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from catboost import CatBoostRegressor, Pool
import xgboost as xgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
meal = pd.read_csv('meal_info.csv')
center = pd.read_csv('fulfilment_center_info.csv')
sample = pd.read_csv('sample_submission.csv')

In [3]:
train = train.merge(meal, on='meal_id', how='left')
train = train.merge(center, on='center_id', how='left')

In [4]:
test['num_orders'] = 0
test = test.merge(meal, on='meal_id', how='left')
test = test.merge(center, on='center_id', how='left')

In [5]:
train.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,category,cuisine,city_code,region_code,center_type,op_area
0,1379560,1,55,1885,136.83,152.29,0,0,177,Beverages,Thai,647,56,TYPE_C,2.0
1,1466964,1,55,1993,136.83,135.83,0,0,270,Beverages,Thai,647,56,TYPE_C,2.0
2,1346989,1,55,2539,134.86,135.86,0,0,189,Beverages,Thai,647,56,TYPE_C,2.0
3,1338232,1,55,2139,339.5,437.53,0,0,54,Beverages,Indian,647,56,TYPE_C,2.0
4,1448490,1,55,2631,243.5,242.5,0,0,40,Beverages,Indian,647,56,TYPE_C,2.0


In [6]:
train['center_meal_id'] = train['center_id'].map(str) + '_' + train['meal_id'].map(str)
test['center_meal_id'] = test['center_id'].map(str) + '_' + test['meal_id'].map(str)

In [7]:
train_test = pd.concat([train, test])

In [8]:
train_test['base-checkout'] = train_test['base_price'] - train_test['checkout_price']
train_test['checkout_by_base'] = train_test['checkout_price'] / train_test['base_price']
train_test['diff_by_base'] = train_test['base-checkout'] / train_test['base_price']

In [9]:
train_test.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,category,cuisine,city_code,region_code,center_type,op_area,center_meal_id,base-checkout,checkout_by_base,diff_by_base
0,1379560,1,55,1885,136.83,152.29,0,0,177,Beverages,Thai,647,56,TYPE_C,2.0,55_1885,15.46,0.898483,0.101517
1,1466964,1,55,1993,136.83,135.83,0,0,270,Beverages,Thai,647,56,TYPE_C,2.0,55_1993,-1.0,1.007362,-0.007362
2,1346989,1,55,2539,134.86,135.86,0,0,189,Beverages,Thai,647,56,TYPE_C,2.0,55_2539,1.0,0.992639,0.007361
3,1338232,1,55,2139,339.5,437.53,0,0,54,Beverages,Indian,647,56,TYPE_C,2.0,55_2139,98.03,0.775947,0.224053
4,1448490,1,55,2631,243.5,242.5,0,0,40,Beverages,Indian,647,56,TYPE_C,2.0,55_2631,-1.0,1.004124,-0.004124


In [10]:
for c in ['category', 'cuisine', 'center_type', 'center_meal_id']:
    le = LabelEncoder()
    train_test[c] = le.fit_transform(train_test[c])

In [11]:
num_predictors = ['week', 'checkout_price', 'base_price', 'op_area', 'base-checkout', 'checkout_by_base', 'diff_by_base']
cat_predictors = ['center_id', 'meal_id', 'emailer_for_promotion', 'homepage_featured', 'category', 'cuisine', 'city_code', 'region_code', 'center_type', 'center_meal_id']
predictors = cat_predictors + num_predictors 

In [12]:
train = train_test[:train.shape[0]]
test = train_test[train.shape[0]:]

In [13]:
df_train = train#[predictors]#.values
Y = np.log(train['num_orders'])#.values
center_meal_id = train['center_meal_id'].values
df_test = test#[predictors]#.values

In [14]:
for cat_column in ['center_meal_id', 'center_id', 'meal_id', 'cuisine', 'category', 'city_code', 'region_code', 'center_type']:
    temp = pd.DataFrame()
    temp[[cat_column, 'mean_' + cat_column]] = df_train.groupby(cat_column)['num_orders'].mean().reset_index()
    df_test = df_test.merge(temp, on=cat_column, how='left')
    if 'mean_' + cat_column not in predictors:
        predictors.append('mean_' + cat_column)

    temp = pd.DataFrame()
    temp[[cat_column, 'median_' + cat_column]] = df_train.groupby(cat_column)['num_orders'].median().reset_index()
    df_test = df_test.merge(temp, on=cat_column, how='left')
    if 'median_' + cat_column not in predictors:
        predictors.append('median_' + cat_column)

    temp = pd.DataFrame()
    temp[[cat_column, 'std_' + cat_column]] = df_train.groupby(cat_column)['num_orders'].std().reset_index()
    df_test = df_test.merge(temp, on=cat_column, how='left')
    if 'std_' + cat_column not in predictors:
        predictors.append('std_' + cat_column)

In [15]:
df_test.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,category,...,std_category,mean_city_code,median_city_code,std_city_code,mean_region_code,median_region_code,std_region_code,mean_center_type,median_center_type,std_center_type
0,1028232,146,55,1885,158.11,159.11,0,0,0,0,...,375.280914,276.243387,123,449.373759,316.481849,161,477.08328,206.670097,107,326.7161
1,1127204,146,55,1993,160.11,159.11,0,0,0,0,...,375.280914,276.243387,123,449.373759,316.481849,161,477.08328,206.670097,107,326.7161
2,1212707,146,55,2539,157.14,159.14,0,0,0,0,...,375.280914,276.243387,123,449.373759,316.481849,161,477.08328,206.670097,107,326.7161
3,1082698,146,55,2631,162.02,162.02,0,0,0,0,...,375.280914,276.243387,123,449.373759,316.481849,161,477.08328,206.670097,107,326.7161
4,1400926,146,55,1248,163.93,163.93,0,0,0,0,...,375.280914,276.243387,123,449.373759,316.481849,161,477.08328,206.670097,107,326.7161


In [16]:
xgb_params = {
    #'tree_method':'approx',
    'seed': 69, 
    #'colsample_bytree': 1, #1
    'silent': 1,
    'subsample': 0.7, #0.8
    'learning_rate': 0.1, #0.01
    'objective': 'reg:linear',
    'max_depth': 5, #6
    #'min_child_weight': 9, #9
    #'gamma': 0.001, 
    #'alpha': 0.001,
    #'num_class': 3,
    'eval_metric' : 'rmse'
}
skf = 0

In [19]:
#%%time
#cat_preds = np.zeros(df_test.shape[0])
#rf_preds = np.zeros(df_test.shape[0])
xgb_preds = np.zeros(df_test.shape[0])
gbdt = 0
oof_train_xgb = pd.DataFrame({'id': df_train['id'].values, 'num_orders':df_train['num_orders'].values})
#cols = train_test.columns[1:-1]
for seed in [69]:
    best=[]
    score=[]

    preds = np.zeros(test.shape[0])

    kfolds = 10
    nrounds = 3000
    skf = StratifiedKFold(n_splits=kfolds, random_state=123, shuffle=True)
    i=0

    #clf = CatBoostClassifier(learning_rate=0.1, iterations=100, random_seed=seed, depth=4, eval_metric='AUC', verbose=True, class_weights=[1, 2])
    #rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=69, verbose=True, min_samples_leaf=10)
    
    for train_index, test_index in skf.split(df_train, center_meal_id):
        print('Fold {0}'.format(i + 1))
        break
        X_train, X_val = df_train.loc[train_index], df_train.loc[test_index]
        y_train, y_val = Y[train_index], Y[test_index]
        
        for cat_column in ['center_meal_id', 'center_id', 'meal_id', 'cuisine', 'category', 'city_code', 'region_code', 'center_type']:
            temp = pd.DataFrame()
            temp[[cat_column, 'mean_' + cat_column]] = X_train.groupby(cat_column)['num_orders'].mean().reset_index()
            X_train = X_train.merge(temp, on=cat_column, how='left')
            X_val = X_val.merge(temp, on=cat_column, how='left')
            if 'mean_' + cat_column not in predictors:
                predictors.append('mean_' + cat_column)
                
            temp = pd.DataFrame()
            temp[[cat_column, 'median_' + cat_column]] = X_train.groupby(cat_column)['num_orders'].median().reset_index()
            X_train = X_train.merge(temp, on=cat_column, how='left')
            X_val = X_val.merge(temp, on=cat_column, how='left')
            if 'median_' + cat_column not in predictors:
                predictors.append('median_' + cat_column)
                
            temp = pd.DataFrame()
            temp[[cat_column, 'std_' + cat_column]] = X_train.groupby(cat_column)['num_orders'].std().reset_index()
            X_train = X_train.merge(temp, on=cat_column, how='left')
            X_val = X_val.merge(temp, on=cat_column, how='left')
            if 'std_' + cat_column not in predictors:
                predictors.append('std_' + cat_column)

        #clf.fit(X_train[predictors], y_train, cat_features=range(0,X_train.shape[1]-2))
        #rf.fit(X_train[predictors], y_train)
        '''
        for column in cols:
            temp = pd.DataFrame()
            temp[[column, 'mean_' + column]] = X_train.groupby(column)['Purchase'].sum().reset_index()
            X_train = X_train.merge(temp, on=column, how='left')
            X_val = X_val.merge(temp, on=column, how='left')
            X_train.drop(column, inplace=True, axis=1)
            X_val.drop(column, inplace=True, axis=1)
        
        X_train.drop(['Purchase', 'ID'], inplace=True, axis=1)
        X_val.drop(['Purchase', 'ID'], inplace=True, axis=1)'''
            
        #print X_train.head()
        #break

        #X_train = pd.concat([X_train, pos_rows[predictors]])
        #y_train = pd.concat([y_train, pos_rows['Purchase']])

        #idx = np.arange(len(X_train))
        #np.random.shuffle(idx)
        #X_train = X_train.iloc[idx]
        #y_train = y_train.iloc[idx]

        dtrain = xgb.DMatrix(X_train[predictors].values, y_train)
        dval = xgb.DMatrix(X_val[predictors].values, y_val)
        watchlist = [(dtrain, 'train'), (dval, 'eval')]

        gbdt = xgb.train(xgb_params, dtrain, nrounds, watchlist, verbose_eval=100, early_stopping_rounds=50)  
        bst = gbdt.best_ntree_limit
        pred = gbdt.predict(dval, ntree_limit=bst)
        #pred = [1. if y > 0.5 else 0. for y in pred]
        #pred=clf.predict_proba(X_val[predictors])[:,1]
        #pred=rf.predict_proba(X_val[predictors])[:,1]

        #preds += gbdt.predict(xgb.DMatrix(df_test), ntree_limit=bst)/5.0
        #print('Pred:', pred)

        oof_train_xgb.loc[test_index, 'num_orders'] += pred

        scr = np.sqrt(mean_squared_error(list(y_val), list(pred)))
        print('score: ', scr)

        best.append(bst)
        score.append(scr)
        break

        i+=1
    #print('seed: ', seed, np.mean(score))
    #break
    #clf.fit(df_train[predictors], Y, cat_features=range(0,df_train.shape[1]-2))
    #cat_preds += clf.predict_proba(df_test[predictors])[:,1]
    
    #rf.fit(df_train[predictors], Y)
    #rf_preds += rf.predict_proba(df_test[predictors])[:,1]
    #best_nrounds = int(round(np.mean(best)))
    for cat_column in ['center_meal_id', 'center_id', 'meal_id', 'cuisine', 'category', 'city_code', 'region_code', 'center_type']:
        temp = pd.DataFrame()
        temp[[cat_column, 'mean_' + cat_column]] = df_train.groupby(cat_column)['num_orders'].mean().reset_index()
        df_train = df_train.merge(temp, on=cat_column, how='left')
        if 'mean_' + cat_column not in predictors:
            predictors.append('mean_' + cat_column)

        temp = pd.DataFrame()
        temp[[cat_column, 'median_' + cat_column]] = df_train.groupby(cat_column)['num_orders'].median().reset_index()
        df_train = df_train.merge(temp, on=cat_column, how='left')
        if 'median_' + cat_column not in predictors:
            predictors.append('median_' + cat_column)

        temp = pd.DataFrame()
        temp[[cat_column, 'std_' + cat_column]] = df_train.groupby(cat_column)['num_orders'].std().reset_index()
        df_train = df_train.merge(temp, on=cat_column, how='left')
        if 'std_' + cat_column not in predictors:
            predictors.append('std_' + cat_column)
    
    X_train = xgb.DMatrix(df_train[predictors], Y)
    X_test = xgb.DMatrix(df_test[predictors])
    

    gbdt = xgb.train(xgb_params, X_train, 4000, verbose_eval=100)
    xgb_preds += gbdt.predict(X_test)
    oof_train_xgb['num_orders'] += gbdt.predict(xgb.DMatrix(df_train[predictors]))
    
submit=pd.DataFrame()
submit['id'] = test['id'].values
submit['num_orders'] = xgb_preds/(1.0*len(seeds))
submit['num_orders'] = np.exp(submit['num_orders'])
submit.to_csv('xgb_submit.csv', index=False)
    
oof_train_xgb['num_orders'] /= (1.0*len(seeds))
oof_train_xgb['num_orders'] = np.exp(oof_train_xgb['num_orders'])
oof_train_xgb.to_csv('oof_train_xgb.csv', index=False)



Fold 1


NameError: name 'seeds' is not defined

In [21]:
submit=pd.DataFrame()
submit['id'] = test['id'].values
submit['num_orders'] = xgb_preds
submit['num_orders'] = np.exp(submit['num_orders'])
submit.to_csv('xgb_submit.csv', index=False)