In [1]:
# This notebook aims to predict which day of the week is the busiest and which day has the fewest orders.
# The store supervisor can then use this prediction to arrange staff schedules more reasonably.
# Algorithm: Random Forest

In [2]:
import pandas as pd

In [3]:
# Read the raw orders table from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer="orders.csv")
data.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [4]:
# Count the missing (NaN) values in every column of the DataFrame.
data.isna().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [5]:
# The null check shows 206209 NaN values, all in the days_since_prior_order column.
# Fill only that column with 0, so that a stray NaN appearing in any other
# column is not silently turned into 0 as well.
# NOTE(review): after filling, an originally missing value can no longer be told
# apart from a genuine 0-day gap -- confirm that is acceptable for the model.
data = data.fillna({'days_since_prior_order': 0})
data.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,0.0
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [6]:
# Remove the identifier columns that are not used as features.
features = data.drop(columns=['order_id', 'order_number'])
features.head(n=5)

Unnamed: 0,user_id,eval_set,order_dow,order_hour_of_day,days_since_prior_order
0,1,prior,2,8,0.0
1,1,prior,3,7,15.0
2,1,prior,3,12,21.0
3,1,prior,4,7,29.0
4,1,prior,4,15,28.0


In [7]:
# Select the rows that belong to one eval_set value ('prior', 'train' or 'test') from the features table
# and save them to their own CSV file, as data preparation for the random forest.
def dataPrepare(rowVal):
    """Write the rows of ``features`` whose ``eval_set`` equals ``rowVal`` to a CSV file.

    Reads the notebook-level ``features`` DataFrame, so that variable must
    exist before this function is called.

    Parameters
    ----------
    rowVal : str
        Value to match in the ``eval_set`` column. It is also embedded in the
        output file name, ``order_<rowVal>.csv``.

    Returns
    -------
    None
        The selection is only written to disk, without the index column.
    """
    matches_eval_set = features['eval_set'] == rowVal
    selected_rows = features[matches_eval_set]
    output_name = 'order_' + rowVal + '.csv'
    selected_rows.to_csv(output_name, index=False)


In [8]:
# Export one CSV per eval_set value: 'prior', 'test' and 'train' (in that order).
for eval_set_name in ('prior', 'test', 'train'):
    dataPrepare(eval_set_name)

In [9]:
# 1st step of preparing x_train & x_test: reload the exported 'prior' rows.
x_feature = pd.read_csv(filepath_or_buffer='order_prior.csv')
x_feature.head(n=5)

Unnamed: 0,user_id,eval_set,order_dow,order_hour_of_day,days_since_prior_order
0,1,prior,2,8,0.0
1,1,prior,3,7,15.0
2,1,prior,3,12,21.0
3,1,prior,4,7,29.0
4,1,prior,4,15,28.0


In [10]:
# Keep the columns used for x_train & x_test by dropping eval_set.
feature_x = x_feature.drop(columns=['eval_set'])
feature_x.head(n=5)

Unnamed: 0,user_id,order_dow,order_hour_of_day,days_since_prior_order
0,1,2,8,0.0
1,1,3,7,15.0
2,1,3,12,21.0
3,1,4,7,29.0
4,1,4,15,28.0


In [11]:
# Prepare the data for y_train.
# train_target is the exported 'train' table as read from disk;
# target_train is the same table without the eval_set column.
train_target = pd.read_csv(filepath_or_buffer='order_train.csv')
target_train = train_target.drop(columns=['eval_set'])
train_target.head(n=5)


Unnamed: 0,user_id,eval_set,order_dow,order_hour_of_day,days_since_prior_order
0,1,train,4,8,14.0
1,2,train,1,11,30.0
2,5,train,0,11,6.0
3,7,train,2,11,6.0
4,8,train,1,14,10.0


In [12]:
# Preview the y_train table (eval_set column already removed).
target_train.head(n=5)

Unnamed: 0,user_id,order_dow,order_hour_of_day,days_since_prior_order
0,1,4,8,14.0
1,2,1,11,30.0
2,5,0,11,6.0
3,7,2,11,6.0
4,8,1,14,10.0


In [13]:
# Prepare the data for y_test.
# target_test is the exported 'test' table as read from disk;
# test_target is the same table without the eval_set column.
target_test = pd.read_csv(filepath_or_buffer='order_test.csv')
test_target = target_test.drop(columns=['eval_set'])
test_target.head(n=5)

Unnamed: 0,user_id,order_dow,order_hour_of_day,days_since_prior_order
0,3,5,15,11.0
1,4,3,12,30.0
2,6,3,16,22.0
3,11,6,11,8.0
4,12,1,20,30.0


In [18]:
# 2nd step of preparing data for x_train & x_test:
# collect the user_ids that have a 'train' order, so the prior orders can be split by user.


# Keep a second reference to the full prior-order feature table before it is
# split by user below. This is an alias of feature_x, not an independent copy.
total_feature = feature_x

# user_ids taken from the y_train table. Series.tolist() builds the same list
# of values as appending them one at a time, without the explicit loop.
split_id_list = target_train['user_id'].tolist()

split_id_list

[1,
 2,
 5,
 7,
 8,
 9,
 10,
 13,
 14,
 17,
 18,
 21,
 23,
 24,
 27,
 29,
 30,
 34,
 37,
 38,
 41,
 42,
 43,
 44,
 46,
 47,
 48,
 49,
 50,
 52,
 53,
 55,
 56,
 59,
 62,
 63,
 64,
 65,
 66,
 67,
 70,
 71,
 72,
 74,
 76,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 86,
 87,
 88,
 89,
 90,
 91,
 93,
 95,
 96,
 97,
 99,
 102,
 103,
 104,
 105,
 106,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 123,
 124,
 125,
 128,
 130,
 131,
 132,
 135,
 138,
 139,
 140,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 150,
 155,
 157,
 160,
 161,
 163,
 164,
 167,
 169,
 171,
 173,
 174,
 175,
 176,
 178,
 179,
 183,
 184,
 185,
 187,
 189,
 191,
 192,
 193,
 195,
 197,
 198,
 199,
 201,
 203,
 204,
 205,
 206,
 208,
 209,
 210,
 211,
 212,
 213,
 214,
 215,
 217,
 218,
 219,
 220,
 222,
 223,
 224,
 226,
 229,
 230,
 231,
 235,
 238,
 239,
 240,
 242,
 243,
 245,
 249,
 250,
 252,
 253,
 255,
 256,
 258,
 259,
 260,
 261,
 262,
 264,
 267,
 270,
 271,
 272,
 273,
 278,
 280,
 

In [29]:
# x_train: the prior orders of every user whose user_id is in split_id_list.
# The selection is also saved to a CSV file (without the index column).
# NOTE(review): this keeps all prior orders of those users, so it presumably
# has more rows than target_train -- confirm the two are aligned before fitting.
x_train_prior = feature_x.loc[feature_x['user_id'].isin(split_id_list)]
x_train_prior.to_csv('order_prior_for_train.csv', index=False)
x_train_prior

Unnamed: 0,user_id,order_dow,order_hour_of_day,days_since_prior_order
0,1,2,8,0.0
1,1,3,7,15.0
2,1,3,12,21.0
3,1,4,7,29.0
4,1,4,15,28.0
5,1,2,7,19.0
6,1,1,9,20.0
7,1,1,14,14.0
8,1,1,16,0.0
9,1,4,8,30.0


In [26]:
# feature_x = total_feature

In [28]:
# x_test: the prior orders of every user whose user_id is NOT in split_id_list.
# The selection is also saved to a CSV file (without the index column).
x_test_prior = feature_x.loc[~feature_x['user_id'].isin(split_id_list)]
x_test_prior.to_csv('order_prior_for_test.csv', index=False)
x_test_prior

Unnamed: 0,user_id,order_dow,order_hour_of_day,days_since_prior_order
24,3,1,14,0.0
25,3,3,19,9.0
26,3,3,16,21.0
27,3,2,18,20.0
28,3,0,17,12.0
29,3,0,16,7.0
30,3,0,15,7.0
31,3,0,17,7.0
32,3,0,16,7.0
33,3,3,16,17.0


In [15]:
# DataFrame usage explanation:
# for training the algorithm: (x_train_prior, target_train)
# for testing the algorithm: (x_test_prior, test_target)

# Correspondence between the CSV files and the DataFrame variables (including the final prepared variable to use):
#    file name                          dataframe variables name      variable we need to use
#  order_prior_for_train.csv               x_train_prior                    x_train_prior
#  order_prior_for_test.csv                x_test_prior                     x_test_prior
#  order_train.csv                         train_target                     target_train
#  order_test.csv                          target_test                      test_target

