In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor

In [2]:
# Load the train/test files indexed by their parsed application date,
# plus the sample file that is later used as the submission template.
train = pd.read_csv(
    'train.csv',
    index_col='application_date',
    parse_dates=['application_date'],
)
test = pd.read_csv(
    'test.csv',
    index_col='application_date',
    parse_dates=['application_date'],
)
sample = pd.read_csv('sample.csv')


In [3]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0_level_0,segment,branch_id,state,zone,case_count
application_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-04-01,1,1.0,WEST BENGAL,EAST,40.0
2017-04-03,1,1.0,WEST BENGAL,EAST,5.0
2017-04-04,1,1.0,WEST BENGAL,EAST,4.0
2017-04-05,1,1.0,WEST BENGAL,EAST,113.0
2017-04-07,1,1.0,WEST BENGAL,EAST,76.0


In [4]:
# Preview the first rows of the raw test frame.
test.head()

Unnamed: 0_level_0,id,segment
application_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-07-06,1,1
2019-07-07,2,1
2019-07-08,3,1
2019-07-09,4,1
2019-07-10,5,1


In [5]:
# Drop the location columns that are not used as features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the columns are gone.
train = train.drop(columns=['branch_id', 'state', 'zone'], errors='ignore')


In [6]:
# Preview the training frame after the location columns were dropped.
train.head()

Unnamed: 0_level_0,segment,case_count
application_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-04-01,1,40.0
2017-04-03,1,5.0
2017-04-04,1,4.0
2017-04-05,1,113.0
2017-04-07,1,76.0


In [7]:
# Separate the training rows by business segment; each segment gets its own model.
train_1 = train.loc[train['segment'] == 1]
train_2 = train.loc[train['segment'] == 2]

In [8]:
# Collapse segment 1 to one row per application date by summing case_count.
# The string 'sum' replaces the builtin `sum`: passing builtins to agg is
# deprecated in pandas >= 2.1 (the FutureWarning is hidden by filterwarnings).
train_1 = train_1.groupby(['application_date']).agg({'case_count': 'sum'})

In [9]:
# Collapse segment 2 to one row per application date by summing case_count.
# The string 'sum' replaces the builtin `sum`: passing builtins to agg is
# deprecated in pandas >= 2.1 (the FutureWarning is hidden by filterwarnings).
train_2 = train_2.groupby(['application_date']).agg({'case_count': 'sum'})

In [10]:
 def get_time(buf):
     """Return a copy of ``buf`` enriched with calendar features from its index.

     Parameters
     ----------
     buf : pandas.DataFrame
         Frame whose index exposes datetime accessors (``day``, ``dayofweek``,
         ``is_month_start``, ``is_month_end``), i.e. a ``DatetimeIndex``.

     Returns
     -------
     pandas.DataFrame
         A new frame with the original columns plus ``dayofmonth``,
         ``dayofweek``, ``weekend``, ``is_month_start`` and ``is_month_end``.
         The input frame is left untouched, so the function is safe to call on
         a slice of another frame and safe to re-run.
     """
     out = buf.copy()
     dates = out.index
     weekday = dates.dayofweek  # computed once and reused for both features

     out['dayofmonth'] = dates.day
     out['dayofweek'] = weekday
     out['weekend'] = (weekday > 4).astype(int)  # Saturday, Sunday
     out['is_month_start'] = dates.is_month_start.astype(int)
     out['is_month_end'] = dates.is_month_end.astype(int)
     return out

In [11]:
# Attach the calendar features to both training segments and to the test frame.
train_1, train_2, test = (
    get_time(frame) for frame in (train_1, train_2, test)
)

In [12]:
# Keep only the rows dated after 2018-12-31 for training.
train_1 = train_1.loc[train_1.index > '2018-12-31']
train_2 = train_2.loc[train_2.index > '2018-12-31']

In [13]:
# Regression targets: the daily case counts of each segment.
y1 = train_1['case_count']
y2 = train_2['case_count']

In [14]:
# Remove the target from the feature frames so it cannot leak into the model inputs.
train_1 = train_1.drop(columns=['case_count'])
train_2 = train_2.drop(columns=['case_count'])

In [15]:
# Split the test rows by segment. .copy() makes each part an independent frame:
# later cells delete columns from these frames and reset their index in place,
# which would otherwise be writes into a slice of `test` (SettingWithCopyWarning,
# hidden in this notebook by warnings.filterwarnings('ignore')).
test_1 = test[test['segment'] == 1].copy()
test_2 = test[test['segment'] == 2].copy()

In [16]:
# The id and segment columns are not model features; keep only the calendar columns.
test_1 = test_1.drop(columns=['id', 'segment'])
test_2 = test_2.drop(columns=['id', 'segment'])

In [17]:
# Preview the segment-1 feature frame (the target column has already been removed).
train_1.head()

Unnamed: 0_level_0,dayofmonth,dayofweek,weekend,is_month_start,is_month_end
application_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-01,1,1,0,1,0
2019-01-02,2,2,0,0,0
2019-01-03,3,3,0,0,0
2019-01-04,4,4,0,0,0
2019-01-05,5,5,1,0,0


In [18]:
# Build the holiday lookup table: one row per listed date, flagged with is_holiday = 1.
# Only the date is kept; the descriptive columns are discarded.
holidays = (
    pd.read_csv('2019.csv')
    .drop(columns=['day', 'holiday', 'holiday_type'])
    .rename(columns={'date': 'application_date'})
    .assign(is_holiday=1)
)

In [19]:
# Preview the holiday lookup table.
holidays.head()

Unnamed: 0,application_date,is_holiday
0,2019-01-01,1
1,2019-01-13,1
2,2019-01-14,1
3,2019-01-15,1
4,2019-01-26,1


In [20]:
# Parse the holiday dates to datetimes so they can be merged against the date index of the feature frames.
holidays = holidays.assign(
    application_date=pd.to_datetime(holidays['application_date'])
)

In [21]:
def add_holiday_data(dff, holiday_frame=None):
    """Left-join the holiday flag onto ``dff`` and drop the date column.

    Parameters
    ----------
    dff : pandas.DataFrame
        Feature frame indexed by ``application_date``. It is not modified.
    holiday_frame : pandas.DataFrame, optional
        Lookup table with an ``application_date`` column and the holiday
        flag column(s). Defaults to the notebook-level ``holidays`` frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh integer index, the columns of ``dff`` and the
        holiday column(s). Dates with no holiday entry hold NaN, which the
        caller is expected to fill.
    """
    if holiday_frame is None:
        holiday_frame = holidays

    # A date listed more than once would duplicate feature rows in the left
    # merge and silently misalign them with the target series.
    unique_holidays = holiday_frame.drop_duplicates(subset='application_date')

    # reset_index() without inplace leaves the caller's frame untouched.
    merged = pd.merge(
        dff.reset_index(),
        unique_holidays,
        on='application_date',
        how='left',
    )
    return merged.drop(columns='application_date')

In [22]:
# Attach the holiday flag to every train/test feature frame.
train_1, train_2, test_1, test_2 = (
    add_holiday_data(frame) for frame in (train_1, train_2, test_1, test_2)
)

In [23]:
# Preview segment-1 features after the holiday merge; dates without a holiday entry show NaN in is_holiday.
train_1.head()

Unnamed: 0,dayofmonth,dayofweek,weekend,is_month_start,is_month_end,is_holiday
0,1,1,0,1,0,1.0
1,2,2,0,0,0,
2,3,3,0,0,0,
3,4,4,0,0,0,
4,5,5,1,0,0,


In [24]:
# Dates that matched no holiday came out of the left merge as NaN; treat them as 0.
train_1 = train_1.fillna(0)
train_2 = train_2.fillna(0)
test_1 = test_1.fillna(0)
test_2 = test_2.fillna(0)

In [25]:
# Total number of missing values left in the segment-1 feature frame.
train_1.isnull().sum().sum()

0

In [26]:
# Every feature is passed to CatBoost as categorical.
cat_cols = [
    'dayofmonth',
    'dayofweek',
    'weekend',
    'is_month_start',
    'is_month_end',
    'is_holiday',
]

    
    

In [27]:
# The merge plus fillna leaves is_holiday as float; cast it to int64 so it can
# be used as a categorical feature.
train_1 = train_1.astype({'is_holiday': np.int64})
train_2 = train_2.astype({'is_holiday': np.int64})
test_1 = test_1.astype({'is_holiday': np.int64})
test_2 = test_2.astype({'is_holiday': np.int64})

In [28]:
# Segment-1 regressor: 500 boosting iterations of depth-4 trees, monitored with SMAPE.
model1 = CatBoostRegressor(
    depth=4,
    iterations=500,
    learning_rate=0.05,
    eval_metric='SMAPE',
)

In [29]:
# Segment-2 regressor, configured identically to the segment-1 model.
model2 = CatBoostRegressor(
    depth=4,
    iterations=500,
    learning_rate=0.05,
    eval_metric='SMAPE',
)

In [30]:
# Re-check before fitting: total number of missing values in the segment-1 features.
train_1.isnull().sum().sum()

0

In [31]:
# Fit on the first 140 rows and hold out the remaining rows as the evaluation
# set; use_best_model selects the iteration by the eval-set metric, and
# progress is logged every 40 iterations.
model1.fit(
    train_1.iloc[:140],
    y1.iloc[:140],
    eval_set=(train_1.iloc[140:], y1.iloc[140:]),
    cat_features=cat_cols,
    use_best_model=True,
    verbose=40,
)

0:	learn: 27.7679158	test: 25.8788957	best: 25.8788957 (0)	total: 60.2ms	remaining: 30s
40:	learn: 19.8178992	test: 17.8303356	best: 17.7805758 (37)	total: 98.9ms	remaining: 1.11s
80:	learn: 18.5521996	test: 17.3637443	best: 17.3476082 (78)	total: 124ms	remaining: 644ms
120:	learn: 17.3498275	test: 17.5348165	best: 17.3476082 (78)	total: 150ms	remaining: 469ms
160:	learn: 16.3196075	test: 17.6155244	best: 17.3476082 (78)	total: 178ms	remaining: 375ms
200:	learn: 15.5230892	test: 17.5358847	best: 17.3476082 (78)	total: 208ms	remaining: 310ms
240:	learn: 14.7132724	test: 17.6854902	best: 17.3476082 (78)	total: 243ms	remaining: 261ms
280:	learn: 14.0557988	test: 18.1353746	best: 17.3476082 (78)	total: 300ms	remaining: 234ms
320:	learn: 13.5631015	test: 18.4242547	best: 17.3476082 (78)	total: 332ms	remaining: 185ms
360:	learn: 12.9997171	test: 18.5001867	best: 17.3476082 (78)	total: 373ms	remaining: 144ms
400:	learn: 12.5686418	test: 18.6773953	best: 17.3476082 (78)	total: 409ms	remaining:

<catboost.core.CatBoostRegressor at 0x1261ae250>

In [32]:
# Predict segment-1 case counts for the test feature rows.
s1 = model1.predict(test_1)

In [33]:
# Display the segment-1 predictions.
s1

array([2155.2326495 , 2129.24554383, 3572.26740115, 3205.52836556,
       3452.11122849, 3386.57311838, 3410.40585998, 2164.41068941,
       2096.27146995, 3547.57610749, 3261.07637519, 3447.55819635,
       3386.57311838, 3410.40585998, 2155.2326495 , 2182.53954712,
       3570.4572889 , 3215.04475016, 3393.00873374, 3368.24133332,
       3355.85639737, 2160.02278876, 2118.60022335, 3516.92073796,
       3207.48867395, 4143.50875935, 1682.86296305, 3414.95889212,
       2164.41068941, 2096.27146995, 3572.52645525, 3252.099176  ,
       3452.11122849, 3386.57311838, 3326.69868972, 2164.41068941,
       2120.06750392, 3561.32775607, 3261.07637519, 3422.86690269,
       3351.83743933, 3414.95889212, 2181.76387252, 2120.06750392,
       3572.26740115, 3252.099176  , 3393.00873374, 3384.37175328,
       3355.85639737, 2216.20842329, 2118.60022335, 3504.12891972,
       3249.8978109 , 3428.33115159, 3343.87817414, 3352.29712553,
       3682.02946058, 2324.50897325, 3561.58681017, 3261.07637

In [34]:
# Fit on the first 140 rows and hold out the remaining rows as the evaluation
# set; use_best_model selects the iteration by the eval-set metric, and
# progress is logged every 40 iterations.
model2.fit(
    train_2.iloc[:140],
    y2.iloc[:140],
    eval_set=(train_2.iloc[140:], y2.iloc[140:]),
    cat_features=cat_cols,
    use_best_model=True,
    verbose=40,
)

0:	learn: 65.5004171	test: 67.1406994	best: 67.1406994 (0)	total: 3.36ms	remaining: 1.68s
40:	learn: 46.9231026	test: 40.0268679	best: 40.0268679 (40)	total: 64ms	remaining: 716ms
80:	learn: 41.3127905	test: 30.9341625	best: 30.9341625 (80)	total: 91.4ms	remaining: 473ms
120:	learn: 37.3966313	test: 27.0233143	best: 27.0233143 (120)	total: 117ms	remaining: 366ms
160:	learn: 35.6251602	test: 25.0024619	best: 25.0024619 (160)	total: 146ms	remaining: 308ms
200:	learn: 35.1779477	test: 23.5505541	best: 23.4409187 (190)	total: 172ms	remaining: 256ms
240:	learn: 34.2593602	test: 22.9522121	best: 22.9522121 (240)	total: 211ms	remaining: 227ms
280:	learn: 33.3412641	test: 22.9029411	best: 22.4860039 (268)	total: 238ms	remaining: 185ms
320:	learn: 32.9019487	test: 22.4618068	best: 22.3845261 (316)	total: 270ms	remaining: 150ms
360:	learn: 32.0343711	test: 22.7107998	best: 22.3845261 (316)	total: 295ms	remaining: 114ms
400:	learn: 31.4529613	test: 22.1503972	best: 22.1503972 (400)	total: 321ms	r

<catboost.core.CatBoostRegressor at 0x1261ae7d0>

In [35]:
# Predict segment-2 case counts for the test feature rows.
s2 = model2.predict(test_2)

In [36]:
# Display the segment-2 predictions.
s2

array([27662.86037289, 25879.66586394, 28136.57374079, 27516.66932896,
       22805.64865902, 19769.06791195,  8728.62401038,  1924.71724919,
        5308.13070229,  5856.58072561,  5992.05960395,  4189.59480059,
        4858.02115297,  4881.64609773,  5723.97653567,  5887.82908813,
        5938.07146586,  6073.55034419, 17822.00848635, 24952.77733186,
       29000.21112357, 28425.62656497, 25702.98746709, 26832.38399944,
       25072.76096996, 17476.3095926 , 28703.14476699, 25131.77662511,
       27774.5802288 , 24776.18140686, 27345.19497536, 26817.47969905,
       18335.65018066, 26787.58635493, 25336.62695355, 25647.51063602,
       19981.89932245,  8918.6158303 ,  3000.24028662,  3673.92561357,
        3896.51218324,  4800.15535748,  5805.46727591,  5480.97899648,
        5938.07146586,  5955.86793007,  3800.18334311,  4939.51189321,
        4425.20421905, 21046.54123226, 26431.96851429, 29859.71469286,
       30115.71389621, 21289.74796677, 25490.580605  , 25131.77662511,
      

In [37]:
# Split the submission template by segment. .copy() makes each part an
# independent frame, so the prediction column can be assigned afterwards
# without writing into a slice of `sample` (SettingWithCopyWarning, hidden in
# this notebook by warnings.filterwarnings('ignore')).
sub_1 = sample[sample['segment'] == 1].copy()
sub_2 = sample[sample['segment'] == 2].copy()

In [38]:
# Fill in the predictions positionally.
# NOTE(review): this assumes the template rows of each segment are in the same
# order as the corresponding test rows - confirm against the input files.
sub_1 = sub_1.assign(case_count=s1)
sub_2 = sub_2.assign(case_count=s2)

In [39]:
# Stack the two per-segment frames (segment 1 first) into the final submission and show it.
submission = pd.concat([sub_1, sub_2], axis=0)
submission

Unnamed: 0,id,application_date,segment,case_count
0,1,2019-07-06,1,2155.232649
1,2,2019-07-07,1,2129.245544
2,3,2019-07-08,1,3572.267401
3,4,2019-07-09,1,3205.528366
4,5,2019-07-10,1,3452.111228
...,...,...,...,...
175,176,2019-10-20,2,19656.188781
176,177,2019-10-21,2,28098.429395
177,178,2019-10-22,2,25661.328548
178,179,2019-10-23,2,25295.136581


In [40]:
# Write the combined predictions to submission.csv without the DataFrame index.
submission.to_csv('submission.csv',index=False)