In [35]:
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta

In [36]:
# Holiday calendar: one 'YYYY-MM-DD' string per holiday, in chronological order.
# The strings are parsed with '%Y-%m-%d' by the holiday helper functions.
holidays = [
    # 2017
    '2017-01-01', '2017-01-30', '2017-04-04', '2017-05-01',
    '2017-05-30', '2017-07-01', '2017-10-01', '2017-10-05',
    '2017-10-28', '2017-12-22', '2017-12-25',
    # 2018
    '2018-01-01', '2018-02-16', '2018-02-17', '2018-02-19',
    '2018-04-05', '2018-05-01', '2018-06-18', '2018-07-01',
    '2018-09-25', '2018-10-01', '2018-10-17', '2018-12-22',
    '2018-12-25',
    # 2019
    '2019-01-01',
]

In [37]:
# Read the three input tables from the working directory, in this order:
# training observations, test observations, complementary weather data.
train, test, weather = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'weather_complement.csv')
)

In [38]:
# Parse the raw timestamps into a proper datetime64 'datetime' column.
#
# The 'date' column is day-first ('31/12/2018 12:00'), so the layout is stated
# explicitly. Relying on format inference would read ambiguous values such as
# '2/1/2017' month-first (1 February instead of 2 January), and the previous
# format strings ('%Y%-%m-%d %H:%M' / '%Y-%m-%d %H:%M') did not match the data.
# NOTE(review): test.csv is assumed to use the same layout as train.csv -- confirm.
for frame in (train, test):
    frame['datetime'] = pd.to_datetime(frame['date'], format='%d/%m/%Y %H:%M')
train

Unnamed: 0,id,date,speed,datetime
0,0,1/1/2017 0:00,43.002930,2017-01-01 00:00:00
1,1,1/1/2017 1:00,46.118696,2017-01-01 01:00:00
2,2,1/1/2017 2:00,44.294158,2017-01-01 02:00:00
3,3,1/1/2017 3:00,41.067468,2017-01-01 03:00:00
4,4,1/1/2017 4:00,46.448653,2017-01-01 04:00:00
...,...,...,...,...
14001,14001,31/12/2018 12:00,19.865269,2018-12-31 12:00:00
14002,14002,31/12/2018 15:00,17.820375,2018-12-31 15:00:00
14003,14003,31/12/2018 16:00,12.501851,2018-12-31 16:00:00
14004,14004,31/12/2018 18:00,15.979319,2018-12-31 18:00:00


In [39]:
# Format the weather timestamps: parse them, expose them as a 'datetime'
# column, and index the table by that column so it can be joined by timestamp.
#
# No explicit format is passed any more: the previous one ('%Y%-%m-%d %H:%M')
# contained a stray '%-' directive, so it could never match and parsing only
# worked through the deprecated infer_datetime_format fallback.
# NOTE(review): the layout of weather's 'date' column is not visible here; if it
# is day-first like train.csv, pass format='%d/%m/%Y %H:%M' explicitly.
weather['date'] = pd.to_datetime(weather['date'])
weather = weather.rename(columns={'date': 'datetime'})
weather.index = weather['datetime']
# Number of missing values per column (row count minus non-null count).
weather.shape[0] - weather.count()

datetime                  0
summary                   0
precipIntensity        5409
precipProbability      5409
temperature               2
apparentTemperature       2
dewPoint                  0
humidity                  0
windSpeed                48
windGust               5481
windBearing              55
uvIndex                 492
visibility               32
dtype: int64

In [40]:
# Clean NaNs: discard the three columns with by far the most missing values
# (precipIntensity, precipProbability, windGust), then forward-fill the
# remaining gaps from the previous row.
# `columns=` and `.ffill()` replace the positional axis argument and
# `fillna(method='ffill')`, which newer pandas releases reject or deprecate.
# Forward-filling works column by column, so dropping first gives the same result.
weather = weather.drop(columns=['precipIntensity', 'precipProbability', 'windGust']).ffill()

In [41]:
# Sanity check: sorted distinct temperature readings after the NaN clean-up.
np.unique(weather['temperature'].values)

array([-25.71, -25.65, -25.6 , ...,  35.95,  36.13,  36.38])

In [42]:
# Numerically encode weather's 'summary': add one indicator column per distinct
# summary value (one-hot encoding).
# The redundant 'datetime' column is dropped first; `on='datetime'` then refers
# to the index, which carries that name because it was assigned from the
# 'datetime' column. `columns=` replaces the positional axis argument that
# newer pandas releases no longer accept.
summary_dummies = pd.get_dummies(weather['summary'])
weather = weather.drop(columns=['datetime']).join(summary_dummies, on='datetime')

In [43]:
# Re-check the distinct temperature readings now that the indicator columns
# have been joined onto the weather table.
np.unique(weather['temperature'].values)

array([-25.71, -25.65, -25.6 , ...,  35.95,  36.13,  36.38])

In [44]:
# Parsed holiday lists, keyed by the tuple of their date strings, so the
# strings are converted once instead of once per row.
_PARSED_HOLIDAYS = {}


def _holiday_starts(holiday_list=None):
    """Return the midnight datetime of every holiday in ``holiday_list``.

    Args:
        holiday_list: iterable of 'YYYY-MM-DD' strings. When None, the
            notebook-level ``holidays`` list is used.

    Returns:
        A list of ``datetime`` objects, one per input string, in input order.
    """
    key = tuple(holidays if holiday_list is None else holiday_list)
    if key not in _PARSED_HOLIDAYS:
        _PARSED_HOLIDAYS[key] = [datetime.strptime(day, '%Y-%m-%d') for day in key]
    return _PARSED_HOLIDAYS[key]


def is_around_holiday(date, holiday_list=None):
    """Tell whether ``date`` falls on the day before or the day after a holiday.

    A holiday itself does not count, unless it is adjacent to another holiday.

    Args:
        date: a ``datetime`` (or pandas ``Timestamp``) to classify.
        holiday_list: optional iterable of 'YYYY-MM-DD' strings; defaults to
            the notebook-level ``holidays`` list.

    Returns:
        True if ``date`` lies in [holiday - 1 day, holiday) or in
        [holiday + 1 day, holiday + 2 days) for any holiday, else False.
    """
    one_day = timedelta(days=1)
    for start in _holiday_starts(holiday_list):
        is_day_before = start - one_day <= date < start
        is_day_after = start + one_day <= date < start + 2 * one_day
        if is_day_before or is_day_after:
            return True
    return False


def is_holiday(date, holiday_list=None):
    """Tell whether ``date`` falls on a holiday.

    Args:
        date: a ``datetime`` (or pandas ``Timestamp``) to classify.
        holiday_list: optional iterable of 'YYYY-MM-DD' strings; defaults to
            the notebook-level ``holidays`` list.

    Returns:
        True if ``date`` lies in [holiday, holiday + 1 day) for any holiday,
        else False.
    """
    one_day = timedelta(days=1)
    return any(start <= date < start + one_day for start in _holiday_starts(holiday_list))


# Add two boolean feature columns to both data sets, derived from each row's
# timestamp: 'around_holiday' (day before/after a holiday) and 'holiday'.
for frame in (train, test):
    timestamps = frame['datetime']
    frame['around_holiday'] = timestamps.apply(is_around_holiday)
    frame['holiday'] = timestamps.apply(is_holiday)
train, test

(          id              date      speed            datetime  around_holiday  \
 0          0     1/1/2017 0:00  43.002930 2017-01-01 00:00:00           False   
 1          1     1/1/2017 1:00  46.118696 2017-01-01 01:00:00           False   
 2          2     1/1/2017 2:00  44.294158 2017-01-01 02:00:00           False   
 3          3     1/1/2017 3:00  41.067468 2017-01-01 03:00:00           False   
 4          4     1/1/2017 4:00  46.448653 2017-01-01 04:00:00           False   
 ...      ...               ...        ...                 ...             ...   
 14001  14001  31/12/2018 12:00  19.865269 2018-12-31 12:00:00            True   
 14002  14002  31/12/2018 15:00  17.820375 2018-12-31 15:00:00            True   
 14003  14003  31/12/2018 16:00  12.501851 2018-12-31 16:00:00            True   
 14004  14004  31/12/2018 18:00  15.979319 2018-12-31 18:00:00            True   
 14005  14005  31/12/2018 20:00  40.594183 2018-12-31 20:00:00            True   
 
        holida

In [45]:
# Attach the weather features to every observation by matching the row's
# 'datetime' value against weather's index, then drop the join key and the raw
# 'summary' text (its one-hot indicator columns are kept).
# `columns=` replaces the positional axis argument that newer pandas releases
# no longer accept.
_train = train.join(weather, on='datetime', rsuffix='r').drop(columns=['datetime', 'summary'])
_test = test.join(weather, on='datetime', rsuffix='r').drop(columns=['datetime', 'summary'])

# Persist the enriched tables without the row index.
_train.to_csv('train_weather.csv', encoding='utf-8', header=True, index=False)
_test.to_csv('test_weather.csv', encoding='utf-8', header=True, index=False)

_train, _test


(          id              date      speed  around_holiday  holiday  \
 0          0     1/1/2017 0:00  43.002930           False     True   
 1          1     1/1/2017 1:00  46.118696           False     True   
 2          2     1/1/2017 2:00  44.294158           False     True   
 3          3     1/1/2017 3:00  41.067468           False     True   
 4          4     1/1/2017 4:00  46.448653           False     True   
 ...      ...               ...        ...             ...      ...   
 14001  14001  31/12/2018 12:00  19.865269            True    False   
 14002  14002  31/12/2018 15:00  17.820375            True    False   
 14003  14003  31/12/2018 16:00  12.501851            True    False   
 14004  14004  31/12/2018 18:00  15.979319            True    False   
 14005  14005  31/12/2018 20:00  40.594183            True    False   
 
        temperature  apparentTemperature  dewPoint  humidity  windSpeed  ...  \
 0            18.07                18.07      7.59      0.50      