In [1]:
import pandas as pd
import numpy as np
import datetime
from sklearn.decomposition import PCA
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [2]:
train_file = pd.read_csv('train.csv')
test_file = pd.read_csv('test.csv')

In [3]:
pressure_fea=['Average_Atmospheric_Pressure','Max_Atmospheric_Pressure','Min_Atmospheric_Pressure']
drop_fea=['ID','Date','Location_Type']
drop_fea.extend(pressure_fea)
drop_fea=list(set(drop_fea))
drop_fea

['Min_Atmospheric_Pressure',
 'Max_Atmospheric_Pressure',
 'Average_Atmospheric_Pressure',
 'Location_Type',
 'Date',
 'ID']

In [4]:
def get_wind_dir(x):
	if 315<x<=45: return 0
	if 45<x<=135: return 1
	if 135<x<=225: return 2
	if 225<x<=315: return 3
	else: return -1



In [5]:
def get_normal_date(x):
	if 0<x<=10: return x
	if 10<x<=20: return 15
	if 23<x<=25: return 24
	if 29<x<=31: return 30
        else: return x 

In [6]:
dfmean=0
def preprocess(file,istrian):
	df=pd.read_csv(file,parse_dates=['Date'],dayfirst=True)
	end_missing=['Average_Atmospheric_Pressure','Max_Atmospheric_Pressure',
	'Min_Atmospheric_Pressure','Min_Ambient_Pollution','Max_Ambient_Pollution']
	# df=df.fillna(-1)
	df=df.fillna(method='pad')
	if istrian:
		global dfmean
		dfmean=df.mean()
		df=df.fillna(dfmean)
		df=df[df.Park_ID!=19]
		outcome=df.Footfall
		df=df.drop(['Footfall'],axis=1)
	else:
		df=df.fillna(dfmean)
		outcome=np.nan

	df['month']=df['Date'].apply(lambda x: x.month)
	df['date']=df['Date'].apply(lambda x: x.day)
	df['sardiya']=df['month'].apply(lambda x: 1 if x in [1,2,11,12,3] else 0)
	df.date=df.date.apply(get_normal_date)
	df['Direction_Of_Wind2']=df.Direction_Of_Wind.apply(get_wind_dir)

	return df,outcome


In [7]:
def larger_model():
	model = Sequential()
	model.add(Dense(100, input_dim=16, init='normal', activation='relu'))
	model.add(Dense(1, init='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='adam')
        return model

In [8]:
#load training dataset
train,outcome=preprocess('train.csv',True)
parkids=train.Park_ID
tardates=train.Date
ids_train=train.ID

In [40]:
train.head()

Unnamed: 0,Park_ID,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Min_Ambient_Pollution,Max_Ambient_Pollution,Average_Moisture_In_Park,Max_Moisture_In_Park,Min_Moisture_In_Park,month,date,sardiya,Direction_Of_Wind2,pressure
0,12,194.0,37.24,60.8,15.2,92.13,92.0,304.0,255.0,288.0,222.0,9,1,0,2,128.980336
1,12,285.0,32.68,60.8,7.6,14.11,172.0,332.0,252.0,297.0,204.0,9,2,0,3,117.39712
2,12,319.0,43.32,60.8,15.2,35.69,236.0,292.0,219.0,279.0,165.0,9,3,0,-1,-22.065047
3,12,297.0,25.84,38.0,7.6,0.0249,272.0,324.0,225.0,261.0,192.0,9,4,0,3,-123.735131
4,12,207.0,28.88,45.6,7.6,0.83,236.0,332.0,234.0,273.0,183.0,9,5,0,2,-103.83558


In [39]:
#dimentionality reduction on pressure features
pca2=PCA(1)
train['pressure']=pca2.fit_transform(train[pressure_fea])

train.drop(drop_fea,axis=1,inplace=True)


In [45]:
estimators

[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('mlp', <keras.wrappers.scikit_learn.KerasRegressor at 0x7fea86f7b6d0>)]

In [41]:
# fix random seed for reproducibility
seed = 7
print 'classification start'
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=larger_model, nb_epoch=40, batch_size=5, verbose=1)))
pipeline = Pipeline(estimators)
clf=pipeline
clf.fit(train,outcome)

classification start
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
 13540/111538 [==>...........................] - ETA: 21s - loss: 10496.4314

KeyboardInterrupt: 

In [43]:
#load test datast
test,faaltu=preprocess('test.csv',False)
ids=test.ID
parkids_test=test.Park_ID
test['pressure']=pca2.transform(test[pressure_fea])

In [44]:
test.drop(drop_fea,axis=1,inplace=True)
pred=clf.predict(test)
out_df=pd.DataFrame({'ID':ids,'Footfall':pred})

#save submission
out_df.to_csv('intermediate_keras.csv',index=False)

