In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization

Using TensorFlow backend.


In [3]:
# Directory that holds the three Kaggle West Nile CSV files. Change this one
# value to run the notebook against a different location.
DATA_DIR = '/Users/dominicdebiaso/Development/datasets'

# 'Date' is parsed as a datetime in every file because it is the key used
# later to join the trap observations with the weather readings.
df_train = pd.read_csv(DATA_DIR + '/kaggle_west_nile_train.csv',
                       parse_dates=['Date'])
# 'Id' becomes the index of the test frame, so it is not among its columns.
df_test = pd.read_csv(DATA_DIR + '/kaggle_west_nile_test.csv',
                      parse_dates=['Date'], index_col='Id')
df_weather = pd.read_csv(DATA_DIR + '/kaggle_west_nile_weather.csv',
                         parse_dates=['Date'])

### Data Munging

In [4]:
## Columns that exist in the training data but not in the test data
set(df_train.columns).difference(df_test.columns)

{'NumMosquitos', 'WnvPresent'}

In [5]:
### Munge main dataset
## Scale numerical main data
# Every numeric column of the training frame is standardised except the two
# that do not exist in the test frame ('NumMosquitos', 'WnvPresent').
numerical_feats = (
    df_train.select_dtypes(include=[np.number])
    .drop(['NumMosquitos', 'WnvPresent'], axis=1)
    .columns
)
scaler = StandardScaler()
# The scaler is fitted on the training rows only and then reused for test.
# Whole-column assignment (df[cols] = ...) replaces the columns, so integer
# columns can take the float result; `.loc[:, cols] = ...` instead writes into
# the existing dtype on newer pandas.
df_train[numerical_feats] = scaler.fit_transform(df_train[numerical_feats])
df_test[numerical_feats] = scaler.transform(df_test[numerical_feats])

## Convert categorical features of main data
categorical_feats = df_train.select_dtypes(include=['object']).columns

# Learn the integer code of each category from the training column and apply
# that same mapping to the test column. Factorizing the two frames
# independently would hand out codes in order of first appearance in each
# frame, so one category string could get different codes in train and test.
# Test values never seen in training (and missing values) are encoded as -1.
for feat in categorical_feats:
    train_codes, train_levels = pd.factorize(df_train[feat])
    df_train[feat] = train_codes
    df_test[feat] = train_levels.get_indexer(df_test[feat])

### Munge Weather data
def clean_weather_data(df):
    """Return a copy of ``df`` with every column except 'Date' coerced to numeric.

    Each non-'Date' column is passed through ``pd.to_numeric(errors='coerce')``,
    so any entry that cannot be parsed as a number becomes NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Date' column plus the columns to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order. ``df`` itself is
        left unmodified.

    Raises
    ------
    KeyError
        If ``df`` has no 'Date' column.
    """
    # `axis` is passed by keyword: pandas 2.0 removed the positional form
    # `df.drop('Date', 1)`.
    value_columns = df.drop('Date', axis=1).columns.tolist()
    # Work on a copy so re-running the calling cell always starts from raw data.
    cleaned = df.copy()
    for column in value_columns:
        cleaned[column] = pd.to_numeric(cleaned[column], errors='coerce')
    return cleaned

# Coerce the raw weather readings to numeric values.
weather = clean_weather_data(df_weather)
# Split the readings by station and join them on 'Date', giving one row per
# date with both stations' measurements side by side. `axis` is passed by
# keyword because pandas 2.0 removed the positional form `drop(label, 1)`.
station_1 = weather.loc[weather['Station'] == 1].drop('Station', axis=1)
station_2 = weather.loc[weather['Station'] == 2].drop('Station', axis=1)
# Any column that still contains a NaN after the join is discarded entirely.
weather = pd.merge(station_1, station_2, on='Date').dropna(axis=1)

## Scale numerical weather data
numerical_weather_feats = weather.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
# Whole-column assignment lets integer columns take the float result.
weather[numerical_weather_feats] = scaler.fit_transform(weather[numerical_weather_feats])

## Join weather data to main data
# 'NumMosquitos' is absent from the test data, so it is dropped from train.
train = pd.merge(df_train, weather, on='Date').drop('NumMosquitos', axis=1)
# Merging on a column does not keep df_test's 'Id' index in the result.
test = pd.merge(df_test, weather, on='Date')

## Final feature engineering
def extract_date_features(df):
    """Replace the 'Date' column with integer calendar features.

    Adds 'dayofweek', 'dayofyear', 'month' and 'year' (in that order, after
    the existing columns) computed from ``df.Date``, then removes 'Date'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed 'Date' column.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'Date' and with the four derived columns.
        ``df`` itself is left unmodified.

    Raises
    ------
    AttributeError
        If ``df`` has no 'Date' column, or the column is not datetime-like.
    """
    dates = df.Date.dt
    with_features = df.assign(
        dayofweek=dates.dayofweek,
        dayofyear=dates.dayofyear,
        month=dates.month,
        year=dates.year,
    )
    # `axis` is passed by keyword: pandas 2.0 removed the positional form
    # `df.drop('Date', 1)`.
    return with_features.drop('Date', axis=1)

# Swap 'Date' for its calendar components in both modelling frames.
train, test = extract_date_features(train), extract_date_features(test)

In [6]:
## Split into X and y
# `.values` is used in place of `DataFrame.as_matrix()`, which was removed in
# pandas 1.0; `axis` is passed by keyword because pandas 2.0 removed the
# positional form `drop(label, 1)`.
X_train = train.drop('WnvPresent', axis=1).values
X_test = test.values
y_train = np.ravel(train['WnvPresent'])

# Sanity check: X_train and X_test must have the same number of columns.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

(10506, 31)
(116293, 31)
(10506,)


### Model Development

In [7]:
# Initialize model
# Dummy
# Feed-forward binary classifier: two ReLU hidden layers (32 and 16 units),
# each followed by 20% dropout, and one sigmoid output unit.
# NOTE(review): `init=` and `nb_epoch=` are Keras 1 argument names (Keras 2
# renamed them `kernel_initializer=` / `epochs=`) - confirm the installed
# Keras version before changing them.
model = Sequential()
# The input width is read from the data instead of being hard-coded to 31, so
# the model keeps working if the feature set changes.
model.add(Dense(32, input_dim=X_train.shape[1], init='uniform', activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, init='uniform', activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, init='uniform', activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam')

model.fit(X_train, y_train, batch_size=1000, nb_epoch=100, verbose=0)

<keras.callbacks.History at 0x117d62f90>

In [8]:
# Predictions
# `Sequential.predict_proba` was removed from newer Keras / TensorFlow
# releases. With a single sigmoid output unit, `predict` returns the same
# per-row probabilities and is available in old and new versions.
y_pred_proba = model.predict(X_test, verbose=0)
# NOTE(review): pairing df_test.index with the predictions assumes the merge
# with the weather data kept every test row in its original order - confirm.
data = {'Id':df_test.index, 'WnvPresent':y_pred_proba[:,0]}
df_nn = pd.DataFrame(data)
# df_nn.to_csv('/Users/dominicdebiaso/Desktop/kaggle_west_nile_virus_neural_net.csv', index=False)



In [None]:
# Dummy coding of the target variable is not needed because the desired prediction is a binary response.
# Dummifying it would create perfect multicollinearity.