In [2]:
import numpy as np
import pandas as pd

# Training Data

In [3]:
train = pd.read_csv('assets/train.csv')

In [4]:
train["Date"] = pd.to_datetime(train["Date"], infer_datetime_format=True)
train['Address'] = train['Address'].astype('category')
train['Species'] = train['Species'].astype('category')
train['Street'] = train['Street'].astype('category')
train['Trap'] = train['Trap'].astype('category')
train['AddressNumberAndStreet'] = train['AddressNumberAndStreet'].astype('category')

# Weather Data

In [10]:
weather = pd.read_csv('nmo-distilled-data/weather-nmo.csv', index_col=0)

In [11]:
# We need to create one line per date:
station1 = weather[weather['Station']==1]
station2 = weather[weather['Station']==2]
station1 = station1.drop('Station', axis=1)
station2 = station2.drop('Station', axis=1)

In [12]:
station1.columns = ['Date', 'st1_Tmax', 'st1_Tmin', 'st1_Tavg', 'st1_DewPoint', 'st1_WetBulb', 'st1_CodeSum',
       'st1_SnowFall', 'st1_PrecipTotal', 'st1_StnPressure', 'st1_SeaLevel', 'st1_ResultSpeed',
       'st1_ResultDir', 'st1_AvgSpeed', 'st1_Lat', 'st1_Long']
station2.columns = ['Date', 'st2_Tmax', 'st2_Tmin', 'st2_Tavg', 'st2_DewPoint', 'st2_WetBulb', 'st2_CodeSum',
       'st2_SnowFall', 'st2_PrecipTotal', 'st2_StnPressure', 'st2_SeaLevel', 'st2_ResultSpeed',
       'st2_ResultDir', 'st2_AvgSpeed', 'st2_Lat', 'st2_Long']

In [13]:
weather = pd.merge(station1, station2, on='Date')
weather["Date"] = pd.to_datetime(weather["Date"], infer_datetime_format=True)

In [14]:
# Feature engineer us some over time weather data
weather = weather.set_index('Date')

In [15]:
weather['precip_avg'] = (weather['st1_PrecipTotal'] + weather['st2_PrecipTotal'])/2
weather['2wk_precip'] = weather['precip_avg'].rolling(14, min_periods=1).sum()
weather['4wk_precip'] = weather['precip_avg'].rolling(28, min_periods=1).sum()
weather['90day_precip'] = weather['precip_avg'].rolling(90, min_periods=1).sum()

In [16]:
weather['temp_avg'] = (weather['st1_Tavg'] + weather['st2_Tavg'])/2
weather['2wk_tavg'] = weather['temp_avg'].rolling(14, min_periods=1).mean()
weather['4wk_tavg'] = weather['temp_avg'].rolling(28, min_periods=1).mean()
weather['90day_tavg'] = weather['temp_avg'].rolling(90, min_periods=1).mean()

In [17]:
weather['tempmin_avg'] = (weather['st1_Tmin'] + weather['st2_Tmin'])/2
weather['2wk_mintemp'] = weather['tempmin_avg'].rolling(14, min_periods=1).min()
weather['4wk_mintemp'] = weather['tempmin_avg'].rolling(28, min_periods=1).min()

In [18]:
weather['dew_avg'] = (weather['st1_DewPoint'] + weather['st2_DewPoint'])/2
weather['2wk_dew'] = weather['dew_avg'].rolling(14, min_periods=1).mean()
weather['4wk_dew'] = weather['dew_avg'].rolling(28, min_periods=1).mean()

In [19]:
weather = weather.reset_index()
train = pd.merge(train, weather, how='left', on='Date')

# Categories

In [20]:
final_df = pd.get_dummies(train, columns=['Species'])

# Time

In [21]:
final_df['Month'] = final_df['Date'].dt.month
final_df["Day"] = final_df['Date'].dt.dayofyear

# Location Info

In [22]:
#our two origins (the locations with the most WNV activity) are Chicago O'Hare and Doty Ave.
#the following values are their latitudes and longitudes
ohare_lon = -87.890615
ohare_lat = 41.974689
doty_lon =-87.599862
doty_lat=41.673408

In [23]:
lat = train.Latitude
lon = train.Longitude

In [24]:
#haversine takes two lat and longs and creates a distance, from the mean, in miles
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    mi = 3956   * c #Radius of earth in miles. Use 6367 for kilometers
    return mi, dlon, dlat

In [25]:
#apply haversine function to training dataset, creating a column called 'dist_from_ohare_MI'
final_df['dist_from_ohare_MI'] = [haversine(y, x, ohare_lon, ohare_lat)[0] for y, x in zip(lon, lat)]
#apply haversine function to training dataset, creating a column called 'dist_from_doty_MI'
final_df['dist_from_doty_MI'] = [haversine(y, x, doty_lon, doty_lat)[0] for y, x in zip(lon, lat)]

In [26]:
test_features = final_df[['Latitude', 'Longitude', 'st1_Tmax', 'st1_Tmin', 'st1_Tavg', 'st1_DewPoint', 'st1_WetBulb', 'st1_SnowFall', 'st1_PrecipTotal', 'st1_StnPressure', 'st1_SeaLevel', 'st1_ResultSpeed', 'st1_ResultDir', 'st1_AvgSpeed', 'st2_Tmax', 'st2_Tmin', 'st2_Tavg', 'st2_DewPoint', 'st2_WetBulb', 'st2_SnowFall', 'st2_PrecipTotal', 'st2_StnPressure', 'st2_SeaLevel', 'st2_ResultSpeed', 'st2_ResultDir', 'st2_AvgSpeed', 'precip_avg', '2wk_precip', '4wk_precip', '90day_precip', 'temp_avg', '2wk_tavg', '4wk_tavg', '90day_tavg', 'tempmin_avg', '2wk_mintemp', '4wk_mintemp', 'dew_avg', '2wk_dew', '4wk_dew', 'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS', 'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS', 'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS', 'Species_CULEX TERRITANS', 'Month', 'Day', 'dist_from_ohare_MI', 'dist_from_doty_MI']]
target = final_df.WnvPresent

# Scale stuff

In [27]:
from sklearn.preprocessing import StandardScaler

In [28]:
scale = StandardScaler()

In [29]:
test_features = pd.DataFrame(scale.fit_transform(test_features), columns=test_features.columns)

# Model Time

In [30]:
from sklearn.model_selection import train_test_split, cross_val_score

In [31]:
X_train, X_test, y_train, y_test = train_test_split(test_features, target, test_size=0.3, random_state=42)

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, roc_auc_score

def eval_sklearn_model(y_true, predictions, model=None, X=None):
    """This function takes the true values for y and the predictions made by the model and prints out the confusion matrix along with Accuracy, Precision, and, if model and X provided, Roc_Auc Scores."""
    cnf_matrix = confusion_matrix(y_true, predictions)

    print('True Negative: ', cnf_matrix[0, 0], '| False Positive: ', cnf_matrix[0, 1])
    print('False Negative: ', cnf_matrix[1, 0], '| True Positive: ', cnf_matrix[1, 1], '\n')

    sensitivity = cnf_matrix[1, 1]/ (cnf_matrix[1, 0] + cnf_matrix[1, 1])
    specificity = cnf_matrix[0, 0]/ (cnf_matrix[0, 1] + cnf_matrix[0, 0])

    print('Sensitivity (TP/ TP + FN): ', sensitivity)
    print('Specificity (TN/ TN + FP): ', specificity, '\n')

    print('Accuracy: ', accuracy_score(y_true, predictions, normalize=True))
    print('Precision: ', precision_score(y_true, predictions))
    if model != None:
        print('Roc-Auc: ', roc_auc_score(y_true, [x[1] for x in model.predict_proba(X)]))
    else:
        pass
    print('\n')

In [33]:
from xgboost import XGBClassifier

ImportError: No module named 'xgboost'

In [34]:
xgb = XGBClassifier(scale_pos_weight=(6969/385), objective='binary:logistic')
# make sure to pick the correct objective for the problem
# scale_pos_weight is supposed to help with unbalanced classes; it recommended number of negative cases divided by positive
xgb.fit(X_train, y_train)

NameError: name 'XGBClassifier' is not defined

In [35]:
test_predictions = xgb.predict(X_test)
eval_sklearn_model(y_test, test_predictions, model=xgb, X=X_test)

NameError: name 'xgb' is not defined

# Set up test data and export

In [36]:
test = pd.read_csv('assets/test.csv')

In [37]:
test["Date"] = pd.to_datetime(test["Date"], infer_datetime_format=True)

In [38]:
test["Date"] = pd.to_datetime(test["Date"], infer_datetime_format=True)
test['Address'] = test['Address'].astype('category')
test['Species'] = test['Species'].astype('category')
test['Street'] = test['Street'].astype('category')
test['Trap'] = test['Trap'].astype('category')
test['AddressNumberAndStreet'] = test['AddressNumberAndStreet'].astype('category')

In [39]:
test = pd.merge(test, weather, how='left', on='Date')

In [40]:
test = pd.get_dummies(test, columns=['Species'])

In [41]:
test['Month'] = test['Date'].dt.month
test["Day"] = test['Date'].dt.dayofyear

In [42]:
lat = test.Latitude
lon = test.Longitude

In [43]:
#apply haversine function to training dataset, creating a column called 'dist_from_ohare_MI'
test['dist_from_ohare_MI'] = [haversine(y, x, ohare_lon, ohare_lat)[0] for y, x in zip(lon, lat)]
#apply haversine function to training dataset, creating a column called 'dist_from_doty_MI'
test['dist_from_doty_MI'] = [haversine(y, x, doty_lon, doty_lat)[0] for y, x in zip(lon, lat)]

In [44]:
# Make match above
features = test[['Latitude', 'Longitude', 'st1_Tmax', 'st1_Tmin', 'st1_Tavg', 'st1_DewPoint', 'st1_WetBulb', 'st1_SnowFall', 'st1_PrecipTotal', 'st1_StnPressure', 'st1_SeaLevel', 'st1_ResultSpeed', 'st1_ResultDir', 'st1_AvgSpeed', 'st2_Tmax', 'st2_Tmin', 'st2_Tavg', 'st2_DewPoint', 'st2_WetBulb', 'st2_SnowFall', 'st2_PrecipTotal', 'st2_StnPressure', 'st2_SeaLevel', 'st2_ResultSpeed', 'st2_ResultDir', 'st2_AvgSpeed', 'precip_avg', '2wk_precip', '4wk_precip', '90day_precip', 'temp_avg', '2wk_tavg', '4wk_tavg', '90day_tavg', 'tempmin_avg', '2wk_mintemp', '4wk_mintemp', 'dew_avg', '2wk_dew', '4wk_dew', 'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS', 'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS', 'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS', 'Species_CULEX TERRITANS', 'Month', 'Day', 'dist_from_ohare_MI', 'dist_from_doty_MI']]

In [45]:
pred_features = pd.DataFrame(scale.fit_transform(features), columns=features.columns)

In [46]:
# Whatever model you decided on:
predictions = xgb.predict(pred_features)

NameError: name 'xgb' is not defined

In [47]:
submission = pd.DataFrame(columns=['Id', 'WnvPresent'], data=list(zip(test.Id, predictions)))
submission = submission.set_index('Id')
submission.to_csv('submission.csv',)

NameError: name 'predictions' is not defined

# NN?

In [49]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils

Using Theano backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [50]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(7354, 51)
(7354,)
(3152, 51)
(3152,)


In [51]:
X_train = X_train.as_matrix()
X_test = X_test.as_matrix()

In [52]:
model = Sequential()
model.add(Dense(2048, input_shape=(51,)))
model.add(Activation('relu'))
                           
model.add(Dropout(0.2))   # Dropout helps protect the model from memorizing or "overfitting" the training data
model.add(Dense(1024))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [53]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', ])

In [54]:
model.fit(X_train, y_train, epochs=20,
          verbose=1, validation_data=(X_test, y_test), class_weight={0:1, 1:6})

Train on 7354 samples, validate on 3152 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x12c113ef0>

In [55]:
score = model.evaluate(X_test, y_test, verbose=1)

print('Test score:', score)
print('Test metric:', model.metrics_names)

Test score: [0.25433369033832842, 0.90038071065989844]
Test metric: ['loss', 'acc']


In [56]:
# The predict_classes function outputs the highest probability class
# according to the trained classifier for each input example.
predicted_classes = model.predict_classes(X_test)



In [57]:
# Check which items we got right / wrong
correct_indices = np.nonzero(predicted_classes.reshape(3152,) == np.array(y_test))[0]
incorrect_indices = np.nonzero(predicted_classes.reshape(3152,) != np.array(y_test))[0]

In [58]:
true_pos = ((predicted_classes.reshape(3152,) == np.array(y_test)) & (np.array(y_test) == 1)).sum()
true_neg = ((predicted_classes.reshape(3152,) == np.array(y_test)) & (np.array(y_test) == 0)).sum()

In [59]:
false_pos = ((predicted_classes.reshape(3152,) != np.array(y_test)) & (np.array(y_test) == 1)).sum()
false_neg = ((predicted_classes.reshape(3152,) != np.array(y_test)) & (np.array(y_test) == 0)).sum()

In [60]:
print('True Negative: ', true_neg, '| False Positive: ', false_pos)
print('False Negative: ', false_neg, '| True Positive: ', true_pos, '\n')
sensitivity = true_pos/ (true_pos + false_neg)
specificity = true_neg/ (true_neg + false_pos)
print('Sensitivity (TP/ TP + FN): ', sensitivity)
print('Specificity (TN/ TN + FP): ', specificity)

True Negative:  2767 | False Positive:  95
False Negative:  219 | True Positive:  71 

Sensitivity (TP/ TP + FN):  0.244827586207
Specificity (TN/ TN + FP):  0.966806429071


# Train on full data

In [61]:
X = test_features.as_matrix()
y = target

In [62]:
model.fit(X, y, epochs=30,
          verbose=1, class_weight={0:1, 1:8})

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x12f476eb8>

In [63]:
X_pred = pred_features.as_matrix()

In [64]:
predictions = model.predict_classes(X_pred)



In [65]:
submission = pd.DataFrame(columns=['Id', 'WnvPresent'], data=list(zip(test.Id, predictions)))
submission = submission.set_index('Id')
submission.to_csv('submission.csv',)

In [66]:
submission.iloc[0]['WnvPresent'][0]

0

In [67]:
submission['WnvPresent'] = submission['WnvPresent'].apply(lambda x: x[0])

In [68]:
submission['WnvPresent'].value_counts()

0    107160
1      9133
Name: WnvPresent, dtype: int64

In [69]:
submission.to_csv('submission.csv',)

In [70]:
# Kaggle score of 0.65394