In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing

In [2]:
# Load dataset 
train = pd.read_csv('../assets/train.csv')
test = pd.read_csv('../assets/test.csv')
sample = pd.read_csv('../assets/sampleSubmission.csv')
weather = pd.read_csv('../assets/weather.csv')

In [3]:
# Get labels
labels_entire = train.WnvPresent.values

In [4]:
# Not using codesum for this benchmark
weather = weather.drop('CodeSum', axis=1)

In [5]:
# Split station 1 and 2 and join horizontally
weather_stn1 = weather[weather['Station']==1]
weather_stn2 = weather[weather['Station']==2]
weather_stn1 = weather_stn1.drop('Station', axis=1)
weather_stn2 = weather_stn2.drop('Station', axis=1)
weather = weather_stn1.merge(weather_stn2, on='Date')

In [6]:
# replace some missing values and T with -1
weather = weather.replace('M', -1)
weather = weather.replace('-', -1)
weather = weather.replace('T', -1)
weather = weather.replace(' T', -1)
weather = weather.replace('  T', -1)

In [7]:
# Functions to extract month and day from dataset
# You can also use parse_dates of Pandas.
def create_month(x):
    return x.split('-')[1]

def create_day(x):
    return x.split('-')[2]

def create_year(x):
    return x.split('-')[0]

In [8]:
train['month'] = train.Date.apply(create_month)
train['day'] = train.Date.apply(create_day)
train['year'] = train.Date.apply(create_year)
test['month'] = test.Date.apply(create_month)
test['day'] = test.Date.apply(create_day)


# Add integer latitude/longitude columns
train['Lat_int'] = train.Latitude.apply(int)
train['Long_int'] = train.Longitude.apply(int)
test['Lat_int'] = test.Latitude.apply(int)
test['Long_int'] = test.Longitude.apply(int)

In [9]:
# Merge with weather data
train = train.merge(weather, on='Date')
test = test.merge(weather, on='Date')

train = train.drop(['Date'], axis = 1)
test = test.drop(['Date'], axis = 1)


In [10]:
# Convert categorical data to numbers
lbl = preprocessing.LabelEncoder()
lbl.fit(list(train['Species'].values) + list(test['Species'].values))
train['Species'] = lbl.transform(train['Species'].values)
test['Species'] = lbl.transform(test['Species'].values)

In [11]:
lbl.fit(list(train['Street'].values) + list(test['Street'].values))
train['Street'] = lbl.transform(train['Street'].values)
test['Street'] = lbl.transform(test['Street'].values)

In [12]:
lbl.fit(list(train['Trap'].values) + list(test['Trap'].values))
train['Trap'] = lbl.transform(train['Trap'].values)
test['Trap'] = lbl.transform(test['Trap'].values)

In [13]:
(train != -1).any(axis=0).head()

Address    True
Species    True
Block      True
Street     True
Trap       True
dtype: bool

In [14]:
# drop columns with -1s
train = train.loc[:,(train != -1).any(axis=0)]
test = test.loc[:,(test != -1).any(axis=0)]

In [16]:
train[train==-1].sum()

Address                      0.0
Species                      0.0
Block                        0.0
Street                       0.0
Trap                         0.0
AddressNumberAndStreet       0.0
Latitude                     0.0
Longitude                    0.0
AddressAccuracy              0.0
NumMosquitos                 0.0
WnvPresent                   0.0
month                        0.0
day                          0.0
year                         0.0
Lat_int                      0.0
Long_int                     0.0
Tmax_x                       0.0
Tmin_x                       0.0
Tavg_x                       0.0
Depart_x                     0.0
DewPoint_x                   0.0
WetBulb_x                  -93.0
Heat_x                       0.0
Cool_x                       0.0
Sunrise_x                    0.0
Sunset_x                     0.0
Depth_x                      0.0
SnowFall_x                -178.0
PrecipTotal_x            -1176.0
StnPressure_x              -93.0
SeaLevel_x

In [17]:
#test train split the train off of date, multiple years' input as train and test is the last year (2013?)
#How can I test the values I get back? the labels!
mask = train['year']=='2013'
X_test = train[mask]
X_train = train[~mask]


In [18]:
X_test['year'].unique()

array(['2013'], dtype=object)

In [19]:
X_train['year'].unique()

array(['2007', '2009', '2011'], dtype=object)

In [20]:
labels_train = X_train.WnvPresent.values
labels_test = X_test.WnvPresent.values

In [21]:
# drop address columns
train = train.drop(['Address', 'AddressNumberAndStreet','WnvPresent', 'NumMosquitos', 'year'], axis = 1)
X_train = X_train.drop(['Address', 'AddressNumberAndStreet','WnvPresent', 'NumMosquitos', 'year'], axis = 1)
X_test = X_test.drop(['Address', 'AddressNumberAndStreet','WnvPresent', 'NumMosquitos', 'year'], axis = 1)

test = test.drop(['Id', 'Address', 'AddressNumberAndStreet'], axis = 1)

In [22]:
# Random Forest Classifier 
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000, min_samples_split=2)
clf.fit(X_train, labels_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [23]:
clf.score(X_test, labels_test)

0.9000836120401338

In [24]:
# These two cells are leftover from the guys code

# Random Forest Classifier 
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000, min_samples_split=2)
clf.fit(train, labels_entire)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
# create predictions and submission file
predictions = clf.predict_proba(test)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('../assets/submission_1.csv', index=False)