In [26]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xbg
from sklearn.linear_model import LogisticRegressionCV

from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import time


In [27]:

def testing_grounds(X_train2, y_train2, X_test2, y_test2, model_options, model_params):
    
    """
    Function that takes in multiple parawwmeters for cleaning type, vectorizer, model, and hyperparameter tuning and tests all possible combinations.
    
   
    
    :param model_options: a dictionary of models to test.
    :Example: 
    {'logistic_regression':LogisticRegression(), 'multinomial_nb':MultinomialNB()}
    
    :param model_params: a dictionary of model parameters to test.
    :Example: {'model_name':{ 'param_name':[param options] }}
    
    
    :return: dictionary 2 levels deep with all passed options as keys and the best parameters and scores for each combination.
    :Example: best_runs[model_options]['score','params']
    """
    num_grids = len(model_options)
    print("Total Number of Gridsearches:", num_grids)
    index=1
    best_runs = defaultdict(dict)
    # [stem/lem/none] [logistic/multinomial] [count/tfidf] [score/params]



   # X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state = 42)


    #testing the models
    for model_k, model_v in model_options.items():


        parameters = {}

        for params_k, params_v in model_params.items():
            if params_k == model_k:
                for param_k, param_v in params_v.items():
                    parameters['model'] = param_v #model__'+param_k


        pipeline = Pipeline([
            ('model', model_v)
        ])

        grid_search = GridSearchCV(pipeline, parameters, verbose=1, n_jobs=3)

        print(f"Performing grid search #{index} of {num_grids}...")
        index+=1
        print(f"Pipeline: {model_k}\n")
        print("Parameters:")
        display(parameters)
        t0 = time.time()
        grid_search.fit(X_train2, y_train2)
        print("Done in %0.3fs" % (time.time() - t0))
        print()

        print("Best train score: %0.3f" % grid_search.best_score_)
        print("Best test score: %0.3f" % grid_search.score(X_test2, y_test2))
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        best_runs[model_k]['score_train'] = grid_search.best_score_
        best_runs[model_k]['params'] = best_parameters
        best_runs[model_k]['model'] = grid_search.best_estimator_
        best_runs[model_k]['score_test'] = grid_search.score(X_test2, y_test2)
#         for param_name in sorted(parameters.keys()):
#             print("\t%s: %r" % (param_name, best_parameters[param_name]))
#             best_runs[model_k]['params'] = best_parameters
            
        print('\n\n')
    
    return best_runs

In [39]:
import sys
def best_model(best_runs):
    best_train_score = -sys.maxsize + 10
    best_test_score = -sys.maxsize + 10
    best_train_model = ''
    best_test_model = ''
    best_params = None
    best_model = None
    for model_k, model_v in best_runs.items():

        if model_v['score_train'] > best_train_score:
            best_train_score = model_v['score_train']
            best_train_model = model_k
            
        if model_v['score_test'] > best_test_score:
            best_test_score = model_v['score_test']
            best_test_model = model_k

    print(f'Best Train Score:  {best_train_score} from model {best_train_model}')
    print(f'Best Test Score:  {best_test_score} from model {best_test_model}')
    print('Best Model:      ')
    display(best_model)
    print()
    print('Best Params:     ')
    display(best_params)
    print()

In [29]:
# Load dataset 
train = pd.read_csv('../assets/train_cleaned.csv')
test = pd.read_csv('../assets/test_cleaned.csv')
sample = pd.read_csv('../assets/sampleSubmission.csv')


In [30]:
#test train split the train off of date, multiple years' input as train and test is the last year (2013?)
#How can I test the values I get back? the labels!
mask = train['year']==2013
X_test = train[mask]
X_train = train[~mask]


In [31]:
X_test['year'].unique()

array([2013])

In [32]:
X_train['year'].unique()

array([2007, 2009, 2011])

In [33]:
# Get Labels
labels_entire = train.WnvPresent.values
labels_train = X_train.WnvPresent.values
labels_test = X_test.WnvPresent.values

In [34]:
# drop address columns, wnv present, num mosquitos, and year
train = train.drop(['WnvPresent', 'NumMosquitos', 'year'], axis = 1)
X_train = X_train.drop(['WnvPresent', 'NumMosquitos', 'year'], axis = 1)
X_test = X_test.drop(['WnvPresent', 'NumMosquitos', 'year'], axis = 1)


train = train.drop(['Address', 'AddressNumberAndStreet'], axis = 1)
X_train = X_train.drop(['Address', 'AddressNumberAndStreet'], axis = 1)
X_test = X_test.drop(['Address', 'AddressNumberAndStreet'], axis = 1)

test = test.drop([ 'Address', 'AddressNumberAndStreet'], axis = 1)

## Initial Model Test - Random Forest Classifier

In [35]:
# Try a Random Forest Classifier 
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000, min_samples_split=2)
clf.fit(X_train, labels_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [36]:
clf.score(X_test, labels_test)

0.9000836120401338

In [37]:
# testing a bunch of models in the background while I try to feature engineer. No real hyper-parameter tuning
n_est = [100, 500, 1000, 1500]


model_options = {
    'logistic_regression':LogisticRegressionCV(random_state=42, solver='sag', max_iter = 10000),
    
    #The regressors did not do as well as the classifiers
    
    'linreg':LinearRegression(),
    'xgboost':xbg.XGBRegressor(),
    'random_forest_regressor':RandomForestRegressor(n_estimators=n_est),
    'extra_trees_regressor':ExtraTreesRegressor(n_estimators=n_est),
    'ada_boost_regressor':AdaBoostRegressor(n_estimators=n_est),
    'gradient_regressor':GradientBoostingRegressor(n_estimators=n_est),
   
    'random_forest_classifier':RandomForestClassifier(n_estimators=n_est),
    'extra_trees_classifier':ExtraTreesClassifier(n_estimators=n_est),
    'ada_boost_classifier':AdaBoostClassifier(n_estimators=n_est),
    'gradient_classifier':GradientBoostingClassifier()
    
}
model_params = {
    'linreg':{
        
    },
    'xgboost':{
        
    },
    'random_forest_regressor':{
        
    },
    'extra_trees_regressor':{
        
    },
    'ada_boost_regressor':{
        
    },
    'gradient_regressor':{
        
    },
   
    'random_forest_classifier':{
        
    },
    'extra_trees_classifier':{
        
    },
    'ada_boost_classifier':{
        
    },
    'gradient_classifier':{
        
    }
}

best_runs = testing_grounds(X_train, labels_train, X_test, labels_test, model_options, model_params)

Total Number of Gridsearches: 5
Performing grid search #1 of 5...
Pipeline: logistic_regression

Parameters:


{}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:   37.3s finished


Done in 86.688s

Best train score: 0.897
Best test score: 0.900
Best parameters set:



Performing grid search #2 of 5...
Pipeline: random_forest_classifier

Parameters:


{}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    4.3s finished


Done in 8.959s

Best train score: 0.607
Best test score: 0.900
Best parameters set:



Performing grid search #3 of 5...
Pipeline: extra_trees_classifier

Parameters:


{}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    3.2s finished


Done in 6.701s

Best train score: 0.728
Best test score: 0.900
Best parameters set:



Performing grid search #4 of 5...
Pipeline: ada_boost_classifier

Parameters:


{}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    4.5s finished


Done in 9.780s

Best train score: 0.586
Best test score: 0.900
Best parameters set:



Performing grid search #5 of 5...
Pipeline: gradient_classifier

Parameters:


{}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.5s finished


Done in 1.150s

Best train score: 0.487
Best test score: 0.900
Best parameters set:





In [40]:
for k, v in best_runs.items():
    print(f'Model: {k}')
    print(f'    Train Score: {v["score_train"]}')
    print(f'    Test Score: {v["score_test"]}')
    
print('\n\n\n\n\n')
best_model(best_runs)

Model: logistic_regression
    Train Score: 0.8969682031057432
    Test Score: 0.9000836120401338
Model: random_forest_classifier
    Train Score: 0.6070988415085038
    Test Score: 0.9000836120401338
Model: extra_trees_classifier
    Train Score: 0.7283707172787774
    Test Score: 0.9000836120401338
Model: ada_boost_classifier
    Train Score: 0.5860241557801331
    Test Score: 0.9000836120401338
Model: gradient_classifier
    Train Score: 0.4868129159477446
    Test Score: 0.9000836120401338






Best Train Score:  0.8969682031057432 from model logistic_regression
Best Test Score:  0.9000836120401338 from model logistic_regression
Best Model:      


None


Best Params:     


None




In [41]:
# Highest scoring model in the function
logreg = LogisticRegressionCV(random_state=42, solver='sag', max_iter = 10000)
logreg.fit(X_train, labels_train)
logreg.score(X_test, labels_test)


0.9000836120401338

In [42]:
logrg2 = best_runs['logistic_regression']['model']
logrg2.fit(X_train, labels_train)
logrg2.score(X_test, labels_test)

0.9000836120401338

In [43]:
logrg2.score(X_train, labels_train)

0.9615479418289377

In [None]:
# These two cells are for the test submission to kaggle

# Random Forest Classifier 
clf_final = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000, min_samples_split=2)
clf_final.fit(train, labels_entire)

In [None]:
# Want to try submitting all of the models' predictions?
for model_k, model_v in best_runs.items():
    predictions = model_v['model'].predict_proba(test.drop(columns='Id'))[:,1]
    sample['WnvPresent'] = predictions
    sample.to_csv('../assets/submission_all_'+model_k'.csv', index=False)

predictions = logrg2.predict_proba(test.drop(columns='Id'))[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('../assets/submission_logrg2.csv', index=False)
    
predictions = clf_final.predict_proba(test.drop(columns='Id'))[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('../assets/submission_clf_final.csv', index=False)

In [None]:
X_train.shape[1], test.shape[1]

In [None]:
# create predictions and submission file
predictions = clf_final.predict_proba(test.drop(columns='Id'))[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('../assets/submission_4_randomforest.csv', index=False)

In [None]:
result_df = test.merge(sample, on='Id')

In [None]:
result_df.head(10)

In [None]:
minimum = result_df['WnvPresent'].min()
maximum = result_df['WnvPresent'].max()

In [None]:
midpoint = ((maximum - minimum)/2) + minimum
new_results = result_df[result_df['WnvPresent']>=midpoint]

In [None]:
new_results=new_results.sort_values('WnvPresent', ascending=False)
new_results.head()

In [None]:
#Visualization of ALL the areas where we predicted we should spray
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
#%matplotlib inline

from sklearn.neighbors import KernelDensity

mapdata = np.loadtxt("../assets/mapdata_copyright_openstreetmap_contributors.txt")
traps = result_df[['Trap','Longitude', 'Latitude', 'WnvPresent']]

# these are needed for plotting densities over map image,
# it changes alpha channel?

# see,
# Meaning of the colormap._lut list in matplotlib.color
# http://stackoverflow.com/questions/18035411/meaning-of-the-colormap-lut-list-in-matplotlib-color

alpha_cm = plt.cm.Reds
alpha_cm._init()
alpha_cm._lut[:-3,-1] = abs(np.logspace(0, 1, alpha_cm.N) / 10 - 1)[::-1]


aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
lon_lat_box = (-88, -87.5, 41.6, 42.1) # xmin, xmax, ymin, ymax

sigthings = traps[traps['WnvPresent'] > 0]
sigthings = sigthings.groupby(['Trap','Longitude', 'Latitude']).max()['WnvPresent'].reset_index()
X = sigthings[['Longitude', 'Latitude']].values
kd = KernelDensity(bandwidth=0.02)
kd.fit(X)

xv,yv = np.meshgrid(np.linspace(-88, -87.5, 100), np.linspace(41.6, 42.1, 100))
gridpoints = np.array([xv.ravel(),yv.ravel()]).T
zv = np.exp(kd.score_samples(gridpoints).reshape(100,100))
plt.figure(figsize=(10,14))
plt.imshow(mapdata, 
           cmap=plt.get_cmap('gray'), 
           extent=lon_lat_box, 
           aspect=aspect)
plt.imshow(zv, 
           origin='lower', 
           cmap=alpha_cm, 
           extent=lon_lat_box, 
           aspect=aspect)

# -> how to use 'extent' in matplotlib.pyplot.imshow
# http://stackoverflow.com/questions/6999621/how-to-use-extent-in-matplotlib-pyplot-imshow

locations = traps[['Longitude', 'Latitude']].drop_duplicates().values
plt.scatter(locations[:,0], locations[:,1], marker='x')

plt.savefig('total_predictions_heatmap.png')

In [None]:
#Visualization of the TOP 50% of areas where we predicted we should spray
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
#%matplotlib inline

from sklearn.neighbors import KernelDensity

mapdata = np.loadtxt("../assets/mapdata_copyright_openstreetmap_contributors.txt")
traps = new_results[['Trap','Longitude', 'Latitude', 'WnvPresent']]

# these are needed for plotting densities over map image,
# it changes alpha channel?

# see,
# Meaning of the colormap._lut list in matplotlib.color
# http://stackoverflow.com/questions/18035411/meaning-of-the-colormap-lut-list-in-matplotlib-color

alpha_cm = plt.cm.Reds
alpha_cm._init()
alpha_cm._lut[:-3,-1] = abs(np.logspace(0, 1, alpha_cm.N) / 10 - 1)[::-1]


aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
lon_lat_box = (-88, -87.5, 41.6, 42.1) # xmin, xmax, ymin, ymax

sigthings = traps[traps['WnvPresent'] > 0]
sigthings = sigthings.groupby(['Trap','Longitude', 'Latitude']).max()['WnvPresent'].reset_index()
X = sigthings[['Longitude', 'Latitude']].values
kd = KernelDensity(bandwidth=0.02)
kd.fit(X)

xv,yv = np.meshgrid(np.linspace(-88, -87.5, 100), np.linspace(41.6, 42.1, 100))
gridpoints = np.array([xv.ravel(),yv.ravel()]).T
zv = np.exp(kd.score_samples(gridpoints).reshape(100,100))
plt.figure(figsize=(10,14))
plt.imshow(mapdata, 
           cmap=plt.get_cmap('gray'), 
           extent=lon_lat_box, 
           aspect=aspect)
plt.imshow(zv, 
           origin='lower', 
           cmap=alpha_cm, 
           extent=lon_lat_box, 
           aspect=aspect)

# -> how to use 'extent' in matplotlib.pyplot.imshow
# http://stackoverflow.com/questions/6999621/how-to-use-extent-in-matplotlib-pyplot-imshow

locations = traps[['Longitude', 'Latitude']].drop_duplicates().values
plt.scatter(locations[:,0], locations[:,1], marker='x')

plt.savefig('top_50_percent_predictions_heatmap.png')