In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import math
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('/Users/TerryONeill/west_nile/west_nile/assets/train_weather_join.csv')

In [5]:
## here are just a few ways to familiarize ourselves with the dataframe

print 'Shape of Dataframe:\n', df.shape
print
print 'Datatypes of Columns:\n', df.dtypes
print
print 'Null Values for Each Column:\n', df.isnull().sum()

Shape of Dataframe:
(10506, 19)

Datatypes of Columns:
Date                object
Species              int64
Trap                 int64
Latitude           float64
Longitude          float64
AddressAccuracy      int64
NumMosquitos         int64
WnvPresent           int64
day                  int64
month                int64
year                 int64
Tmax               float64
Tmin               float64
Tavg               float64
Depart             float64
DewPoint           float64
PrecipTotal        float64
Sunrise            float64
Sunset             float64
dtype: object

Null Values for Each Column:
Date               0
Species            0
Trap               0
Latitude           0
Longitude          0
AddressAccuracy    0
NumMosquitos       0
WnvPresent         0
day                0
month              0
year               0
Tmax               0
Tmin               0
Tavg               0
Depart             0
DewPoint           0
PrecipTotal        0
Sunrise            0
Sunset    

In [6]:
## checking out the head of the dataframe
df.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,day,month,year,Tmax,Tmin,Tavg,Depart,DewPoint,PrecipTotal,Sunrise,Sunset
0,2007-05-29,2,1,41.95469,-87.800991,9,1,0,29,5,2007,88.0,62.5,75.5,10.0,58.5,0.0,421.0,1917.0
1,2007-05-29,3,1,41.95469,-87.800991,9,1,0,29,5,2007,88.0,62.5,75.5,10.0,58.5,0.0,421.0,1917.0
2,2007-05-29,3,6,41.994991,-87.769279,9,1,0,29,5,2007,88.0,62.5,75.5,10.0,58.5,0.0,421.0,1917.0
3,2007-05-29,2,13,41.974089,-87.824812,8,1,0,29,5,2007,88.0,62.5,75.5,10.0,58.5,0.0,421.0,1917.0
4,2007-05-29,3,13,41.974089,-87.824812,8,4,0,29,5,2007,88.0,62.5,75.5,10.0,58.5,0.0,421.0,1917.0


In [7]:
## I am listing specifically the weather columns that I want to perform PCA on so I can
## make a dataframe of just these columns
pca_cols = ['Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint']

## creating the dataframe with the specified columns
pca_prep = df[pca_cols]
pca_prep.head()

Unnamed: 0,Tmax,Tmin,Tavg,Depart,DewPoint
0,88.0,62.5,75.5,10.0,58.5
1,88.0,62.5,75.5,10.0,58.5
2,88.0,62.5,75.5,10.0,58.5
3,88.0,62.5,75.5,10.0,58.5
4,88.0,62.5,75.5,10.0,58.5


In [8]:
from sklearn import preprocessing


##Standardizing all the columns
Xstand = preprocessing.StandardScaler().fit_transform(pca_prep)
Xstand

array([[ 0.73533201, -0.15323081,  0.33197309,  1.11926896, -0.15032138],
       [ 0.73533201, -0.15323081,  0.33197309,  1.11926896, -0.15032138],
       [ 0.73533201, -0.15323081,  0.33197309,  1.11926896, -0.15032138],
       ..., 
       [-0.83617307, -1.48892293, -1.20894883,  0.04004779, -0.98269898],
       [-0.83617307, -1.48892293, -1.20894883,  0.04004779, -0.98269898],
       [-0.83617307, -1.48892293, -1.20894883,  0.04004779, -0.98269898]])

In [9]:
## creating the covariance matrix - this explains the variance between the different
## features within our dataframe

## for example, the value in the i,j position within the matrix explains the variance
## between the ith and the jth elements of a random vector, or between our features
cov_mat = np.cov(Xstand.T)

## creating my eigenvalues and corresponding eigenvectors
eigenValues, eigenVectors = np.linalg.eig(cov_mat)

In [10]:
print eigenValues 
print
print
print eigenVectors 
## each eigenvector has 8 components - stemming from 8 total features

[  4.30626718e+00   3.86079903e-01   2.10024710e-01   2.75061471e-04
   9.78291109e-02]


[[ 0.44750416  0.38338706 -0.5719219   0.44803921  0.35344109]
 [ 0.45157019 -0.45472149 -0.0410272   0.40105603 -0.65328674]
 [ 0.47542114 -0.01396616 -0.34398581 -0.79900345 -0.13056388]
 [ 0.42078793  0.62426837  0.63173207 -0.00231691 -0.18475863]
 [ 0.43901783 -0.50629703  0.39218678 -0.00174547  0.63016914]]


In [11]:
## creating the eigenpairs - just pairing the eigenvalue with its eigenvector
eigenPairs = [(np.abs(eigenValues[i]), eigenVectors[:,i]) for i in range(len(eigenValues))]

## sort in ascending order and then reverse to descending (for clarification's sake)
eigenPairs.sort()
eigenPairs.reverse()

## loop through the eigenpairs and printing out the first row (eigenvalue)
## this is also seen in the code block above but just wanted to loop through again
## as it is a bit more clear like this
## I am also creating a list of the eigenvalues in ascending order to be able to reference it
sort_values = []
for i in eigenPairs:
    print i[0]
    sort_values.append(i[0])
    
sort_values

4.30626717893
0.386079902802
0.210024709766
0.0978291108561
0.000275061470511


[4.306267178932238,
 0.3860799028020393,
 0.21002470976590853,
 0.097829110856051921,
 0.00027506147051062108]

In [12]:
## we have the eigenvalues above showing us feature correlation explanation, but it helps
## to see the cumulative variance explained as well, which i can show below

## need to sum the eigen values to get percentages
sumEigenvalues = sum(eigenValues)

## this is a percentage explanation
variance_explained = [(i/sumEigenvalues)*100 for i in sort_values]
variance_explained

[86.117145849387327,
 7.720863085732768,
 4.2000943767197203,
 1.9563959823773569,
 0.0055007057828175797]

In [13]:
### based on the above results, it seems that sticking to 3 features would be ideal
## this can very easily be manipulated by changing n_components adn then adding/subtracting
## columns to the dataframe in the code block below

## instantiate
pca = PCA(n_components = 3)

## fit and transform the standardized data
pca_cols = pca.fit_transform(Xstand)


In [16]:
## going to organize the columns into dataframe for organization
pca_df = pd.DataFrame(pca_cols, columns = ['pca1', 'pca2', 'pca3'])

##previewing dataframe
print pca_df.shape
pca_df.head()

(10506, 3)


Unnamed: 0,pca1,pca2,pca3
0,-0.822678,1.121789,-0.119664
1,-0.822678,1.121789,-0.119664
2,-0.822678,1.121789,-0.119664
3,-0.822678,1.121789,-0.119664
4,-0.822678,1.121789,-0.119664


In [19]:
## Here I am joining the initial dataframe and the newly created pca dataframe
## together to move forward with

joined_df = df.join(pca_df)

In [21]:
## checking on the dataframe
joined_df

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,day,month,...,Tmin,Tavg,Depart,DewPoint,PrecipTotal,Sunrise,Sunset,pca1,pca2,pca3
0,2007-05-29,2,1,41.954690,-87.800991,9,1,0,29,5,...,62.5,75.5,10.0,58.5,0.000,421.0,1917.0,-0.822678,1.121789,-0.119664
1,2007-05-29,3,1,41.954690,-87.800991,9,1,0,29,5,...,62.5,75.5,10.0,58.5,0.000,421.0,1917.0,-0.822678,1.121789,-0.119664
2,2007-05-29,3,6,41.994991,-87.769279,9,1,0,29,5,...,62.5,75.5,10.0,58.5,0.000,421.0,1917.0,-0.822678,1.121789,-0.119664
3,2007-05-29,2,13,41.974089,-87.824812,8,1,0,29,5,...,62.5,75.5,10.0,58.5,0.000,421.0,1917.0,-0.822678,1.121789,-0.119664
4,2007-05-29,3,13,41.974089,-87.824812,8,4,0,29,5,...,62.5,75.5,10.0,58.5,0.000,421.0,1917.0,-0.822678,1.121789,-0.119664
5,2007-05-29,3,32,41.921600,-87.666455,8,2,0,29,5,...,62.5,75.5,10.0,58.5,0.000,421.0,1917.0,-0.822678,1.121789,-0.119664
6,2007-05-29,3,33,41.891118,-87.654491,8,1,0,29,5,...,62.5,75.5,10.0,58.5,0.000,421.0,1917.0,-0.822678,1.121789,-0.119664
7,2007-05-29,2,35,41.867108,-87.654224,8,1,0,29,5,...,62.5,75.5,10.0,58.5,0.000,421.0,1917.0,-0.822678,1.121789,-0.119664
8,2007-05-29,3,35,41.867108,-87.654224,8,2,0,29,5,...,62.5,75.5,10.0,58.5,0.000,421.0,1917.0,-0.822678,1.121789,-0.119664
9,2007-05-29,3,36,41.896282,-87.655232,8,1,0,29,5,...,62.5,75.5,10.0,58.5,0.000,421.0,1917.0,-0.822678,1.121789,-0.119664


In [24]:
## Now that I have the PCA columns added on, I want to change the index to the date

## first i need to change the column to date time format
joined_df['Date'] = pd.to_datetime(joined_df['Date'])

## Now I set Date as the index. Inplace=True will modify the original object
joined_df.set_index('Date', drop = True, inplace = True)

In [25]:
## checking out the head
joined_df.head()

Unnamed: 0_level_0,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,day,month,year,...,Tmin,Tavg,Depart,DewPoint,PrecipTotal,Sunrise,Sunset,pca1,pca2,pca3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-29,2,1,41.95469,-87.800991,9,1,0,29,5,2007,...,62.5,75.5,10.0,58.5,0.0,421.0,1917.0,-0.822678,1.121789,-0.119664
2007-05-29,3,1,41.95469,-87.800991,9,1,0,29,5,2007,...,62.5,75.5,10.0,58.5,0.0,421.0,1917.0,-0.822678,1.121789,-0.119664
2007-05-29,3,6,41.994991,-87.769279,9,1,0,29,5,2007,...,62.5,75.5,10.0,58.5,0.0,421.0,1917.0,-0.822678,1.121789,-0.119664
2007-05-29,2,13,41.974089,-87.824812,8,1,0,29,5,2007,...,62.5,75.5,10.0,58.5,0.0,421.0,1917.0,-0.822678,1.121789,-0.119664
2007-05-29,3,13,41.974089,-87.824812,8,4,0,29,5,2007,...,62.5,75.5,10.0,58.5,0.0,421.0,1917.0,-0.822678,1.121789,-0.119664


# Baseline KNN Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

X = joined_df.drop('WnvPresent')
y = joined_df.WnvPresent

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)


def evaluate_model(estimator, title):
    model = estimator.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc_score = accuracy_score(y_test, y_pred)
    con_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    print "Accuracy Score:", acc_score.round(8)
    print
    print "Confusion Matrix:\n", con_matrix
    print
    print "Classification Report:\n", class_report

In [29]:
params = {'n_neighbors':range(1, 300)}

knn = KNeighborsClassifier()

knn_gscv = GridSearchCV(knn, param_grid = params, cv = 5, n_jobs = -1, verbose = 1)
knn_fit = knn_gscv.fit(X, y)
print knn_fit.best_score_
print
print knn_fit.best_estimator_
print
print knn_fit.best_params_

Fitting 5 folds for each of 299 candidates, totalling 1495 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   56.9s


0.947553778793

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=77, p=2,
           weights='uniform')

{'n_neighbors': 77}


[Parallel(n_jobs=-1)]: Done 1495 out of 1495 | elapsed:  2.0min finished
