In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import math
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df = pd.read_csv('/Users/TerryONeill/west_nile/west_nile/assets/train_weather_join.csv')

In [4]:
## here are just a few ways to familiarize ourselves with the dataframe

print 'Shape of Dataframe:\n', df.shape
print
print 'Datatypes of Columns:\n', df.dtypes
print
print 'Null Values for Each Column:\n', df.isnull().sum()

Shape of Dataframe:
(10506, 21)

Datatypes of Columns:
Date                object
Species              int64
Trap                 int64
Latitude           float64
Longitude          float64
AddressAccuracy      int64
NumMosquitos         int64
WnvPresent           int64
day                  int64
month                int64
year                 int64
Tmax               float64
Tmin               float64
Tavg               float64
Depart             float64
DewPoint           float64
PrecipTotal        float64
Sunrise            float64
Sunset             float64
month_weather      float64
day_weather        float64
dtype: object

Null Values for Each Column:
Date               0
Species            0
Trap               0
Latitude           0
Longitude          0
AddressAccuracy    0
NumMosquitos       0
WnvPresent         0
day                0
month              0
year               0
Tmax               0
Tmin               0
Tavg               0
Depart             0
DewPoint           

In [5]:
## checking out the head of the dataframe
df.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,day,month,...,Tmax,Tmin,Tavg,Depart,DewPoint,PrecipTotal,Sunrise,Sunset,month_weather,day_weather
0,2007-05-29,2,1,41.95469,-87.800991,9,1,0,179,5,...,88.0,62.5,75.5,10.0,58.5,0.0,421.0,1917.0,5.0,179.0
1,2007-05-29,3,1,41.95469,-87.800991,9,1,0,179,5,...,88.0,62.5,75.5,10.0,58.5,0.0,421.0,1917.0,5.0,179.0
2,2007-05-29,3,6,41.994991,-87.769279,9,1,0,179,5,...,88.0,62.5,75.5,10.0,58.5,0.0,421.0,1917.0,5.0,179.0
3,2007-05-29,2,13,41.974089,-87.824812,8,1,0,179,5,...,88.0,62.5,75.5,10.0,58.5,0.0,421.0,1917.0,5.0,179.0
4,2007-05-29,3,13,41.974089,-87.824812,8,4,0,179,5,...,88.0,62.5,75.5,10.0,58.5,0.0,421.0,1917.0,5.0,179.0


In [6]:
## I am listing specifically the weather columns that I want to perform PCA on so I can
## make a dataframe of just these columns
pca_cols = ['Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'month_weather', 'day_weather']

## creating the dataframe with the specified columns
pca_prep = df[pca_cols]
pca_prep.head()

Unnamed: 0,Tmax,Tmin,Tavg,Depart,DewPoint,month_weather,day_weather
0,88.0,62.5,75.5,10.0,58.5,5.0,179.0
1,88.0,62.5,75.5,10.0,58.5,5.0,179.0
2,88.0,62.5,75.5,10.0,58.5,5.0,179.0
3,88.0,62.5,75.5,10.0,58.5,5.0,179.0
4,88.0,62.5,75.5,10.0,58.5,5.0,179.0


In [7]:
from sklearn import preprocessing


##Standardizing all the columns
Xstand = preprocessing.StandardScaler().fit_transform(pca_prep)
Xstand

array([[ 0.73533201, -0.15323081,  0.33197309, ..., -0.15032138,
        -2.52200851, -2.16470087],
       [ 0.73533201, -0.15323081,  0.33197309, ..., -0.15032138,
        -2.52200851, -2.16470087],
       [ 0.73533201, -0.15323081,  0.33197309, ..., -0.15032138,
        -2.52200851, -2.16470087],
       ..., 
       [-0.83617307, -1.48892293, -1.20894883, ..., -0.98269898,
         1.22462913,  1.60879562],
       [-0.83617307, -1.48892293, -1.20894883, ..., -0.98269898,
         1.22462913,  1.60879562],
       [-0.83617307, -1.48892293, -1.20894883, ..., -0.98269898,
         1.22462913,  1.60879562]])

In [8]:
## creating the covariance matrix - this explains the variance between the different
## features within our dataframe

## for example, the value in the i,j position within the matrix explains the variance
## between the ith and the jth elements of a random vector, or between our features
cov_mat = np.cov(Xstand.T)

## creating my eigenvalues and corresponding eigenvectors
eigenValues, eigenVectors = np.linalg.eig(cov_mat)

In [9]:
print eigenValues 
print
print
print eigenVectors 
## each eigenvector has 8 components - stemming from 8 total features

[  4.31191511e+00   2.06173206e+00   3.45251431e-01   1.49400131e-01
   9.59255074e-02   2.70656744e-04   3.61714608e-02]


[[ 0.4462216   0.04926796  0.5382843  -0.37253578  0.40098279 -0.44876261
   0.08803962]
 [ 0.44937231  0.09876047 -0.42814283 -0.25024524 -0.61743087 -0.40137224
  -0.00783516]
 [ 0.47361977  0.07636571  0.08609183 -0.34100656 -0.08410185  0.79841381
   0.04021914]
 [ 0.42505546 -0.1939785   0.40319188  0.72587161 -0.29769264  0.00462452
  -0.06003848]
 [ 0.43763618  0.0633152  -0.57909405  0.32206294  0.59977796  0.00130228
  -0.07520643]
 [ 0.03277161 -0.68601013 -0.03929831 -0.22037507  0.04236158 -0.00448052
  -0.69021157]
 [ 0.03692672 -0.6853765  -0.1416826  -0.05594312  0.0277452   0.00216112
   0.71057541]]


In [10]:
## creating the eigenpairs - just pairing the eigenvalue with its eigenvector
eigenPairs = [(np.abs(eigenValues[i]), eigenVectors[:,i]) for i in range(len(eigenValues))]

## sort in ascending order and then reverse to descending (for clarification's sake)
eigenPairs.sort()
eigenPairs.reverse()

## loop through the eigenpairs and printing out the first row (eigenvalue)
## this is also seen in the code block above but just wanted to loop through again
## as it is a bit more clear like this
## I am also creating a list of the eigenvalues in ascending order to be able to reference it
sort_values = []
for i in eigenPairs:
    print i[0]
    sort_values.append(i[0])
    
sort_values

4.31191510665
2.06173205605
0.34525143086
0.14940013089
0.0959255073613
0.0361714608117
0.000270656744141


[4.3119151066452091,
 2.0617320560458707,
 0.34525143085954041,
 0.14940013088967524,
 0.095925507361295334,
 0.036171460811714726,
 0.00027065674414105507]

In [11]:
## we have the eigenvalues above showing us feature correlation explanation, but it helps
## to see the cumulative variance explained as well, which i can show below

## need to sum the eigen values to get percentages
sumEigenvalues = sum(eigenValues)

## this is a percentage explanation
variance_explained = [(i/sumEigenvalues)*100 for i in sort_values]
variance_explained

[61.592924037023657,
 29.450511610728398,
 4.9316938364192886,
 2.1340844347393855,
 1.3702339545163413,
 0.5166859696867957,
 0.0038661568861355204]

In [12]:
### based on the above results, it seems that sticking to 3 features would be ideal
## this can very easily be manipulated by changing n_components adn then adding/subtracting
## columns to the dataframe in the code block below

## instantiate
pca = PCA(n_components = 4)

## fit and transform the standardized data
pca_cols = pca.fit_transform(Xstand)


In [13]:
## going to organize the columns into dataframe for organization
pca_df = pd.DataFrame(pca_cols, columns = ['pca1', 'pca2', 'pca3', 'pca4'])

##previewing dataframe
print pca_df.shape
pca_df.head()

(10506, 4)


Unnamed: 0,pca1,pca2,pca3,pca4
0,-0.663872,3.033573,-1.434144,1.092123
1,-0.663872,3.033573,-1.434144,1.092123
2,-0.663872,3.033573,-1.434144,1.092123
3,-0.663872,3.033573,-1.434144,1.092123
4,-0.663872,3.033573,-1.434144,1.092123


In [14]:
## Here I am joining the initial dataframe and the newly created pca dataframe
## together to move forward with

joined_pca = df.join(pca_df)

In [15]:
## checking on the dataframe
joined_pca

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,day,month,...,DewPoint,PrecipTotal,Sunrise,Sunset,month_weather,day_weather,pca1,pca2,pca3,pca4
0,2007-05-29,2,1,41.954690,-87.800991,9,1,0,179,5,...,58.5,0.000,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
1,2007-05-29,3,1,41.954690,-87.800991,9,1,0,179,5,...,58.5,0.000,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
2,2007-05-29,3,6,41.994991,-87.769279,9,1,0,179,5,...,58.5,0.000,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
3,2007-05-29,2,13,41.974089,-87.824812,8,1,0,179,5,...,58.5,0.000,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
4,2007-05-29,3,13,41.974089,-87.824812,8,4,0,179,5,...,58.5,0.000,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
5,2007-05-29,3,32,41.921600,-87.666455,8,2,0,179,5,...,58.5,0.000,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
6,2007-05-29,3,33,41.891118,-87.654491,8,1,0,179,5,...,58.5,0.000,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
7,2007-05-29,2,35,41.867108,-87.654224,8,1,0,179,5,...,58.5,0.000,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
8,2007-05-29,3,35,41.867108,-87.654224,8,2,0,179,5,...,58.5,0.000,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
9,2007-05-29,3,36,41.896282,-87.655232,8,1,0,179,5,...,58.5,0.000,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123


In [16]:
## Now that I have the PCA columns added on, I want to change the index to the date

## first i need to change the column to date time format
joined_pca['Date'] = pd.to_datetime(joined_pca['Date'])

## Now I set Date as the index. Inplace=True will modify the original object

## commenting the remaking of the index for now to keep a numeric index

##joined_pca.set_index('Date', drop = True, inplace = True)

In [17]:
## checking out the head
joined_pca.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,day,month,...,DewPoint,PrecipTotal,Sunrise,Sunset,month_weather,day_weather,pca1,pca2,pca3,pca4
0,2007-05-29,2,1,41.95469,-87.800991,9,1,0,179,5,...,58.5,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
1,2007-05-29,3,1,41.95469,-87.800991,9,1,0,179,5,...,58.5,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
2,2007-05-29,3,6,41.994991,-87.769279,9,1,0,179,5,...,58.5,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
3,2007-05-29,2,13,41.974089,-87.824812,8,1,0,179,5,...,58.5,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
4,2007-05-29,3,13,41.974089,-87.824812,8,4,0,179,5,...,58.5,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123


# Baseline KNN Model

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

X = joined_pca.drop('WnvPresent', axis = 1)
y = joined_pca.WnvPresent

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)

knn = KNeighborsClassifier()

def evaluate_model(estimator, title):
    model = estimator.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc_score = accuracy_score(y_test, y_pred)
    con_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    print "Accuracy Score:", acc_score.round(8)
    print
    print "Confusion Matrix:\n", con_matrix
    print
    print "Classification Report:\n", class_report

In [20]:
evaluate_model(knn, 'KNN')

TypeError: float() argument must be a string or a number

In [None]:
params = {'n_neighbors':range(1, 300)}

knn = KNeighborsClassifier()

knn_gscv = GridSearchCV(knn, param_grid = params, cv = 5, n_jobs = -1, verbose = 1)
knn_fit = knn_gscv.fit(X, y)
print knn_fit.best_score_
print
print knn_fit.best_estimator_
print
print knn_fit.best_params_

In [None]:
## now i am going to export the dataframe as a csv file

joined_pca.to_csv('/Users/TerryONeill/west_nile/west_nile/assets/joined_pca.csv')

In [21]:
joined_pca.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,day,month,...,DewPoint,PrecipTotal,Sunrise,Sunset,month_weather,day_weather,pca1,pca2,pca3,pca4
0,2007-05-29,2,1,41.95469,-87.800991,9,1,0,179,5,...,58.5,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
1,2007-05-29,3,1,41.95469,-87.800991,9,1,0,179,5,...,58.5,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
2,2007-05-29,3,6,41.994991,-87.769279,9,1,0,179,5,...,58.5,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
3,2007-05-29,2,13,41.974089,-87.824812,8,1,0,179,5,...,58.5,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123
4,2007-05-29,3,13,41.974089,-87.824812,8,4,0,179,5,...,58.5,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123


# Test Data PCA

In [23]:
final_df = pd.read_csv('/Users/TerryONeill/west_nile/west_nile/assets/final_df.csv')
print final_df.shape
final_df.head()

(116293, 19)


Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,day,month,year,Tmax,Tmin,Tavg,Depart,DewPoint,PrecipTotal,Sunrise,Sunset,month_weather,day_weather
0,2008-06-11,2,1,41.95469,-87.800991,9,191,6,2008,86.0,63.5,75.0,7.0,55.5,0.0,416.0,1926.0,6.0,191.0
1,2008-06-11,3,1,41.95469,-87.800991,9,191,6,2008,86.0,63.5,75.0,7.0,55.5,0.0,416.0,1926.0,6.0,191.0
2,2008-06-11,1,1,41.95469,-87.800991,9,191,6,2008,86.0,63.5,75.0,7.0,55.5,0.0,416.0,1926.0,6.0,191.0
3,2008-06-11,4,1,41.95469,-87.800991,9,191,6,2008,86.0,63.5,75.0,7.0,55.5,0.0,416.0,1926.0,6.0,191.0
4,2008-06-11,6,1,41.95469,-87.800991,9,191,6,2008,86.0,63.5,75.0,7.0,55.5,0.0,416.0,1926.0,6.0,191.0


In [24]:
## I am listing specifically the weather columns that I want to perform PCA on so I can
## make a dataframe of just these columns
pca_cols = ['Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'month_weather', 'day_weather']

## creating the dataframe with the specified columns
pca_prep = final_df[pca_cols]
pca_prep.head()

Unnamed: 0,Tmax,Tmin,Tavg,Depart,DewPoint,month_weather,day_weather
0,86.0,63.5,75.0,7.0,55.5,6.0,191.0
1,86.0,63.5,75.0,7.0,55.5,6.0,191.0
2,86.0,63.5,75.0,7.0,55.5,6.0,191.0
3,86.0,63.5,75.0,7.0,55.5,6.0,191.0
4,86.0,63.5,75.0,7.0,55.5,6.0,191.0


In [25]:
from sklearn import preprocessing


##Standardizing all the columns
Xstand = preprocessing.StandardScaler().fit_transform(pca_prep)
Xstand

array([[ 0.63932333,  0.16897014,  0.45734427, ..., -0.28246883,
        -1.52016095, -1.69224449],
       [ 0.63932333,  0.16897014,  0.45734427, ..., -0.28246883,
        -1.52016095, -1.69224449],
       [ 0.63932333,  0.16897014,  0.45734427, ..., -0.28246883,
        -1.52016095, -1.69224449],
       ..., 
       [-1.03768059,  0.33254875, -0.41183481, ...,  0.63250412,
         2.23275833,  1.79951435],
       [-1.03768059,  0.33254875, -0.41183481, ...,  0.63250412,
         2.23275833,  1.79951435],
       [-1.03768059,  0.33254875, -0.41183481, ...,  0.63250412,
         2.23275833,  1.79951435]])

In [26]:
## creating the covariance matrix - this explains the variance between the different
## features within our dataframe

## for example, the value in the i,j position within the matrix explains the variance
## between the ith and the jth elements of a random vector, or between our features
cov_mat = np.cov(Xstand.T)

## creating my eigenvalues and corresponding eigenvectors
eigenValues, eigenVectors = np.linalg.eig(cov_mat)

In [27]:
print eigenValues 
print
print
print eigenVectors 
## each eigenvector has 8 components - stemming from 8 total features

[  4.25488075e+00   1.83570925e+00   5.37129564e-01   2.13315518e-01
   1.21931539e-01   5.46721529e-04   3.65468469e-02]


[[ 0.44645204  0.01298167  0.41095158  0.33778255  0.54620983 -0.4660478
   0.04458778]
 [ 0.43988522  0.12591848 -0.37519719  0.34665762 -0.61449392 -0.38227254
  -0.07724003]
 [ 0.47536661  0.06900442  0.06213245  0.35754269  0.02434075  0.79786172
  -0.01979812]
 [ 0.36841678  0.31067309  0.53289344 -0.60813005 -0.3375205  -0.0051221
  -0.00272046]
 [ 0.39227236  0.17964574 -0.63392084 -0.46869864  0.43688102 -0.00223589
   0.03802287]
 [-0.22026875  0.6474918   0.00393109  0.12834297  0.13626756 -0.00423655
  -0.70509707]
 [-0.20756036  0.65664177 -0.02527681  0.17866778  0.01027713  0.00588657
   0.7021673 ]]


In [28]:
## creating the eigenpairs - just pairing the eigenvalue with its eigenvector
eigenPairs = [(np.abs(eigenValues[i]), eigenVectors[:,i]) for i in range(len(eigenValues))]

## sort in ascending order and then reverse to descending (for clarification's sake)
eigenPairs.sort()
eigenPairs.reverse()

## loop through the eigenpairs and printing out the first row (eigenvalue)
## this is also seen in the code block above but just wanted to loop through again
## as it is a bit more clear like this
## I am also creating a list of the eigenvalues in ascending order to be able to reference it
sort_values = []
for i in eigenPairs:
    print i[0]
    sort_values.append(i[0])
    
sort_values

4.25488075171
1.83570925255
0.537129563578
0.213315518045
0.121931538982
0.036546846912
0.000546721528772


[4.2548807517062448,
 1.8357092525536614,
 0.53712956357839514,
 0.21331551804492979,
 0.12193153898248636,
 0.036546846912020849,
 0.00054672152877229571]

In [29]:
## we have the eigenvalues above showing us feature correlation explanation, but it helps
## to see the cumulative variance explained as well, which i can show below

## need to sum the eigen values to get percentages
sumEigenvalues = sum(eigenValues)

## this is a percentage explanation
variance_explained = [(i/sumEigenvalues)*100 for i in sort_values]
variance_explained

[60.783488058785281,
 26.224192390645079,
 7.6732134973925064,
 3.0473383393031828,
 1.7418641499551368,
 0.52209332352551929,
 0.0078102403932908083]

In [30]:
### based on the above results, it seems that sticking to 3 features would be ideal
## this can very easily be manipulated by changing n_components adn then adding/subtracting
## columns to the dataframe in the code block below

## instantiate
pca = PCA(n_components = 4)

## fit and transform the standardized data
pca_cols = pca.fit_transform(Xstand)


In [31]:
## going to organize the columns into dataframe for organization
pca_df = pd.DataFrame(pca_cols, columns = ['pca1', 'pca2', 'pca3', 'pca4'])

##previewing dataframe
print pca_df.shape
pca_df.head()

(116293, 4)


Unnamed: 0,pca1,pca2,pca3,pca4
0,-1.514379,-1.779891,-0.967131,0.524445
1,-1.514379,-1.779891,-0.967131,0.524445
2,-1.514379,-1.779891,-0.967131,0.524445
3,-1.514379,-1.779891,-0.967131,0.524445
4,-1.514379,-1.779891,-0.967131,0.524445


In [35]:
## Here I am joining the initial dataframe and the newly created pca dataframe
## together to move forward with

final_joined_pca = final_df.join(pca_df)

In [36]:
print joined_pca.shape
final_joined_pca.head()

(116293, 23)


Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,day,month,year,Tmax,...,DewPoint,PrecipTotal,Sunrise,Sunset,month_weather,day_weather,pca1,pca2,pca3,pca4
0,2008-06-11,2,1,41.95469,-87.800991,9,191,6,2008,86.0,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445
1,2008-06-11,3,1,41.95469,-87.800991,9,191,6,2008,86.0,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445
2,2008-06-11,1,1,41.95469,-87.800991,9,191,6,2008,86.0,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445
3,2008-06-11,4,1,41.95469,-87.800991,9,191,6,2008,86.0,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445
4,2008-06-11,6,1,41.95469,-87.800991,9,191,6,2008,86.0,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445


In [37]:
## now i am going to export the dataframe as a csv file

final_joined_pca.to_csv('/Users/TerryONeill/west_nile/west_nile/assets/final_joined_pca.csv')