In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

%matplotlib inline

In [34]:
# Locations of train.csv and weather.csv (both live in the same Assets folder)

assets_dir = '/Users/michaelshea/desktop/class/WestNile/WestNilePrediction/Assets'
path_train = assets_dir + '/train.csv'
path_weather = assets_dir + '/weather.csv'

In [35]:
# Load both CSV files into pandas DataFrames (train first, then weather)

train, weather = [pd.read_csv(csv_path) for csv_path in (path_train, path_weather)]

In [57]:
# Dtypes and columns of the train dataframe
# (parenthesized print works on both Python 2 and Python 3)

print(train.dtypes)

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object


In [56]:
# Dtypes and columns of the weather dataframe
# (parenthesized print works on both Python 2 and Python 3)

print(weather.dtypes)

Station          int64
Tmax             int64
Tmin             int64
Tavg            object
Depart          object
DewPoint         int64
WetBulb         object
Heat            object
Cool            object
Sunrise         object
Sunset          object
CodeSum         object
Depth           object
Water1          object
SnowFall        object
PrecipTotal     object
StnPressure     object
SeaLevel        object
ResultSpeed    float64
ResultDir        int64
AvgSpeed        object
dtype: object


In [41]:
# Parse the weather Date column into datetime values

weather['Date'] = pd.to_datetime(weather['Date'])

In [45]:
# Use the Date column as the index of weather

weather = weather.set_index('Date')

In [61]:
# Preview the first two rows of the weather dataframe
weather.head(2)

Unnamed: 0_level_0,Station,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-01,1,83,50,67,14,51,56,0,2,0448,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
2007-05-01,2,84,52,68,M,51,57,0,3,-,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6


In [80]:
# Preview the first two rows of the train dataframe
train.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0


In [55]:
# Find Date range of weather data: earliest and latest values in the index
# (parenthesized print works on both Python 2 and Python 3)

print(weather.index.min())
print(weather.index.max())

2007-05-01 00:00:00
2014-10-31 00:00:00


In [60]:
# Was weather measured every day? Every weekday?
# According to internet, there were 2741 days between the dates above, 1886 excluding weekends and public holidays

# Count the distinct dates in the index. A count below 2741 shows that weather
# was not measured every calendar day (just something to note going forward).
# NOTE(review): a count of 1472 would equal 8 * 184, i.e. one reading per day
# from May 1 to Oct 31 in each of the 8 years 2007-2014 -- confirm against the data.
print(len(weather.index.unique()))

1472

In [71]:
# The test set covers 2008, 2010, 2012, and 2014, so keep only the remaining
# (odd) years of weather data: 2007, 2009, 2011, and 2013

years_to_keep = [2007, 2009, 2011, 2013]
in_kept_year = np.isin(weather.index.year, years_to_keep)
weather = weather[in_kept_year]

In [77]:
# Demonstrate above code worked: list the distinct years left in the index
# (parenthesized print works on both Python 2 and Python 3)

print(np.unique(weather.index.year))

[2007 2009 2011 2013]


Since there are two weather stations, it might make sense to calculate the distance from each trap to each weather station, and then use the nearer station's weather data when we merge. The following link is what I found when I searched for "calculate distance between two points latitude longitude python": http://www.johndcook.com/blog/python_longitude_latitude/

It uses this code:

In [82]:
import math
 
def distance_on_unit_sphere(lat1, long1, lat2, long2, radius=1.0):
    """Return the great-circle distance between two latitude/longitude points.

    Parameters
    ----------
    lat1, long1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, long2 : float
        Latitude and longitude of the second point, in degrees.
    radius : float, optional
        Radius of the sphere. The default of 1.0 returns the arc length on a
        unit sphere (i.e. the central angle in radians); pass the radius of
        the earth in your favorite set of units to get a length.

    Returns
    -------
    float
        The arc length between the two points multiplied by ``radius``.
    """
    # Convert latitude and longitude to spherical coordinates in radians
    degrees_to_radians = math.pi / 180.0

    # phi = 90 - latitude
    phi1 = (90.0 - lat1) * degrees_to_radians
    phi2 = (90.0 - lat2) * degrees_to_radians

    # theta = longitude
    theta1 = long1 * degrees_to_radians
    theta2 = long2 * degrees_to_radians

    # Compute spherical distance from spherical coordinates.

    # For two locations in spherical coordinates
    # (1, theta, phi) and (1, theta', phi')
    # cosine( arc length ) =
    # sin phi sin phi' cos(theta-theta') + cos phi cos phi'
    # distance = rho * arc length
    cos_arc = (math.sin(phi1) * math.sin(phi2) * math.cos(theta1 - theta2) +
               math.cos(phi1) * math.cos(phi2))

    # Floating-point rounding can push the cosine a hair outside [-1, 1] for
    # identical (or antipodal) points, which would make math.acos raise
    # "ValueError: math domain error". Clamp with explicit comparisons so a
    # NaN input still propagates as NaN instead of being silently replaced.
    if cos_arc > 1.0:
        cos_arc = 1.0
    elif cos_arc < -1.0:
        cos_arc = -1.0

    arc = math.acos(cos_arc)

    # With the default radius of 1.0 this is exactly the unit-sphere arc.
    return arc * radius

In [85]:
# We could add two new columns to the train dataframe, dist_to_station_1 and dist_to_station_2

# Weather station coordinates, in degrees (distance_on_unit_sphere converts to radians)
station_1_lat = 41.995
station_1_lon = -87.933
station_2_lat = 41.786
station_2_lon = -87.752

# Walk the trap coordinates row by row. Iterating over the two columns
# directly avoids DataFrame.ix, which is deprecated and was removed in
# pandas 1.0, and also skips a label lookup per cell.
trap_coords = list(zip(train['Latitude'], train['Longitude']))

# One distance per train row, in row order, for each station
dist_to_station_1 = [
    distance_on_unit_sphere(trap_lat, trap_lon, station_1_lat, station_1_lon)
    for trap_lat, trap_lon in trap_coords
]
dist_to_station_2 = [
    distance_on_unit_sphere(trap_lat, trap_lon, station_2_lat, station_2_lon)
    for trap_lat, trap_lon in trap_coords
]

In [87]:
# Attach the per-row station distances to train as two new columns

for column_name, distances in (('dist_to_station_1', dist_to_station_1),
                               ('dist_to_station_2', dist_to_station_2)):
    train[column_name] = distances