# TOC

# Imports

In [3]:
import pandas as pd
import numpy as np
import scipy as sp

import requests
import json
import time
import datetime

from random import randint
from math import exp

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import confusion_matrix

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# 1. EDA

In [4]:
data_spray = pd.read_csv('./Data/spray.csv')
data_weather = pd.read_csv('./Data/weather.csv')
data_train = pd.read_csv('./Data/train.csv')
data_test = pd.read_csv('./Data/test.csv')

In [5]:
data_weather['weather_day'] = 1
data_spray['spray_day'] = 1
data_spray.columns = ['Date', 'Time', 'Latitude_sp', 'Longitude_sp', 'spray_day']

## 1.1 Combine Data and Assess

In [7]:
df = pd.concat([data_train, data_weather, data_spray], ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [8]:
df['spray_day'].fillna(0,  inplace=True)
df['weather_day'].fillna(0,  inplace=True)

In [9]:
#Convert to Date Time from object
df['Date'] = pd.to_datetime(df['Date'])
#Create Days from start column for easy analysis
BeginDate = datetime.date(2007, 4, 30)
temp = (df['Date']-pd.Timestamp(BeginDate)).astype(str)
df['DateFrom'] = temp.map(lambda x: x.replace(' days 00:00:00.000000000', '')).astype(int)

## 1.2 Feature Analysis

# 2. Data Transformation

### 2.1 Weather Slice

In [12]:
weather_station_1 = data_weather.loc[data_weather.Station==1, :]
weather_station_2 = data_weather.loc[data_weather.Station==2, :]

In [15]:
weather_station_1.drop('Station', axis=1, inplace=True)
weather_station_2.drop('Station', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [16]:
weather_station_1.Date = pd.to_datetime(weather_station_1.Date)
weather_station_2.Date = pd.to_datetime(weather_station_2.Date)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


### 2.2 Weather Convert Objects

In [17]:
weather_station_1['Tavg'] = weather_station_1['Tavg'].astype('float64');
weather_station_1.drop('Depart', axis=1, inplace=True);
weather_station_1['PrecipTotal'] = weather_station_1['PrecipTotal'].map(lambda x: .001 if x=='  T' else x).astype(float);
weather_station_1['SnowFall'] = weather_station_1['SnowFall'].map(lambda x: .001 if x=='  T' else x).astype(float);
weather_station_1.drop('Water1', axis=1, inplace=True);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://p

In [18]:
#Create dummy columns for weather codes

#list all columns
templist = [element for element in weather_station_1['CodeSum'].str.split(' ').tolist()]
newl = []
#get all tags
for element in templist:
    for subelement in element:
        newl.append(subelement)
Codes_list = list(set(newl))
Codes_list.pop(0)
#Create new columns and score 1 if present in original CodeSum value
for code in Codes_list:
    weather_station_1[code] = weather_station_1['CodeSum'].str.contains(code)*1
weather_station_1.drop('CodeSum', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [19]:
#find means without 'M' values and fill in blank - less than 5 for each
stnpressure_mean = weather_station_1[weather_station_1['StnPressure']!='M']['StnPressure'].astype(float).mean()
sealevel_mean = weather_station_1[weather_station_1['SeaLevel']!='M']['SeaLevel'].astype(float).mean()
wetbulb_mean = weather_station_1[weather_station_1['WetBulb']!='M']['WetBulb'].astype(float).mean()

weather_station_1['StnPressure'] = weather_station_1['StnPressure'].map(lambda x: stnpressure_mean if x=='M' else x).astype(float);
weather_station_1['SeaLevel'] = weather_station_1['SeaLevel'].map(lambda x: sealevel_mean if x=='M' else x).astype(float);
weather_station_1['WetBulb'] = weather_station_1['WetBulb'].map(lambda x: wetbulb_mean if x=='M' else x).astype(float);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [20]:
#Convert remaining cols to float. Leftover errors from station 2
obj_cols = [col for col in weather_station_1.columns if weather_station_1[col].dtype=='object']
for col in weather_station_1:
    if col in obj_cols:
        weather_station_1[col] = weather_station_1[col].astype('float64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


# 3. Model Fitting

In [11]:
df_formodel = df.loc[:, ['DateFrom', 'Latitude','Longitude', 'WnvPresent']].dropna()
X = df_formodel.loc[:, ['DateFrom','Latitude','Longitude']]
y = df_formodel['WnvPresent']

In [12]:
knn = KNeighborsClassifier()
knn.fit(X, y)
knn.score(X, y)

0.9498381877022654

# 4. Output