In [1]:
import pandas as pd
import numpy as np

from scipy.stats import zscore
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [2]:
# Load the raw weather observations from the project-relative CSV file.
df = pd.read_csv(filepath_or_buffer='data/weatherAUS.csv')

## NAs

In [3]:
# Keep only complete rows: a row is discarded if any of its columns is missing.
df = df.dropna(axis=0, how='any')

## Subsetting

In [21]:
# Column subsetting: keep all rows, restrict to the columns used in the rest
# of the notebook.
df = df.loc[
    :,
    [
        'Date',
        'Location',
        'MinTemp',
        'MaxTemp',
        'Evaporation',
        'Sunshine',
        'RainTomorrow',
    ],
]
df.head(n=5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Evaporation,Sunshine,RainTomorrow
5939,2009-01-01,Cobar,17.9,35.2,12.0,12.3,No
5940,2009-01-02,Cobar,18.4,28.9,14.8,13.0,No
5942,2009-01-04,Cobar,19.4,37.6,10.8,10.6,No
5943,2009-01-05,Cobar,21.9,38.4,11.4,12.2,No
5944,2009-01-06,Cobar,24.2,41.0,11.2,8.4,No


In [22]:
# Row subsetting by membership: keep observations whose Location is one of
# the listed cities.
some_cities = ['Darwin', 'Perth', 'Brisbane', 'Mildura']
df_some_cities = df[df['Location'].isin(some_cities)]
df_some_cities.head(n=5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Evaporation,Sunshine,RainTomorrow
68180,2009-01-01,Mildura,13.8,27.4,9.6,12.6,No
68182,2009-01-03,Mildura,10.6,28.6,9.2,13.6,No
68183,2009-01-04,Mildura,13.2,34.5,8.8,13.5,No
68184,2009-01-05,Mildura,16.5,37.3,10.4,13.4,No
68185,2009-01-06,Mildura,15.7,39.2,13.4,13.4,No


In [23]:
# Row subsetting by condition: keep observations with MinTemp strictly below 10.
df_low_min = df[df['MinTemp'].lt(10)]
df_low_min.head(n=5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Evaporation,Sunshine,RainTomorrow
6013,2009-03-16,Cobar,9.2,24.4,7.6,10.9,No
6033,2009-04-05,Cobar,9.9,24.6,7.0,11.2,No
6035,2009-04-07,Cobar,9.4,25.6,7.6,11.2,No
6055,2009-04-27,Cobar,5.7,19.7,4.8,10.3,No
6057,2009-04-29,Cobar,6.2,17.0,4.0,9.6,No


## Date

In [94]:
# Parse Date into a datetime column, then derive calendar features from it.
df['Date'] = pd.to_datetime(df['Date'])
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0.
# `dt.isocalendar().week` (pandas >= 1.1) yields the same ISO-8601 week number;
# it comes back as UInt32, so cast to int64 to keep the dtype `.dt.week` produced.
df['Week'] = df['Date'].dt.isocalendar().week.astype('int64')
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year

## Group By

In [95]:
# Per-location summary: mean MinTemp, mean MaxTemp and the number of Date
# entries, with Location moved back from the index into a regular column.
df_summary = (
    df
    .groupby('Location')
    .agg({'MinTemp': 'mean', 'MaxTemp': 'mean', 'Date': 'count'})
    .reset_index()
)

In [96]:
# Give the aggregated columns distinct names so they do not clash with the
# per-row MinTemp / MaxTemp / Date columns of df.
df_summary = df_summary.rename(
    columns=dict(MinTemp='LocMinTemp', MaxTemp='LocMaxTemp', Date='Count')
)

In [97]:
# Preview the first five rows of the per-location summary.
df_summary.head(n=5)

Unnamed: 0,Location,LocMinTemp,LocMaxTemp,Count
0,AliceSprings,13.878093,29.724921,2223
1,Brisbane,16.365628,26.43681,2953
2,Cairns,21.129296,29.551596,2444
3,Canberra,7.727829,20.363822,1078
4,Cobar,13.82809,26.225281,534


## Join

In [98]:
# Left join on Location: every row of df is kept and gains the columns of
# df_summary for its location.
df_merged = pd.merge(df, df_summary, how='left', on='Location')
df_merged.head(n=5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Evaporation,Sunshine,RainTomorrow,Week,Month,Year,LocMinTemp,LocMaxTemp,Count
0,2009-01-01,Cobar,17.9,35.2,12.0,12.3,0,1,1,2009,13.82809,26.225281,534
1,2009-01-02,Cobar,18.4,28.9,14.8,13.0,0,1,1,2009,13.82809,26.225281,534
2,2009-01-04,Cobar,19.4,37.6,10.8,10.6,0,1,1,2009,13.82809,26.225281,534
3,2009-01-05,Cobar,21.9,38.4,11.4,12.2,0,2,1,2009,13.82809,26.225281,534
4,2009-01-06,Cobar,24.2,41.0,11.2,8.4,0,2,1,2009,13.82809,26.225281,534


## Concat

In [99]:
# Take two consecutive 100-row positional slices and stack them row-wise.
df_1 = df.iloc[:100]
df_2 = df.iloc[100:200]
df_concat = pd.concat([df_1, df_2], axis=0)

## Transform Covariates

In [100]:
## apply

def to_farenheit(C):
    """Convert a temperature from degrees Celsius to degrees Fahrenheit.

    Only arithmetic operators are used, so the function works on anything
    that supports them: plain numbers, NumPy arrays, or pandas Series.

    Parameters
    ----------
    C : number or array-like
        Temperature(s) in degrees Celsius.

    Returns
    -------
    Same kind of object as ``C`` with each value mapped to ``(9/5)*C + 32``.
    """
    scaled = (9 / 5) * C
    return scaled + 32

# Apply the conversion column by column, then place the Celsius originals and
# the Fahrenheit copies side by side. The suffixes 'C' / 'F' mark the unit.
df_faren = df[['MinTemp', 'MaxTemp']].apply(to_farenheit)
df_temps = pd.concat(
    [df[['MinTemp', 'MaxTemp']].add_suffix('C'), df_faren.add_suffix('F')],
    axis=1,
)
df_temps.head(n=5)

Unnamed: 0,MinTempC,MaxTempC,MinTempF,MaxTempF
5939,17.9,35.2,64.22,95.36
5940,18.4,28.9,65.12,84.02
5942,19.4,37.6,66.92,99.68
5943,21.9,38.4,71.42,101.12
5944,24.2,41.0,75.56,105.8


In [109]:
## map
# Encode the target as 1 ('Yes') / 0 ('No').
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell on the already-encoded 0/1 column leaves it intact.
# A plain `.map(binary_outcome)` would turn every value into NaN on a second run.
binary_outcome = {'Yes':1, 'No':0}
df['RainTomorrow'] = df['RainTomorrow'].map(
    lambda label: binary_outcome.get(label, label)
)

In [110]:
# Number of observations per calendar month, most frequent first.
df['Month'].value_counts()

1     5267
3     5174
5     4896
10    4811
8     4728
11    4717
7     4709
9     4638
6     4565
4     4422
12    4264
2     4229
Name: Month, dtype: int64

In [111]:
## zscore
# `zscore` is imported with the other dependencies in the first cell, so all
# of the notebook's requirements are visible in one place.
# Standardise MinTemp, then summarise the result as (counts, bin edges) using
# np.histogram's default binning.
MinTemp_zscore = zscore(df.MinTemp)
np.histogram(MinTemp_zscore)

(array([   53,   849,  3934,  9239, 11544, 10991,  9410,  6847,  3387,
          166]),
 array([-3.14257851, -2.54880906, -1.95503961, -1.36127016, -0.76750071,
        -0.17373127,  0.42003818,  1.01380763,  1.60757708,  2.20134653,
         2.79511598]))

## Reshape

In [92]:
# Preview the first five rows of the working frame before reshaping it.
df.head(n=5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Evaporation,Sunshine,RainTomorrow
5939,2009-01-01,Cobar,17.9,35.2,12.0,12.3,0
5940,2009-01-02,Cobar,18.4,28.9,14.8,13.0,0
5942,2009-01-04,Cobar,19.4,37.6,10.8,10.6,0
5943,2009-01-05,Cobar,21.9,38.4,11.4,12.2,0
5944,2009-01-06,Cobar,24.2,41.0,11.2,8.4,0


In [125]:
# Map the month number to a season code:
#   12, 1, 2 -> 1    3, 4, 5 -> 2    6, 7, 8 -> 3    9, 10, 11 -> 4
# (month % 12 sends December to 0 so it groups with January and February).
df['Season'] = df['Month'].mod(12).add(3).floordiv(3)

# Mean MinTemp for every (Location, Season) pair, in long format.
df_season = (
    df
    .groupby(['Location', 'Season'])['MinTemp']
    .mean()
    .reset_index()
)
df_season.head(n=5)

Unnamed: 0,Location,Season,MinTemp
0,AliceSprings,1,20.847245
1,AliceSprings,2,13.461041
2,AliceSprings,3,5.119798
3,AliceSprings,4,14.565385
4,Brisbane,1,21.235242


In [141]:
## Pivot Table

# Long -> wide: one row per Location, one column per Season code, and each
# cell holds that pair's MinTemp value from df_season.
df_season2 = df_season.pivot(
    index='Location',
    columns='Season',
    values='MinTemp',
)
df_season2.head(n=5)

Season,1,2,3,4
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AliceSprings,20.847245,13.461041,5.119798,14.565385
Brisbane,21.235242,17.301103,11.23776,16.293453
Cairns,23.946405,21.974756,17.84895,20.77429
Canberra,13.668966,7.684026,1.883333,7.473413
Cobar,20.907692,13.536471,6.551111,14.055814


In [173]:
# Rebuild the wide table from df_season and move Location out of the index
# into an ordinary column. Deriving it from df_season (rather than resetting
# df_season2 in place) makes the cell safe to re-run and lets a fresh
# top-to-bottom run show Location as a regular column.
df_season2 = (
    df_season
    .pivot(index='Location', columns='Season', values='MinTemp')
    .reset_index()
)
df_season2.head()

Season,Location,1,2,3,4
0,AliceSprings,20.847245,13.461041,5.119798,14.565385
1,Brisbane,21.235242,17.301103,11.23776,16.293453
2,Cairns,23.946405,21.974756,17.84895,20.77429
3,Canberra,13.668966,7.684026,1.883333,7.473413
4,Cobar,20.907692,13.536471,6.551111,14.055814
