In [1]:
# Predicting Season

In [None]:
## Imports

In [21]:
import numpy as np
from numpy import polyfit
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

from bokeh.io import show
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, RangeTool
from bokeh.plotting import figure

from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

from keras.utils import np_utils

## Load Data

In [22]:
# Fix the random seed for reproducibility.
# numpy is imported as `np`; the bare name `numpy` is never bound in this notebook.
seed = 7
np.random.seed(seed)

# Hourly DWD observations, semicolon-separated; MESS_DATUM is formatted yyyymmddhh.
# NOTE(review): absolute, machine-specific path -- point this at your own copy of the file.
DWD_FILE = '/Users/stewarta/Documents/DATA/htsensor/produkt_tu_stunde_19510101_20171231_00662.txt'

# Inclusive range of years kept for the analysis.
FIRST_YEAR, LAST_YEAR = 2010, 2017

# Resampling frequency: calendar-month means.
# NOTE(review): newer pandas releases prefer the alias 'ME' for month-end -- confirm
# against the installed version before changing it.
frequency = 'M'

# Parse the DWD dataset, keep the relevant columns, rename them and parse the timestamp.
df = (
    pd.read_csv(DWD_FILE, sep=';')
    .loc[:, ['MESS_DATUM', 'TT_TU', 'RF_TU']]
    .rename(columns={'TT_TU': 'Temp', 'RF_TU': 'Humi'})
    .assign(datetime=lambda d: pd.to_datetime(d['MESS_DATUM'], errors='coerce', format='%Y%m%d%H'))
)

# Humidity can not be less than 0: drop those rows, then every row that still has a
# missing value (including timestamps that failed to parse and were coerced to NaT).
# A mean-fill of Temp/Humi used to follow this step; it could never change anything
# because no missing values survive the dropna, so it has been removed.
outliers = df[df.Humi < 0].index
df = df.drop(outliers, axis=0).dropna(axis=0)

# Index by timestamp; the raw integer date column is no longer needed.
df = df.set_index('datetime').drop('MESS_DATUM', axis=1)

# Keep only the years of interest and the two measurement columns.
# (Named `year_mask` rather than `filter` so the builtin is not shadowed.)
year_mask = (df.index.year >= FIRST_YEAR) & (df.index.year <= LAST_YEAR)
df = df.loc[year_mask, ['Temp', 'Humi']]

# Aggregate the hourly readings to one mean value per period.
df = df.resample(frequency).mean()

df.head()

Unnamed: 0_level_0,Temp,Humi
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-31,-3.879973,86.928763
2010-02-28,-0.284673,82.645833
2010-03-31,4.991398,75.49328
2010-04-30,9.473333,64.016667
2010-05-31,10.425134,79.571237


## Display Horizon Chart 

In [3]:
# Change the variable "col" to display a different column
col = 'Humi'

# Y-axis label per column, so the label follows `col` instead of always reading "Celsius".
# NOTE(review): assumes Humi is relative humidity in percent -- confirm against the
# DWD dataset description.
y_axis_labels = {'Temp': 'Celsius', 'Humi': 'Relative humidity (%)'}

dates = df.index.values
source = ColumnDataSource(data=dict(date=dates, close=df[col]))

# Derive the year span from the data actually plotted rather than hard-coding it.
title = "{}: Official Outdoor Observations Braunschweig (City Center) Germany ({} - {})".format(
    col, df.index.year.min(), df.index.year.max())

# Main plot: full series, initially showing the whole date range.
p = figure(title=title,
           plot_height=300, plot_width=800, tools="", toolbar_location=None,
           x_axis_type="datetime", x_axis_location="above",
           background_fill_color="#efefef", x_range=(dates[0], dates[-1]))

p.line('date', 'close', source=source)
p.yaxis.axis_label = y_axis_labels.get(col, col)

# Overview plot underneath; it shares the y-range with the main plot.
select = figure(title="Drag the middle and edges of the selection box to change the range above",
                plot_height=130, plot_width=800, y_range=p.y_range,
                x_axis_type="datetime", y_axis_type=None,
                tools="", toolbar_location=None, background_fill_color="#efefef")

# The range tool on the overview plot drives the x-range of the main plot.
range_tool = RangeTool(x_range=p.x_range)
range_tool.overlay.fill_color = "navy"
range_tool.overlay.fill_alpha = 0.2

select.line('date', 'close', source=source)
select.ygrid.grid_line_color = None
select.add_tools(range_tool)
select.toolbar.active_multi = range_tool

show(column(p, select))

## Infer Target Variables

In [23]:
# Infer target season
# German Seasons
# SeasonID; Season Name; Months
# 1; Winter; December, January, February
# 2; Spring; March, April, May
# 3; Summer; June, July, August
# 4; Autumn; September, October, November
def season(month):
    """Map a calendar month number to its season ID.

    Parameters
    ----------
    month : int
        Month number, 1 (January) through 12 (December).

    Returns
    -------
    int or float
        1 for winter (Dec, Jan, Feb), 2 for spring (Mar-May), 3 for summer
        (Jun-Aug), 4 for autumn (Sep-Nov). Any value that is not one of the
        integers 1..12 yields ``nan`` instead of raising.
    """
    if month not in range(1, 13):
        # `np.nan` is the canonical spelling; the `np.NaN` alias no longer exists in NumPy 2.
        return np.nan
    # `% 12` folds December onto 0 so that Dec/Jan/Feb land in the same bucket of three.
    return (month % 12 + 3) // 3

# Keep the month number as a candidate feature. Predicting the season from it
# should be trivial, since the season is derived from the month.
df['month'] = df.index.month

# Target variable: season ID looked up from the month of each row's timestamp.
df['season'] = [season(stamp.month) for stamp in df.index]
df.head()

Unnamed: 0_level_0,Temp,Humi,month,season
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-31,-3.879973,86.928763,1,1
2010-02-28,-0.284673,82.645833,2,1
2010-03-31,4.991398,75.49328,3,2
2010-04-30,9.473333,64.016667,4,2
2010-05-31,10.425134,79.571237,5,2


In [24]:
# Re-seed: `seed` is reassigned here (it was 7 in the load cell) and this new value is
# also what the KFold splitter further down receives as its random_state.
seed = 25
# Shuffle the rows of the dataframe; passing random_state makes the permutation
# reproducible. Note that `df` is overwritten, so re-running this cell shuffles the
# already-shuffled frame again.
df = shuffle(df, random_state=seed)
df.head()

Unnamed: 0_level_0,Temp,Humi,month,season
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-05-31,12.784812,76.481183,5,2
2016-02-29,3.772414,80.014368,2,1
2017-08-31,17.84328,74.766129,8,3
2010-07-31,21.364785,63.053763,7,3
2014-07-31,20.45568,70.830295,7,3


In [25]:
# Convert the integer season IDs to dummy variables (i.e. one-hot encoded) to form the
# target matrix. Only the `np_utils` module is imported in this notebook, so
# `to_categorical` has to be reached through it.
Y = np_utils.to_categorical(df.season)

In [29]:
# One-hot encode the month number as the only feature of a sanity-check model.
# The printed first row has 13 slots: months are 1-based, so slot 0 is never set.
X = np_utils.to_categorical(df['month'])
print(X[0])

[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]


In [30]:
# Sanity-check model: a random forest trained on the one-hot month vectors.
clf = RandomForestClassifier(n_estimators=200, random_state=0)
clf.fit(X, Y)

# Probe the fitted model with a single one-hot row whose last slot (index 12) is set.
probe = np.zeros((1, 13), dtype=int)
probe[0, 12] = 1
print(clf.predict(probe))

[[0. 1. 0. 0. 0.]]


In [26]:
# Splitter used to evaluate the models: 10 folds, with rows shuffled reproducibly
# using the current value of `seed`.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=seed,
)

In [73]:
# Cross-validate the month-only classifier and report mean (std) of the fold scores in percent.
results = cross_val_score(clf, X, Y, cv=kfold)
mean_pct = results.mean() * 100
std_pct = results.std() * 100
print("Baseline: %.2f%% (%.2f%%)" % (mean_pct, std_pct))

Baseline: 100.00% (0.00%)


In [27]:
from sklearn.preprocessing import StandardScaler

# Features: monthly mean temperature and humidity. Target: one-hot encoded season ID.
df = df.dropna(axis=0)
X = df[['Temp', 'Humi']]
Y = np_utils.to_categorical(df.season)

# Chain scaler and forest in one pipeline so that
#  * inside cross-validation the scaler is fitted on the training folds only, instead of
#    on every row up front (which lets test-fold statistics leak into training), and
#  * the fitted classifier and the scaled matrix below agree -- previously `clf` was
#    fitted on the raw features while everything else used the scaled ones.
scaler = StandardScaler()
clf = RandomForestClassifier(n_estimators=30, random_state=0)
model = Pipeline([('scaler', scaler), ('forest', clf)])

# cross_val_score works on clones of `model`, so it is unaffected by the fit below.
results = cross_val_score(model, X, Y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

# Fit on all rows. The pipeline fits its steps in place, so `scaler` and `clf` are left
# fitted: the scaler on X, the forest on the scaled X.
model.fit(X, Y)

# Scaled feature matrix, kept under its original name for any later use.
X_train = model.named_steps['scaler'].transform(X)

Baseline: 86.33% (8.17%)
