In [1]:
# Predicting Season

In [None]:
## Load Data

In [101]:
import numpy
from numpy import polyfit
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

from bokeh.io import show
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, RangeTool
from bokeh.plotting import figure

In [99]:
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# load dataset
dataframe = pandas.read_csv("/Users/stewarta/Documents/DATA/iris.data", header=None)
dataframe.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Load Data

In [100]:
dataset = dataframe.values
X = dataset[:,0:4].astype(float)
Y = dataset[:,4]

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)
print(dummy_y)

[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0.

In [15]:
# # define baseline model
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(8, input_dim=4, activation='relu'))
	model.add(Dense(3, activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

In [16]:
# train a model
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)

In [17]:
# evaluate the model
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

In [19]:
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 97.33% (4.42%)


# MY OWN DATA

In [204]:
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# parse the DWD dataset and convert date and align with home readings 
# Format dwd: yyyymmddhh
df = pd.read_csv('/Users/stewarta/Documents/DATA/htsensor/produkt_tu_stunde_19510101_20171231_00662.txt', sep=';')
# select relevant columns
df = df[['MESS_DATUM', 'TT_TU', 'RF_TU']]
# rename
df.rename(columns={'TT_TU':'Temp','RF_TU':'Humi'}, inplace=True)

# parse date
df.insert(2, 'datetime', pd.to_datetime(df['MESS_DATUM'], errors='coerce', format='%Y%m%d%H'))

# humidity can not be less than 0
outliers = df[df.Humi < 0].index
df.drop(outliers, axis=0, inplace=True)
df.dropna(axis = 0, inplace=True)

### does not seem to be working....
# interpolate missing readings for humi and temp
#df[['Temp', 'Humi']].fillna(df[['Temp', 'Humi']].mean(), inplace=True)
df[['Temp', 'Humi']] = df[['Temp', 'Humi']].fillna(df.mean())

# create index
df.set_index('datetime', inplace=True)
df.drop('MESS_DATUM', axis = 1, inplace=True)

# filter years
filter = (df.index.year >= 2013) & (df.index.year <= 2016) 

# filter columns
df = df.loc[filter,['Temp', 'Humi']]

# resample
frequency = 'D'
df = df.resample(frequency).mean()

df.head()

Unnamed: 0_level_0,Temp,Humi
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-01,6.658333,81.0
2013-01-02,5.354167,83.541667
2013-01-03,7.995833,92.083333
2013-01-04,8.958333,92.458333
2013-01-05,7.554167,97.458333


## Infer Target Variables

In [205]:
#Infer target season 
#German Seasons
#SeasonID; Season Name; Month
#2; Spring; March, April, May
#3; Summer; June, July, August
#4; Autumn; September, October, November
#1; Winter; December, January, February
#spring = range(3, 5)
#summer = range(6, 8)
#fall = range(9, 11)
def season(month):
    if month not in range(1,13):
        return np.NaN
        #raise Exception('month must be integer value in interval [1...12] inclusive. The value of x was: {}'.format(month))
    return (month%12 + 3)//3

# if we use the month as a feature, it should be trivial to predict season from month
df['month'] = df.index.month

# target variable: 
df['season'] = df.index.to_series().apply(lambda m : season(m.month))
df.head()

Unnamed: 0_level_0,Temp,Humi,month,season
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-01-01,6.658333,81.0,1,1
2013-01-02,5.354167,83.541667,1,1
2013-01-03,7.995833,92.083333,1,1
2013-01-04,8.958333,92.458333,1,1
2013-01-05,7.554167,97.458333,1,1


## Display Horizon Chart 

In [103]:
# Change the variable "col" to diplay a different column
col = 'Humi'

dates = df.index.values
source = ColumnDataSource(data=dict(date=dates, close=df[col]))

p = figure(title= col + ": Official Outdoor Observations Braunschweig (City Center) Germany (2013 - 2016)" ,
           plot_height=300, plot_width=800, tools="", toolbar_location=None,
           x_axis_type="datetime", x_axis_location="above",
           background_fill_color="#efefef", x_range=(dates[0], dates[df[col].shape[0]-1]))

p.line('date', 'close', source=source)
p.yaxis.axis_label = 'Celcius'

select = figure(title="Drag the middle and edges of the selection box to change the range above",
                plot_height=130, plot_width=800, y_range=p.y_range,
                x_axis_type="datetime", y_axis_type=None,
                tools="", toolbar_location=None, background_fill_color="#efefef")

range_tool = RangeTool(x_range=p.x_range)
range_tool.overlay.fill_color = "navy"
range_tool.overlay.fill_alpha = 0.2

select.line('date', 'close', source=source)
select.ygrid.grid_line_color = None
select.add_tools(range_tool)
select.toolbar.active_multi = range_tool

show(column(p, select))

In [206]:
# shuffle the data in the dataframe
df = df.sample(frac=1)
df.head()

Unnamed: 0_level_0,Temp,Humi,month,season
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-12-06,3.779167,93.875,12,1
2015-08-16,19.391667,89.0,8,3
2013-03-05,5.1375,65.833333,3,2
2015-11-21,4.291667,84.25,11,4
2016-07-24,22.666667,69.333333,7,3


In [189]:
df.shape

(1461, 4)

In [217]:
dataset = df.values
X = dataset[:, 1:3]
Y = dataset[:,3]

# normalize the data
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
scaler.fit(X)  
X_train = scaler.transform(X)  


# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [222]:
# # define baseline model
def baseline_model():
    # create model : 2 inputs -> [8 hidden nodes] -> 3 outputs
    model = Sequential()
    model.add(Dense(4, input_dim=2, activation='relu'))
    model.add(Dense(4, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [224]:
# train a model
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=100, verbose=0)

In [225]:
# evaluate the model
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

In [226]:
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 24.71% (3.00%)
