In [2]:
#-- Loading: Training Dataset.

fids = sorted(glob('./weather_data/*csv'))
#-- Selected Columns.
#-- The training dataset keeps only the variables below because
#--     1. they are the variables common to the yearly files;
#--     2. 'EVAP' is excluded because its number of NaN values varies between years.
jointCols=['Lat', 'Lon', 'AWND', 'PRCP', 'SNOW', 'SNWD', 
           'TAVG', 'TMAX', 'TMIN', 'TOBS', 'WESD', 'WT01', 'WT02', 
           'WT03', 'WT04', 'WT05', 'WT06', 'WT07', 'WT08', 'WT11', 'date']

#-- Collect the yearly frames in a list and concatenate once at the end.
#-- Growing a frame with DataFrame.append inside the loop re-copies all the
#-- accumulated rows on every iteration, and append() was removed in pandas 2.0.
frames = []
for fid in fids:
    #-- The file name is expected to start with the year: '<year>_*.csv'.
    #-- Taking the last path component (instead of a fixed position) keeps this
    #-- working if glob returns backslash-separated paths.
    fname = fid.replace('\\', '/').rsplit('/', 1)[-1]
    year = int(fname.split('_')[0])
    if 1992 <= year <= 2015:
        dd = pd.read_csv(fid, header=0)
        #-- Zero-pad the month so that '%Y%m' always parses a fixed-width 'YYYYMM'.
        month = dd['month'].astype('str').str.zfill(2)
        dd['date'] = pd.to_datetime(str(year) + month, format='%Y%m')
        frames.append(dd[jointCols])

if not frames:
    raise FileNotFoundError("No weather files for 1992-2015 found under './weather_data/'")

df = pd.concat(frames).set_index('date')
#-- Month-resolution string labels ('YYYY-MM'): every row of a month shares one label.
df.index = df.index.strftime('%Y-%m')

In [3]:
#-- Loading: Target Dataset.
#-- Reads the whole 'Fires' table into a DataFrame.
conn = sqlite3.connect('./FPA_FOD_20170508.sqlite')
query = "SELECT * From Fires"
try:
    target = pd.read_sql_query(query, conn)
finally:
    #-- Always release the database handle, even if the query fails.
    conn.close()

In [4]:
#-- California fires only, reduced to the columns of interest;
#-- exact duplicate rows and rows with any missing value are discarded.
fire_cols = ['DISCOVERY_DOY',
             'DISCOVERY_DATE',
             'LATITUDE',
             'LONGITUDE',
             'FIRE_YEAR',
             'FIRE_SIZE_CLASS',
             'STAT_CAUSE_DESCR']
is_california = target['STATE'] == 'CA'
target = target.loc[is_california, fire_cols].drop_duplicates().dropna()

#-- The raw DISCOVERY_DATE is treated as a Julian date: keep it under a new name
#-- and derive a timestamp as the day offset from the Unix epoch.
target = target.rename(columns={'DISCOVERY_DATE': 'DISCOVERY_DATE_julian'})
unix_epoch_julian = pd.Timestamp(0).to_julian_date()
target['DISCOVERY_DATE'] = pd.to_datetime(target['DISCOVERY_DATE_julian'] - unix_epoch_julian, unit='D')

#-- Count fires per (month, size class) and pivot the size classes into columns;
#-- combinations that never occur become 0.
monthly = pd.Grouper(key='DISCOVERY_DATE', freq='1M')
by_size_class = pd.Grouper(key='FIRE_SIZE_CLASS')
target = target.groupby([monthly, by_size_class]).size().unstack(level=1).fillna(0)

#-- Same 'YYYY-MM' labels as the training frame, limited to 1992-2015.
target.index = target.index.strftime('%Y-%m')
target = target.loc['1992-01':'2015-12']

In [5]:
#-- Train/Target : np.array()
#-- One 2-D block (rows x features) per month. Grouping on the index always
#-- yields a 2-D array, even for a month with a single row, whereas
#-- df.loc[date].values would collapse that case to 1-D.
features_by_month = {month: block.values for month, block in df.groupby(level=0, sort=True)}

#-- Pair features and targets explicitly by their 'YYYY-MM' label instead of by
#-- position, so a month missing from either side cannot shift the pairing.
months = [month for month in target.index if month in features_by_month]
if not months:
    raise ValueError('No month is present in both the weather data and the fire counts.')

train = [features_by_month[month] for month in months]
x_train, x_test, y_train, y_test = train_test_split(train, target.loc[months].values, test_size=0.3, random_state=42)

In [6]:
#--- Hidden Layer's Node by Empirical calculation.
#--- 20 matches the number of feature columns and 7 the width of the output layer;
#--- 201 is presumably the number of training samples -- TODO confirm.
n_inputs, n_outputs, n_samples = 20, 7, 201
nodes_low = np.sqrt(n_inputs * n_outputs)
nodes_high = np.sqrt(2 * (n_outputs + 1) * n_samples)
print('Empirical Nodes: [{:f}, {:f}]'.format(nodes_low, nodes_high))

Empirical Nodes: [11.832160, 56.709788]


---
---
**Deep Learning**: Input (Normalization) + Conv2D x 3 + MaxPool2D + Flatten + Dense x 4 + Output (Dense)

In [7]:
def base_model(input_shape=(4410, 20, 1), n_outputs=7, adapt_data=None):
    """Build the (uncompiled) Conv2D regression network.

    Architecture: Normalization -> 3 x Conv2D (128/64/32 filters, 5x5, ReLU)
    -> MaxPool2D(3) -> Flatten -> Dense(64, tanh) -> Dense(32) -> Dense(16)
    -> Dense(32) -> Dense(n_outputs). The Dense layers after the first one
    have no activation, i.e. they are linear.

    Parameters
    ----------
    input_shape : sequence of int, default (4410, 20, 1)
        Shape of one sample, excluding the batch dimension.
    n_outputs : int, default 7
        Number of units in the final Dense layer.
    adapt_data : array-like or None, default None
        If given, the Normalization layer is adapted to this data (it should
        be shaped like the model input, batch dimension included). If None,
        the layer is left un-adapted, exactly as before.
        NOTE(review): an un-adapted Normalization layer most likely passes
        its input through unchanged -- confirm for the installed Keras version.

    Returns
    -------
    keras.Sequential
        The assembled model; the caller is responsible for compile()/fit().
    """
    normalizer = preprocessing.Normalization(input_shape=list(input_shape))
    if adapt_data is not None:
        # Learn mean/variance from the data before the layer is used.
        normalizer.adapt(adapt_data)

    model = keras.Sequential()
    model.add(normalizer)

    # Convolutional feature extractor: shrinking filter counts, same 5x5 kernel.
    for n_filters in (128, 64, 32):
        model.add(layers.Conv2D(n_filters, 5, strides=1, activation='relu'))
    model.add(layers.MaxPool2D(3))
    model.add(layers.Flatten())

    # Fully connected head.
    model.add(layers.Dense(64, activation='tanh'))
    model.add(layers.Dense(32))
    model.add(layers.Dense(16))
    model.add(layers.Dense(32))
    model.add(layers.Dense(n_outputs))
    return model

In [8]:
#-- Two independent, identically structured networks.
model1 = base_model()
model2 = base_model()

#-- Layer-by-layer overview (output shapes and parameter counts) of the first one.
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 4410, 20, 1)       3         
_________________________________________________________________
conv2d (Conv2D)              (None, 4406, 16, 128)     3328      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 4402, 12, 64)      204864    
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 4398, 8, 32)       51232     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 1466, 2, 32)       0         
_________________________________________________________________
flatten (Flatten)            (None, 93824)             0         
_________________________________________________________________
dense (Dense)                (None, 64)                6

In [9]:
#-- Stop when val_loss has not improved for `patience` consecutive epochs.
#-- The patience has to be smaller than the epoch budget: with patience equal
#-- to the number of epochs the callback can never fire.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss',
                                           patience=3,
                                           restore_best_weights=True)

#-- Optimizer = Adam
model1.compile(optimizer='adam',loss='mse',metrics=['mae','mse'])

#-- The Conv2D layers expect a trailing channel axis, hence expand_dims.
#-- The last 20% of the training samples are held out for validation.
history = model1.fit(np.expand_dims(x_train,axis=-1), 
                     np.array(y_train), epochs=10, 
                     validation_split=0.2, 
                     verbose=2, 
                     callbacks=[early_stop])

Epoch 1/10
5/5 - 56s - loss: 44617.4102 - mae: 93.8849 - mse: 44617.4102 - val_loss: 51208.7070 - val_mae: 95.2677 - val_mse: 51208.7070
Epoch 2/10
5/5 - 65s - loss: 43997.9844 - mae: 93.6952 - mse: 43997.9844 - val_loss: 50718.5781 - val_mae: 94.8659 - val_mse: 50718.5781
Epoch 3/10
5/5 - 59s - loss: 43480.8086 - mae: 93.2608 - mse: 43480.8086 - val_loss: 50069.5000 - val_mae: 94.3116 - val_mse: 50069.5039
Epoch 4/10
5/5 - 59s - loss: 42772.6797 - mae: 92.6071 - mse: 42772.6797 - val_loss: 49229.9102 - val_mae: 93.5100 - val_mse: 49229.9102
Epoch 5/10
5/5 - 61s - loss: 41821.1523 - mae: 91.6195 - mse: 41821.1523 - val_loss: 48156.3125 - val_mae: 92.4185 - val_mse: 48156.3125
Epoch 6/10
5/5 - 61s - loss: 40641.8984 - mae: 90.2099 - mse: 40641.8984 - val_loss: 46771.2852 - val_mae: 90.9558 - val_mse: 46771.2852
Epoch 7/10
5/5 - 60s - loss: 39161.8750 - mae: 88.1436 - mse: 39161.8750 - val_loss: 44995.7383 - val_mae: 88.9362 - val_mse: 44995.7383
Epoch 8/10
5/5 - 59s - loss: 37194.7461 -

In [10]:
#-- Predict on the held-out set (with the trailing channel axis added).
x_test_4d = np.expand_dims(x_test, axis=-1)
y_pred = model1.predict(x_test_4d)

#-- Error metrics on the test set; RMSE is derived from the MSE.
test_mae = metrics.mean_absolute_error(y_test, y_pred)
test_mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', test_mae)
print('Mean Squared Error:', test_mse)
print('Root Mean Squared Error:', np.sqrt(test_mse))

Mean Absolute Error: 76.4205920105595
Mean Squared Error: 30781.42270323913
Root Mean Squared Error: 175.44635277838958


In [11]:
#-- Sanity check: sum of all predicted values, treating NaNs as zero.
prediction_total = np.nansum(y_pred)
prediction_total

18213.084

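Because   
 1. the nansum(prediction) is equal to 0 (note: the output saved above shows 18213.084, so this statement should be re-checked against the latest run), and this simple Conv2D model would take more time to tune;
 2. the dataset does not lend itself well to a deep learning approach either.
 
Decided to **stop** here. 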