# Task

### Create a feed forward neural network which is capable of predicting the the mean temperature in Budapest 1, 7 and 28 days in advance.

##### Resources
I used two sources for the data

- [met.hu](https://www.met.hu/eghajlat/magyarorszag_eghajlata/eghajlati_adatsorok/Budapest/adatok/napi_adatok/): data from january 1901 to december 2019. This was the data I used to train my neural network. I could download this data as a .csv file. Due to the website being hungarian, the data was separated by a semicolon.

- [metnet.hu](https://www.metnet.hu/napi-adatok?sub=4&pid=10602&date=2020-10-22): data from august 2009 to today. There wasn't a download option for this website, so I scraped the data with beautifulsoup.

In [44]:
# import statements for weather scraper

import bs4 as bs
import datetime as dt
import requests
import re
import os.path
from os import path

Data from [metnet.hu](https://www.metnet.hu/napi-adatok?sub=4&pid=10602&date=2020-10-22) can be easily scraped. The url looks like this: https://www.metnet.hu/napi-adatok?sub=4&pid=10602&date=2020-10-22. All the scraper has to do is insert a new date at the end of the url every month. It's not needed to do every day, because the data for the whole month is organized in a table, so a url ending in `2020-10-22` and `2020-10-23` yields the same website.

In [45]:
class WeatherScraper:
  '''class for scraping weather data from the internet, configured for metnet.hu'''

    def __init__(self, last_date, url, data_file):
        self.url = url  # website url
        self.data_file = data_file  # file to save data to
        
        # check wether data file contains data,
        # if it does, then use the latest date, if not, then use the date given in params
        self.last_date = self.get_last_date_from_file() or last_date   
        
        # date when function was run, saved to variable, so problems don't occur when running func at midnight
        self.today = dt.date.today()  
        
        # list that contains the scraped data
        self.weather_data_list = []
        

    def run(self):  # main function of the class
      '''scrape weather data from website, if done or error, save to file'''
        prev_date = dt.date(9999, 1, 1)  # set year to 9999, so it will always be overwritten
        
        for date in self.get_dates():  # dates in date generator
            if (prev_date.strftime('%Y%m') != date.strftime('%Y%m')):   # if new month
                prev_date = date
                url = f"{self.url}{date.strftime('%Y-%m-01')}"          # create new url
                source = requests.get(url).text                         # get html source
                soup = bs.BeautifulSoup(source, "lxml")                 # create soup
                weather_table = soup.find("table", {"class": "table"})  # find table in website
            
            date_str = date.strftime('%d.').lstrip('0')  # format number into a the format used in the table
            # find parent of the cell containing the date, this gives a row
            weather_table_row = weather_table.find("td", text=date_str).parent  
            
            # initialize variables
            temp_count = 0
            avg_temp = 0
            for temp in weather_table_row.find_all('span', text=re.compile(r' °C$')):  # find cells containing '°C'
                temp_count += 1
                # replace comma in float with period and add to avg_temp
                avg_temp += float(temp.text[:-3].replace(',', '.'))  
            
            if(temp_count == 2):  # if the scraper found two temps, then everything is fine
                avg_temp /= 2
                self.weather_data_list.append([date.strftime('%Y-%m-%d'), avg_temp])
            else:  # else print error message and stop process
                avg_temp = None
                print(f"{date.strftime('%Y-%m-%d')}: Could not calculate average temp")
                break
        self.update_data()  # write scraped data to file
        
    
    def get_dates(self):
      '''generate dates from the (last date, current date], last date excluded, current date included'''
        i = 0
        d1 = self.last_date
        d2 = self.today
        date_diff = abs((d2 - d1).days)

        while(self.last_date < self.today):  # generate dates
            i += 1
            if(not i%((date_diff//20)+1) or i == date_diff):
                print(f'{i}/{date_diff}\t{i*100/date_diff:.2f}%')  # print progress

            self.last_date += dt.timedelta(days=1)  # next date
            yield self.last_date
           

    def get_last_date_from_file(self):
      '''retrieve last date from csv file'''
        if(path.isfile(self.data_file)):
            with open(self.data_file, 'r') as data_file:  # open file
                lines = data_file.read().splitlines()     # create list of lines
                last_data = lines[-1]                     # get last line

            last_data_list = last_data.split(',')         # split with separator
            last_date = last_data_list[0]                 # get date from position 0
            if(last_date == 'date'):                      # if file only contains the header then return None
                return None
            return dt.datetime.strptime(last_date, '%Y-%m-%d').date()  # else return date
        else:
            with open(self.data_file, 'w') as data_file:  # else create new file containing only the headers
                data_file.write('date,avg_temp')
            return None


    def update_data(self):
      '''organize scraped data in a format consistent with a csv file'''
        separator = '\n'
        rows = self.get_rows_as_string()
        write_string = separator.join(rows)  # join rows with a newline between them
        with open(self.data_file, 'a', encoding="utf-8") as data_file:  # open data file
            data_file.write("\n" + write_string)  # append data to file
        
        
    def get_rows_as_string(self):
      '''format date and average temperature as string, seperated by comma'''
        separator = ','
        for row in self.weather_data_list:
            yield f'{row[0]},{row[1]:.1f}'

In [46]:
url_weather = "https://www.metnet.hu/napi-adatok?sub=4&pid=10602&date="  # met.hu url, only needs date appended
from_date = dt.date(2009,8,1) # data for Budapest is available from august 2009

As of 2020-10-26 the data from 2010-06-30 is incomplete, missing a max temp. So ws.run stops here, I added a made up number for this date, then ran it again. Also it contains data up to 2020-10-25.

In [130]:
ws = WeatherScraper(from_date, url_weather, 'metnet_2009_08_2020_10.csv')  # create weather scraper
ws.run()  # run weatherscraper

189/3771	5.01%
378/3771	10.02%
567/3771	15.04%
756/3771	20.05%
945/3771	25.06%
1134/3771	30.07%
1323/3771	35.08%
1512/3771	40.10%
1701/3771	45.11%
1890/3771	50.12%
2079/3771	55.13%
2268/3771	60.14%
2457/3771	65.16%
2646/3771	70.17%
2835/3771	75.18%
3024/3771	80.19%
3213/3771	85.20%
3402/3771	90.21%
3591/3771	95.23%
3771/3771	100.00%
2020-10-26: Could not calculate average temp


Gathering the data is complete, now on to the  deep learning part. For this I followed a tutorial on [towardsdatascience.com](https://towardsdatascience.com/weather-forecasting-a-deep-learning-approach-7ecddff0fa71). The structure of my model is the same as the one in the post, but I changed a few hyperparameters to fit the model better to my data. Also I shuffled my data in the preprocessing process, and just generally tried to do my own thing.

In [7]:
# import statements for neural network

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional
import keras
import datetime as dt
import random

In [116]:
DATA_DIR = 'met_1901_2019.csv'  # csv file with the data

# read the csv file into a pandas dataframe, note the data is seperated by semicolons
df = pd.read_csv(DATA_DIR, sep=';') 

# calculate the daily average temperature as the mean of the min and max temperatures
# I know the datafram contains a daily average, but this was specified in the task
df['avg_temp'] = df[['d_tx', 'd_tn']].mean(axis=1)  

df.rename(columns = {'#datum':'date'}, inplace = True)

df.head()
# date: date
# d_ta: daily average temp (measured)
# d_tx: daily max temp (measured)
# d_tn: daily min temp (measured)
# d_rs: daily precipitation (measured)
# d_rf: daily precipitation type {snow, rain, etc} (measured)
# d_ss: daily sunshine in hours (measured)
# d_ssr: daily sunshine in j/cm^2 (measured)
# avg_temp: daily average temp (calculated)

Unnamed: 0,date,d_ta,d_tx,d_tn,d_rs,d_rf,d_ss,d_ssr,avg_temp
0,1901-01-01,-5.7,-0.4,-9.2,1.9,4.0,,,-4.8
1,1901-01-02,-9.3,-6.6,-11.3,0.0,,,,-8.95
2,1901-01-03,-9.1,-6.6,-10.8,0.8,4.0,,,-8.7
3,1901-01-04,-11.0,-9.8,-12.4,0.2,4.0,,,-11.1
4,1901-01-05,-11.1,-9.0,-15.5,0.0,,,,-12.25


In [9]:
# delete unused columns
del df['d_ta']
del df['d_tx']
del df['d_tn']
del df['d_rs']
del df['d_rf']
del df['d_ss']
del df['d_ssr']

df.head()

Unnamed: 0,date,avg_temp
0,1901-01-01,-4.8
1,1901-01-02,-8.95
2,1901-01-03,-8.7
3,1901-01-04,-11.1
4,1901-01-05,-12.25


In [10]:
df = df[-7300:]  # keep data from the last 20 years [hyperparameter]

In [12]:
x = []  # list containing the inputs
y = []  # list containing the outputs

prediction_days = [1, 7, 4*7]  # 1, 7, and 28 days in the future, as specified in the task
num_of_days = len(prediction_days)  # output size
memory_size = 180  # input size [hyperparameter]

# create inputs with data from the last 180 days, and outputs with the target predictions
for i in range(0, len(df.index) - memory_size - max(prediction_days) + 1):
  x.append(df.values[i:i + memory_size, 1].astype(dtype=np.float64))
  
  y_array = np.empty(num_of_days, dtype=np.float64)
  for index, day in enumerate(prediction_days):
    y_array[index] = df.values[i + memory_size + day - 1:i + memory_size + day, 1][0]
  y.append(y_array)

In [13]:
# shuffle the input and output list in the same fashion
random.seed(4)
random.shuffle(x)
random.seed(4)
random.shuffle(y)

print(len(x))  # total_data - size_of_memory - prediction_farthest_in_the_future + 1 = length of dataset
print(len(y))  # 7300 - 180 - 28 + 1 = 7093

7093
7093


In [14]:
train_split = 0.9  # proportion of the set to train the neural network on
split_idx = int(len(x) * train_split)  # get split index

x_train = x[:split_idx]  # split train input
y_train = y[:split_idx]  # split train output

x_test = x[split_idx:]  # split test input
y_test = y[split_idx:]  # split test output

# delete x and y, not needed anymore
del x
del y

print(len(x_train))
print(len(y_train))
print(len(x_test))
print(len(y_test))

6383
6383
710
710


In [15]:
# format train data to be usable by the neural network

x_train , y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0] , x_train.shape[1], 1))

In [27]:
x_train.shape

(6383, 180, 1)

In [18]:
EPOCHS = 500     # num of epochs [hyperparameter]
BATCH_SIZE = 64  # batch size [hyperparameter]


# created a regressor with the same structure as the one ion the blog post
# I increased the units in each a LSTM layer to account for the longer input size a bit more
regressor = Sequential()

regressor.add(Bidirectional(LSTM(units=90, return_sequences=True, input_shape = (x_train.shape[1], 1))))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units=60, return_sequences=True))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units=40, return_sequences=True))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units=30))
regressor.add(Dropout(0.2))
regressor.add(Dense(units = num_of_days, activation='relu'))

regressor.compile(optimizer='adam', loss='mean_squared_error', metrics=['acc'])

regressor.fit(x_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fc40a80c710>

Training is complete, and we have a 78% accuracy rating. Obviously this is far from perfect, but maybe it can provide som useful results.

In [117]:
regressor.save('regressors/trained_model_1')


INFO:tensorflow:Assets written to: regressors/trained_model_1/assets


In [118]:
regressor = keras.models.load_model('regressors/trained_model_1')

In [119]:
print(regressor.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 180, 180)          66240     
_________________________________________________________________
dropout (Dropout)            (None, 180, 180)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 180, 60)           57840     
_________________________________________________________________
dropout_1 (Dropout)          (None, 180, 60)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 180, 40)           16160     
_________________________________________________________________
dropout_2 (Dropout)          (None, 180, 40)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 30)                8

In [35]:
# test a bit on the test dataset

x_test , y_test = np.array(x_test), np.array(y_test)
x_test = np.reshape(x_test, (x_test.shape[0] , x_test.shape[1], 1))

In [36]:
x_test.shape

(710, 180, 1)

Now for the moment of truth. As you can see in the cell below, the predictions aren't perfect, but I'm very happy, with how close most of them are to the actual temperatures.

In [125]:
# lets see a few predictions
num_preds = 20

predicted_temperature = regressor.predict(x_test[:num_preds])
print('day+1_pred\tday+7_pred\tday+28_pred\t||\tday+1_actual\tday+7_actual\tday+28_actual')
for i in range(num_preds):
  print(f'{predicted_temperature[i][0]:.1f}\t\t{predicted_temperature[i][1]:.1f}\t\t{predicted_temperature[i][2]:.1f}\t\t||\t{y_test[i][0]:.1f}\t\t{y_test[i][1]:.1f}\t\t{y_test[i][2]:.1f}')


day+1_pred	day+7_pred	day+28_pred	||	day+1_actual	day+7_actual	day+28_actual
27.4		19.6		20.5		||	28.7		15.8		20.7
8.8		4.7		7.5		||	7.1		3.5		11.4
19.0		17.0		23.7		||	22.3		15.9		22.1
8.4		6.4		1.6		||	5.8		4.6		3.4
3.8		0.3		4.7		||	2.0		3.1		4.4
0.0		0.0		7.8		||	-5.0		2.9		2.2
21.9		25.2		21.0		||	23.1		23.1		26.7
6.9		9.6		6.5		||	9.8		8.4		6.9
25.6		26.0		17.8		||	24.2		26.9		14.0
5.0		6.1		0.0		||	3.2		5.0		-1.0
7.2		2.6		0.0		||	6.0		3.6		2.3
6.4		5.1		7.4		||	7.2		7.4		14.6
25.5		16.7		14.9		||	28.1		15.9		14.8
5.7		4.6		0.0		||	4.8		7.3		-2.3
20.4		15.0		18.5		||	15.0		18.9		19.8
0.0		0.0		0.0		||	0.7		0.4		0.4
20.9		21.3		23.5		||	20.4		22.6		26.3
5.2		2.8		4.8		||	3.8		4.5		9.3
23.2		17.9		12.2		||	19.5		16.9		10.4
2.9		8.1		14.2		||	9.0		8.6		14.4


In [131]:
def predict_days_to_date(to_date, mem_size=180, data_file='metnet_2009_08_2020_10.csv'):
  '''predict daily average values up to the date specified'''
  ws = WeatherScraper(None, None, data_file)
  last_date = ws.last_date  # gat last available date from file

  d1 = last_date
  d2 = dt.datetime.strptime(to_date, '%Y-%m-%d').date()
  date_diff = abs((d2 - d1).days)  # calculate the date difference between last date and the param date

  df = pd.read_csv(data_file)  # read csv file into dataframe
  # create input array of size mem_size filled with the last rows of the dataframe as float64
  df = df[-mem_size:]
  x = df.values[:,1].astype(dtype=np.float64)  
  
  for i in range(date_diff):  # for every missing day
    # format input array to be able to be understood by the neural network
    x_inp = np.array([x])
    x_inp = np.reshape(x, (1 , x.shape[0], 1))

    # predict weather for missing date
    predicted_temperature = regressor.predict(x_inp)

    # shift input array left, push new day data (fifo structure)
    x[:-1] = x[1:]
    x[-1] = predicted_temperature[0][0]

  # x is now an input array of size mem_size with the data for the missing days filled in
  x_inp = np.array([x])
  x_inp = np.reshape(x, (1 , x.shape[0], 1))
  return x_inp

In [133]:
# predict weather for the days specified in the task
predicted_temperature = regressor.predict(predict_days_to_date('2020-10-27'))
print(f'date\t\taverage temperature')
print(f'2020-10-28\t{predicted_temperature[0][0]:.1f} °C')
print(f'2020-11-03\t{predicted_temperature[0][1]:.1f} °C')
print(f'2020-11-24\t{predicted_temperature[0][2]:.1f} °C')

date		average temperature
2020-10-28	12.4 °C
2020-11-03	14.4 °C
2020-11-24	7.0 °C


## Conclusion

I am very happy with how this turned out. There are definitely areas to improve on. But an almost 80% accuracy is better than i would have anticipated.