# All Seasons - 6 different weather conditions (long time frame - Second-Order Markov Chain)

## Import libraries and dataset

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
# strptime/strftime pattern: four-digit year, month, day separated by hyphens.
# NOTE(review): not referenced by any cell visible in this part of the notebook — confirm it is still needed.
date_format = "%Y-%m-%d"

In [2]:
# Read the weather export and immediately narrow it to the date and the raw condition label
all_seasons = pd.read_csv('Datasets/all_seasons.csv')[['datetime', 'conditions']]

In [3]:
# Preview the first rows of the two retained columns
all_seasons.head()

Unnamed: 0,datetime,conditions
0,2000-01-01,Partially cloudy
1,2000-01-02,Clear
2,2000-01-03,Clear
3,2000-01-04,Clear
4,2000-01-05,Clear


## Classify and separate data

In [4]:
# Lookup from the raw condition label to the snake_case state name used in the rest of the notebook
classifier = {
    'Overcast': 'overcast',
    'Partially cloudy': 'partially_cloudy',
    'Clear': 'clear',
    'Rain, Partially cloudy': 'rain_partially_cloudy',
    'Rain': 'rain',
    'Rain, Overcast': 'rain_overcast',
}

# Labels that are not keys of the lookup come out as NaN
all_seasons['condition'] = all_seasons['conditions'].map(classifier)

In [5]:
# Preview the raw label next to the state name it was mapped to
all_seasons.head()

Unnamed: 0,datetime,conditions,condition
0,2000-01-01,Partially cloudy,partially_cloudy
1,2000-01-02,Clear,clear
2,2000-01-03,Clear,clear
3,2000-01-04,Clear,clear
4,2000-01-05,Clear,clear


In [6]:
# Keep only the date and the mapped state; the raw 'conditions' label is not carried forward
all_seasons = all_seasons[['datetime', 'condition']]

In [7]:
# Preview the frame after dropping the raw label column
all_seasons.head()

Unnamed: 0,datetime,condition
0,2000-01-01,partially_cloudy
1,2000-01-02,clear
2,2000-01-03,clear
3,2000-01-04,clear
4,2000-01-05,clear


In [8]:
# Training window. The dates are compared as strings against the 'datetime' column.
train_start_date = '2002-01-01'
train_end_date = '2017-12-31'
# reset_index() (without drop) renumbers rows from 0 and keeps the old row labels in an 'index' column
all_seasons_train = (
    all_seasons
    .loc[all_seasons['datetime'].between(train_start_date, train_end_date)]
    .reset_index()
)

# Test window, built the same way as the training window
test_start_date = '2018-01-01'
test_end_date = '2021-12-31'
all_seasons_test = (
    all_seasons
    .loc[all_seasons['datetime'].between(test_start_date, test_end_date)]
    .reset_index()
)

## Calculate proportions of conditions & Create transition matrix

In [9]:
# Weather states and the position of each one in `conditions`

# 0: 'clear' - C
# 1: 'partially_cloudy' - PC
# 2: 'overcast' - OV
# 3: 'rain' - R
# 4: 'rain_partially_cloudy' - RPC
# 5: 'rain_overcast' - ROV

conditions = [
    'clear',
    'partially_cloudy',
    'overcast',
    'rain',
    'rain_partially_cloudy',
    'rain_overcast',
]

# Every ordered pair of states written as "first->second"; the first state varies slowest
prev_conditions = [
    older + "->" + newer
    for older in conditions
    for newer in conditions
]
prev_conditions

['clear->clear',
 'clear->partially_cloudy',
 'clear->overcast',
 'clear->rain',
 'clear->rain_partially_cloudy',
 'clear->rain_overcast',
 'partially_cloudy->clear',
 'partially_cloudy->partially_cloudy',
 'partially_cloudy->overcast',
 'partially_cloudy->rain',
 'partially_cloudy->rain_partially_cloudy',
 'partially_cloudy->rain_overcast',
 'overcast->clear',
 'overcast->partially_cloudy',
 'overcast->overcast',
 'overcast->rain',
 'overcast->rain_partially_cloudy',
 'overcast->rain_overcast',
 'rain->clear',
 'rain->partially_cloudy',
 'rain->overcast',
 'rain->rain',
 'rain->rain_partially_cloudy',
 'rain->rain_overcast',
 'rain_partially_cloudy->clear',
 'rain_partially_cloudy->partially_cloudy',
 'rain_partially_cloudy->overcast',
 'rain_partially_cloudy->rain',
 'rain_partially_cloudy->rain_partially_cloudy',
 'rain_partially_cloudy->rain_overcast',
 'rain_overcast->clear',
 'rain_overcast->partially_cloudy',
 'rain_overcast->overcast',
 'rain_overcast->rain',
 'rain_overcast->r

In [10]:
# Adding a column to identify past two states
#
# Row i gets the label "<condition of row i-2>-><condition of row i-1>".
# Shifting the column by 2 and by 1 lines those two earlier values up with row i,
# so the label is built for the whole frame at once instead of one .loc read/write
# per row. The first two rows have no complete two-row history and stay NaN.

all_seasons_train['prev_states'] = (
    all_seasons_train['condition'].shift(2)
    + '->'
    + all_seasons_train['condition'].shift(1)
)

all_seasons_train

Unnamed: 0,index,datetime,condition,prev_states
0,731,2002-01-01,partially_cloudy,
1,732,2002-01-02,rain_partially_cloudy,
2,733,2002-01-03,rain_partially_cloudy,partially_cloudy->rain_partially_cloudy
3,734,2002-01-04,partially_cloudy,rain_partially_cloudy->rain_partially_cloudy
4,735,2002-01-05,partially_cloudy,rain_partially_cloudy->partially_cloudy
...,...,...,...,...
5839,6570,2017-12-27,clear,clear->clear
5840,6571,2017-12-28,clear,clear->clear
5841,6572,2017-12-29,clear,clear->clear
5842,6573,2017-12-30,partially_cloudy,clear->clear


In [11]:
# Creating a count matrix
# transition_counts = prev_conditions x conditions matrix
#
# Entry [i][j] is the number of rows whose prev_states equals prev_conditions[i]
# and whose condition equals conditions[j]. A single cross-tabulation counts every
# (prev_states, condition) pair in one pass; reindexing then forces the fixed row and
# column order and fills pairs that never occur with 0. Rows with a missing
# prev_states are not counted.

transition_counts = (
    pd.crosstab(all_seasons_train['prev_states'], all_seasons_train['condition'])
    .reindex(index=prev_conditions, columns=conditions, fill_value=0)
    .to_numpy(dtype=float)
)

transition_counts

array([[1.174e+03, 4.200e+02, 2.000e+00, 3.200e+01, 7.900e+01, 6.000e+00],
       [1.400e+02, 3.510e+02, 1.200e+01, 5.000e+00, 6.700e+01, 1.500e+01],
       [1.000e+00, 2.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
       [3.300e+01, 8.000e+00, 0.000e+00, 8.000e+00, 4.000e+00, 0.000e+00],
       [3.000e+01, 2.700e+01, 0.000e+00, 1.100e+01, 4.000e+01, 1.000e+01],
       [2.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 4.000e+00, 1.000e+00],
       [3.440e+02, 1.170e+02, 1.000e+00, 1.200e+01, 2.100e+01, 0.000e+00],
       [2.900e+02, 1.027e+03, 5.600e+01, 1.500e+01, 9.600e+01, 2.300e+01],
       [1.000e+00, 5.300e+01, 1.100e+01, 0.000e+00, 1.000e+01, 6.000e+00],
       [1.800e+01, 5.000e+00, 0.000e+00, 3.000e+00, 5.000e+00, 0.000e+00],
       [5.800e+01, 5.600e+01, 3.000e+00, 5.000e+00, 6.800e+01, 1.000e+01],
       [6.000e+00, 9.000e+00, 0.000e+00, 2.000e+00, 2.200e+01, 4.000e+00],
       [2.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
       [1.500e+01, 4.100e

In [12]:
# Turning count matrix into proportions by normalizing across rows

def normalize(arr):
    """Scale a 1-D vector of counts so that its entries sum to 1.

    Parameters
    ----------
    arr : array-like of numbers
        One row of counts. Plain lists/tuples are accepted as well as ndarrays.

    Returns
    -------
    numpy.ndarray
        A new float array equal to ``arr / arr.sum()``. If the total is 0 there is
        nothing to scale and an all-zero float array is returned instead of
        dividing by zero, so such a row sums to 0 rather than 1.
    """
    counts = np.asarray(arr, dtype=float)
    total = counts.sum()
    if total == 0:
        # Return a fresh float array rather than the caller's object:
        # np.apply_along_axis allocates its output from the dtype of the first
        # result, so an integer zero row coming first would otherwise truncate
        # every later row of probabilities to integers.
        return counts.copy()
    return counts / total

# Run normalize over each row (axis 1) of the count matrix to get row-wise proportions
transition_prob = np.apply_along_axis(
    func1d=normalize,
    axis=1,
    arr=transition_counts,
)
transition_prob

array([[0.68534734, 0.24518389, 0.00116754, 0.01868068, 0.04611792,
        0.00350263],
       [0.23728814, 0.59491525, 0.02033898, 0.00847458, 0.11355932,
        0.02542373],
       [0.33333333, 0.66666667, 0.        , 0.        , 0.        ,
        0.        ],
       [0.62264151, 0.1509434 , 0.        , 0.1509434 , 0.0754717 ,
        0.        ],
       [0.25423729, 0.22881356, 0.        , 0.09322034, 0.33898305,
        0.08474576],
       [0.28571429, 0.        , 0.        , 0.        , 0.57142857,
        0.14285714],
       [0.69494949, 0.23636364, 0.0020202 , 0.02424242, 0.04242424,
        0.        ],
       [0.1924353 , 0.6814864 , 0.03715992, 0.00995355, 0.06370272,
        0.01526211],
       [0.01234568, 0.65432099, 0.13580247, 0.        , 0.12345679,
        0.07407407],
       [0.58064516, 0.16129032, 0.        , 0.09677419, 0.16129032,
        0.        ],
       [0.29      , 0.28      , 0.015     , 0.025     , 0.34      ,
        0.05      ],
       [0.13953488, 0

In [13]:
# Verifying rows sum to 1
# A row whose counts are all zero is left as zeros by normalize, so it sums to 0 instead of 1.
row_sums = transition_prob.sum(axis=1)
assert np.all(np.isclose(row_sums, 1) | np.isclose(row_sums, 0)), "a row is neither normalized nor empty"
row_sums

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1.])

In [14]:
# Show only the first row of the test set, i.e. its starting day and condition
all_seasons_test.head(1)

Unnamed: 0,index,datetime,condition
0,6575,2018-01-01,clear


First day of 2018: clear

## The cells below are commented out to avoid errors when running