# Import libraries and dataset

In [69]:
import pandas as pd
import numpy as np
from datetime import datetime
date_format = "%Y-%m-%d"

In [70]:
all_seasons = pd.read_csv('Datasets/all_seasons.csv')
all_seasons = all_seasons[['datetime', 'conditions']]

In [71]:
all_seasons.head()

Unnamed: 0,datetime,conditions
0,2000-01-01,Partially cloudy
1,2000-01-02,Clear
2,2000-01-03,Clear
3,2000-01-04,Clear
4,2000-01-05,Clear


# Classify and separate data

In [72]:
simplifier = {'Overcast':'no_rain', 'Partially cloudy':'no_rain', 'Clear':'no_rain', 'Rain, Partially cloudy':'rain', 'Rain':'rain', 'Rain, Overcast':'rain'}

all_seasons['condition'] = all_seasons['conditions'].map(simplifier)

In [73]:
all_seasons.head()

Unnamed: 0,datetime,conditions,condition
0,2000-01-01,Partially cloudy,no_rain
1,2000-01-02,Clear,no_rain
2,2000-01-03,Clear,no_rain
3,2000-01-04,Clear,no_rain
4,2000-01-05,Clear,no_rain


In [74]:
all_seasons = all_seasons[['datetime', 'condition']]

In [75]:
all_seasons.head()

Unnamed: 0,datetime,condition
0,2000-01-01,no_rain
1,2000-01-02,no_rain
2,2000-01-03,no_rain
3,2000-01-04,no_rain
4,2000-01-05,no_rain


In [76]:
train_start_date = '2002-01-01'
train_end_date = '2017-12-31'
all_seasons_train = all_seasons.loc[all_seasons['datetime'].between(train_start_date, train_end_date)]
all_seasons_train = all_seasons_train.reset_index()

test_start_date = '2018-01-01'
test_end_date = '2021-12-31'
all_seasons_test = all_seasons.loc[all_seasons['datetime'].between(test_start_date, test_end_date)]
all_seasons_test = all_seasons_test.reset_index()

# Calculate proportions of conditions & Create transition matrix

We will refer to rain is 'R' and no rain as 'N'

In [77]:
# Initialize count variables
R_after_R_count = 0.0
N_after_R_count = 0.0

R_after_N_count = 0.0
N_after_N_count = 0.0

In [78]:
all_seasons_train

Unnamed: 0,index,datetime,condition
0,731,2002-01-01,no_rain
1,732,2002-01-02,rain
2,733,2002-01-03,rain
3,734,2002-01-04,no_rain
4,735,2002-01-05,no_rain
...,...,...,...
5839,6570,2017-12-27,no_rain
5840,6571,2017-12-28,no_rain
5841,6572,2017-12-29,no_rain
5842,6573,2017-12-30,no_rain


In [79]:
# Count conditions

all_seasons_train['condition_shift'] = all_seasons_train['condition'].shift(-1)

for i in range(len(all_seasons_train)):
    if all_seasons_train.loc[i, 'condition'] == 'rain' and all_seasons_train.loc[i, 'condition_shift'] == 'rain':
        R_after_R_count += 1
    elif all_seasons_train.loc[i, 'condition'] == 'no_rain' and all_seasons_train.loc[i, 'condition_shift'] == 'rain':
        N_after_R_count += 1
    elif all_seasons_train.loc[i, 'condition'] == 'rain' and all_seasons_train.loc[i, 'condition_shift'] == 'no_rain':
        R_after_N_count += 1
    elif all_seasons_train.loc[i, 'condition'] == 'no_rain' and all_seasons_train.loc[i, 'condition_shift'] == 'no_rain':
        N_after_N_count += 1

In [80]:
current_R_total = R_after_R_count + N_after_R_count
current_N_total = R_after_N_count + N_after_N_count

In [81]:
R_after_R_prob = R_after_R_count / current_R_total
N_after_R_prob = N_after_R_count / current_R_total

R_after_N_prob = R_after_N_count / current_N_total
N_after_N_prob = N_after_N_count / current_N_total

In [82]:
# Printing our probabilities for 2x2 transition matrix:
print(R_after_R_prob)
print(N_after_R_prob)
print(R_after_N_prob)
print(N_after_N_prob)

0.4674887892376682
0.5325112107623319
0.09594021409816199
0.904059785901838


In [83]:
# Checking that each row in the transition matrix adds up to 1:
print(R_after_R_prob + N_after_R_prob)
print(R_after_N_prob + N_after_N_prob)

1.0
1.0


In [84]:
# Creating the transition matrix:
transition_name = [['RR', 'RN'], ['RN', 'NN']]
transition_matrix = [[R_after_R_prob, N_after_R_prob], [R_after_N_prob, N_after_N_prob]]
print(transition_matrix)

[[0.4674887892376682, 0.5325112107623319], [0.09594021409816199, 0.904059785901838]]


In [85]:
t_array = np.array(transition_matrix)
print(t_array)

[[0.46748879 0.53251121]
 [0.09594021 0.90405979]]


In [86]:
state = {0:'Rain', 1:'No Rain'}

In [87]:
print(all_seasons_test.head(1))

   index    datetime condition
0   6575  2018-01-01   no_rain


First Day of 2018: No Rain

In [89]:
n = len(all_seasons_test) #how many steps to test
start_state = 1 #1 = No Rain
test_result = all_seasons_test[['datetime']]

prev_state = start_state
result = []
while n-1:
    curr_state = np.random.choice([0,1], p=t_array[prev_state]) #taking the probability from the transition matrix
    result.append(state[curr_state])
    prev_state = curr_state
    n -= 1

curr_state = np.random.choice([0,1], p=t_array[prev_state]) #taking the probability from the transition matrix
result.append(state[curr_state])

test_result['result_condition'] = result
print(len(result))
print(len(test_result))
print(test_result)

1461
1461
        datetime result_condition
0     2018-01-01          No Rain
1     2018-01-02          No Rain
2     2018-01-03          No Rain
3     2018-01-04             Rain
4     2018-01-05          No Rain
...          ...              ...
1456  2021-12-27          No Rain
1457  2021-12-28          No Rain
1458  2021-12-29          No Rain
1459  2021-12-30          No Rain
1460  2021-12-31          No Rain

[1461 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_result['result_condition'] = result
