In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline  
from sklearn.metrics import normalized_mutual_info_score

In [2]:
# Load the electricity price/demand records from the CSV in the working
# directory and preview the first rows.
price = pd.read_csv('price_demand_data.csv')
price.head()

Unnamed: 0,REGION,SETTLEMENTDATE,TOTALDEMAND,PRICECATEGORY
0,VIC1,1/01/2021 0:30,4179.21,LOW
1,VIC1,1/01/2021 1:00,4047.76,LOW
2,VIC1,1/01/2021 1:30,3934.7,LOW
3,VIC1,1/01/2021 2:00,3766.45,LOW
4,VIC1,1/01/2021 2:30,3590.37,LOW


##### We need to summarize the data by day, because we don’t need to know the demand for each time interval within a day. So we strip the time of day from the SETTLEMENTDATE column and keep only the date.

In [3]:
# Keep only the date part of values such as "1/01/2021 0:30" by cutting at
# the first space.
# Stripping trailing digits/colons (the previous approach) is not idempotent:
# re-running the cell on an already-cleaned "1/01/2021" would strip the year
# and leave "1/01/". Splitting on the space leaves a clean value unchanged.
price['SETTLEMENTDATE'] = price['SETTLEMENTDATE'].str.split(' ', n=1).str[0]

##### Let's check what we have in the REGION column.

In [4]:
# Summarise REGION (count, number of unique values, most frequent value) to
# see whether the column carries any information.
price['REGION'].describe()

count     11664
unique        1
top        VIC1
freq      11664
Name: REGION, dtype: object

##### All the data is for VIC1, so we drop the REGION column.

##### We will use Date as the index in both DataFrames so that we can merge them later; before doing that, we rename SETTLEMENTDATE to Date.

In [5]:
# Drop the constant REGION column and rename the date column to 'Date',
# the same name the weather data uses.
price = (
    price
    .drop(columns=['REGION'])
    .rename(columns={'SETTLEMENTDATE': 'Date'})
)
# NOTE: the former bare `price.set_index('Date')` call was removed. Its return
# value was discarded, so it never changed `price`. 'Date' stays a regular
# column here and becomes the index when the data is grouped by 'Date'.
price.head()

Unnamed: 0,Date,TOTALDEMAND,PRICECATEGORY
0,1/01/2021,4179.21,LOW
1,1/01/2021,4047.76,LOW
2,1/01/2021,3934.7,LOW
3,1/01/2021,3766.45,LOW
4,1/01/2021,3590.37,LOW


##### To find the maximum daily price category, we replace 'LOW' with 1, 'MEDIUM' with 2, 'HIGH' with 3 and 'EXTREME' with 4 so that the categories can be compared numerically; afterwards we will map the numbers back to the original labels.

In [6]:
# Encode the ordered price categories as integers so that a daily 'max'
# selects the most severe category.
# One mapping applied to PRICECATEGORY only replaces four frame-wide
# replace() calls, which scanned every column and would have rewritten a
# matching string anywhere in the frame.
# Series.replace leaves values that are not keys untouched, so re-running
# this cell on already-encoded data is harmless.
CATEGORY_TO_RANK = {'LOW': 1, 'MEDIUM': 2, 'HIGH': 3, 'EXTREME': 4}
price['PRICECATEGORY'] = price['PRICECATEGORY'].replace(CATEGORY_TO_RANK)

In [10]:
# Preview the frame to confirm PRICECATEGORY now holds the integer codes.
price.head()

Unnamed: 0,Date,TOTALDEMAND,PRICECATEGORY
0,1/01/2021,4179.21,1
1,1/01/2021,4047.76,1
2,1/01/2021,3934.7,1
3,1/01/2021,3766.45,1
4,1/01/2021,3590.37,1


##### Now we group by date and aggregate with 'TOTALDEMAND': 'max' and 'PRICECATEGORY': 'max'. For total demand we think the daily maximum is the right summary, rather than the sum of all the demand readings in a day. We rename TOTALDEMAND to Max_Demand.

In [11]:

# Collapse the records to one row per date: the peak demand of the day
# (stored as Max_Demand) and the highest integer-coded price category.
price1 = price.groupby('Date').agg(
    Max_Demand=('TOTALDEMAND', 'max'),
    PRICECATEGORY=('PRICECATEGORY', 'max'),
)

price1.head()

Unnamed: 0_level_0,Max_Demand,PRICECATEGORY
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1/01/2021,5019.64,1
1/02/2021,5228.29,2
1/03/2021,5225.37,1
1/04/2021,5807.02,2
1/05/2021,5261.09,2


##### Now we change the values of 'PRICECATEGORY' to original ones:

In [12]:
# Map the integer codes back to their category labels.
# The mapping is applied to PRICECATEGORY only. The previous frame-wide
# replace() calls also scanned Max_Demand, so a demand value equal to 1, 2, 3
# or 4 would have been silently turned into a label string.
# Series.replace leaves non-matching values untouched, so re-running this cell
# on already-decoded labels is harmless.
RANK_TO_CATEGORY = {1: 'LOW', 2: 'MEDIUM', 3: 'HIGH', 4: 'EXTREME'}
price1['PRICECATEGORY'] = price1['PRICECATEGORY'].replace(RANK_TO_CATEGORY)

In [16]:
# Preview the daily summary to confirm PRICECATEGORY shows labels again.
price1.head()

Unnamed: 0_level_0,Max_Demand,PRICECATEGORY
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1/01/2021,5019.64,LOW
1/02/2021,5228.29,MEDIUM
1/03/2021,5225.37,LOW
1/04/2021,5807.02,MEDIUM
1/05/2021,5261.09,MEDIUM


##### Let's have a look at the second DataFrame, which holds the weather data:

In [17]:
# Load the daily weather observations from the CSV in the working directory.
weather = pd.read_csv('weather_data.csv')

In [18]:
# Preview the first rows to see which weather columns are available.
weather.head()

Unnamed: 0,Date,Minimum temperature (°C),Maximum temperature (°C),Rainfall (mm),Evaporation (mm),Sunshine (hours),Direction of maximum wind gust,Speed of maximum wind gust (km/h),Time of maximum wind gust,9am Temperature (°C),...,9am cloud amount (oktas),9am wind direction,9am wind speed (km/h),9am MSL pressure (hPa),3pm Temperature (°C),3pm relative humidity (%),3pm cloud amount (oktas),3pm wind direction,3pm wind speed (km/h),3pm MSL pressure (hPa)
0,1/01/2021,15.6,29.9,0.0,2.8,9.3,NNE,31.0,13:14,19.2,...,6,N,2,1018.8,28.1,43,5.0,E,13,1015.3
1,2/01/2021,18.4,29.0,0.0,9.4,1.3,NNW,30.0,8:22,23.3,...,7,NNW,17,1013.3,28.7,38,7.0,SW,4,1008.5
2,3/01/2021,17.0,26.2,12.6,4.8,7.1,WSW,33.0,17:55,18.3,...,8,WSW,4,1007.7,23.5,59,4.0,SSW,2,1005.2
3,4/01/2021,16.0,18.6,2.6,3.8,0.0,SSE,41.0,16:03,16.2,...,8,SSE,11,1010.0,18.2,82,8.0,SSW,17,1011.0
4,5/01/2021,15.9,19.1,11.2,1.0,0.0,SSE,35.0,11:02,17.2,...,8,SSE,13,1012.5,18.2,82,8.0,SSE,19,1013.3


##### We calculate the average temperature and the temperature range for each day and add them to the DataFrame:

In [19]:
# Derive two per-day features from the recorded extremes: the midpoint of the
# minimum and maximum temperature, and the spread between them.
daily_min = weather['Minimum temperature (°C)']
daily_max = weather['Maximum temperature (°C)']
weather['Average temperature (°C)'] = (daily_min + daily_max) / 2
weather['Temperature range (°C)'] = daily_max - daily_min

##### It looks like some columns don't affect electricity demand, so let's keep only those that might help us.

In [22]:
# Keep the date plus the temperature and sunshine columns used in the analysis.
selected_columns = [
    'Date',
    'Minimum temperature (°C)',
    'Maximum temperature (°C)',
    'Average temperature (°C)',
    'Temperature range (°C)',
    'Sunshine (hours)',
]
weather1 = weather[selected_columns]

##### We set Date as the index so that the two DataFrames can be merged on it later.

In [23]:
# set_index returns a new frame, so the result is assigned back; 'Date'
# becomes the index used for the merge with the daily demand summary.
weather1 = weather1.set_index('Date')

In [24]:
# Preview the reduced weather frame, now indexed by Date.
weather1.head()

Unnamed: 0_level_0,Minimum temperature (°C),Maximum temperature (°C),Average temperature (°C),Temperature range (°C),Sunshine (hours)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1/01/2021,15.6,29.9,22.75,14.3,9.3
2/01/2021,18.4,29.0,23.7,10.6,1.3
3/01/2021,17.0,26.2,21.6,9.2,7.1
4/01/2021,16.0,18.6,17.3,2.6,0.0
5/01/2021,15.9,19.1,17.5,3.2,0.0


In [None]:
# Inner-join the weather features with the daily demand/price summary on
# their Date indexes; only dates present in both frames are kept.
result = weather1.merge(
    price1,
    how='inner',
    left_index=True,
    right_index=True,
)

In [None]:
# Display the merged frame (row count before dropping missing values).
result

##### We can't calculate mutual information if there are NaN values in the DataFrame, so let's drop every row that contains a NaN.

In [None]:
# Discard every row that has at least one missing value (dropna's defaults:
# operate on rows, drop when any value is missing).
result = result.dropna()

In [None]:
# Display the frame again to compare the row count after dropping NaN rows.
result

##### Only one row contained a NaN, so dropping it won't have much effect on our model.

In [None]:
# Scatter of daily peak demand against average temperature for all days,
# followed by the correlation from Series.corr (Pearson by default).
temperature = result['Average temperature (°C)']
demand = result['Max_Demand']
plt.scatter(temperature, demand)
plt.show()

print("Pearson r is ", temperature.corr(demand))

In [None]:
# Discretise peak demand into 4 intervals so it can be used as a label.
result['Max_Demand_binned'] = pd.cut(x=result['Max_Demand'], bins=4)

In [None]:
# Discretise average temperature into 4 intervals so it can be used as a label.
result['Average temperature (°C)_binned'] = pd.cut(
    x=result['Average temperature (°C)'],
    bins=4,
)

In [None]:
# Preview the frame with the two new binned columns.
result.head()

In [None]:
# Normalised mutual information between binned peak demand and binned
# average temperature over all days.
normalized_mutual_info_score(
    labels_true=result['Max_Demand_binned'],
    labels_pred=result['Average temperature (°C)_binned'],
    average_method='min',
)

In [None]:
# Subset of days whose average temperature is at or below 18 °C.
# .copy() makes result1 an independent frame: later cells add binned columns
# to it, and writing into a bare slice of `result` raises pandas'
# SettingWithCopyWarning.
result1 = result.loc[result['Average temperature (°C)'] <= 18].copy()
result1.head()

In [None]:
# Scatter of peak demand against average temperature for the result1 subset,
# followed by the correlation from Series.corr (Pearson by default).
temperature = result1['Average temperature (°C)']
demand = result1['Max_Demand']
plt.scatter(temperature, demand)
plt.show()

print("Pearson r is ", temperature.corr(demand))

In [None]:
# Re-bin peak demand into 4 intervals over the range seen in result1 only.
result1['Max_Demand_binned'] = pd.cut(x=result1['Max_Demand'], bins=4)

In [None]:
# Re-bin average temperature into 4 intervals over the range seen in result1 only.
result1['Average temperature (°C)_binned'] = pd.cut(
    x=result1['Average temperature (°C)'],
    bins=4,
)

In [None]:
# Preview result1 with its re-computed binned columns.
result1.head()

In [None]:
# Normalised mutual information between binned peak demand and binned
# average temperature within result1.
normalized_mutual_info_score(
    labels_true=result1['Max_Demand_binned'],
    labels_pred=result1['Average temperature (°C)_binned'],
    average_method='min',
)

In [None]:
# Subset of days whose average temperature is above 18 °C.
# The comparison is strict (`>`): result1 already takes `<= 18`, so using
# `>= 18` here put days at exactly 18 °C into both subsets.
# .copy() makes result2 an independent frame: later cells add binned columns
# to it, and writing into a bare slice of `result` raises pandas'
# SettingWithCopyWarning.
result2 = result.loc[result['Average temperature (°C)'] > 18].copy()
result2.head()

In [None]:
# Scatter of peak demand against average temperature for the result2 subset,
# followed by the correlation from Series.corr (Pearson by default).
temperature = result2['Average temperature (°C)']
demand = result2['Max_Demand']
plt.scatter(temperature, demand)
plt.show()

print("Pearson r is ", temperature.corr(demand))

In [None]:
# Re-bin peak demand into 4 intervals over the range seen in result2 only.
result2['Max_Demand_binned'] = pd.cut(x=result2['Max_Demand'], bins=4)

In [None]:
# Re-bin average temperature into 4 intervals over the range seen in result2 only.
result2['Average temperature (°C)_binned'] = pd.cut(
    x=result2['Average temperature (°C)'],
    bins=4,
)

In [None]:
# Preview result2 with its re-computed binned columns.
result2.head()

In [None]:
# Normalised mutual information between binned peak demand and binned
# average temperature within result2.
normalized_mutual_info_score(
    labels_true=result2['Max_Demand_binned'],
    labels_pred=result2['Average temperature (°C)_binned'],
    average_method='min',
)

In [None]:
# Scatter of peak demand against hours of sunshine for the result2 subset,
# followed by the correlation from Series.corr (Pearson by default).
sunshine = result2['Sunshine (hours)']
demand = result2['Max_Demand']
plt.scatter(sunshine, demand)
plt.show()

print("Pearson r is ", sunshine.corr(demand))

In [None]:
# Discretise sunshine hours into 4 intervals within result2.
result2['Sunshine (hours)_binned'] = pd.cut(x=result2['Sunshine (hours)'], bins=4)

In [None]:
# Normalised mutual information between binned peak demand and binned
# sunshine hours within result2.
normalized_mutual_info_score(
    labels_true=result2['Max_Demand_binned'],
    labels_pred=result2['Sunshine (hours)_binned'],
    average_method='min',
)

In [None]:
# Normalised mutual information between the daily price category and binned
# average temperature over all days.
normalized_mutual_info_score(
    labels_true=result['PRICECATEGORY'],
    labels_pred=result['Average temperature (°C)_binned'],
    average_method='min',
)

In [None]:
# Normalised mutual information between the daily price category and binned
# average temperature within result1.
normalized_mutual_info_score(
    labels_true=result1['PRICECATEGORY'],
    labels_pred=result1['Average temperature (°C)_binned'],
    average_method='min',
)

In [None]:
# Normalised mutual information between the daily price category and binned
# average temperature within result2.
normalized_mutual_info_score(
    labels_true=result2['PRICECATEGORY'],
    labels_pred=result2['Average temperature (°C)_binned'],
    average_method='min',
)

In [None]:
# Discretise the daily maximum temperature into 4 intervals over all days.
result['Maximum temperature (°C)_binned'] = pd.cut(
    x=result['Maximum temperature (°C)'],
    bins=4,
)

In [None]:
# Normalised mutual information between the daily price category and binned
# maximum temperature over all days.
normalized_mutual_info_score(
    labels_true=result['PRICECATEGORY'],
    labels_pred=result['Maximum temperature (°C)_binned'],
    average_method='min',
)

In [None]:
# Discretise the daily maximum temperature into 4 intervals within result1.
result1['Maximum temperature (°C)_binned'] = pd.cut(
    x=result1['Maximum temperature (°C)'],
    bins=4,
)

In [None]:
# Normalised mutual information between the daily price category and binned
# maximum temperature within result1.
normalized_mutual_info_score(
    labels_true=result1['PRICECATEGORY'],
    labels_pred=result1['Maximum temperature (°C)_binned'],
    average_method='min',
)

In [None]:
# Discretise the daily maximum temperature into 4 intervals within result2.
result2['Maximum temperature (°C)_binned'] = pd.cut(
    x=result2['Maximum temperature (°C)'],
    bins=4,
)

In [None]:
# Normalised mutual information between the daily price category and binned
# maximum temperature within result2.
normalized_mutual_info_score(
    labels_true=result2['PRICECATEGORY'],
    labels_pred=result2['Maximum temperature (°C)_binned'],
    average_method='min',
)

In [None]:
result['Miniimum temperature (°C)_binned'] = pd.cut(result['Minimum temperature (°C)'], bins = 4)

In [None]:
normalized_mutual_info_score(result['PRICECATEGORY'], result['Maximum temperature (°C)_binned'], average_method='min')

In [None]:
# Discretise the daily minimum temperature into 4 intervals within result1.
result1['Minimum temperature (°C)_binned'] = pd.cut(
    x=result1['Minimum temperature (°C)'],
    bins=4,
)

In [None]:
# Normalised mutual information between the daily price category and binned
# minimum temperature within result1.
normalized_mutual_info_score(
    labels_true=result1['PRICECATEGORY'],
    labels_pred=result1['Minimum temperature (°C)_binned'],
    average_method='min',
)

In [None]:
result2['Miniimum temperature (°C)_binned'] = pd.cut(result2['Minimum temperature (°C)'], bins = 4)

In [None]:
normalized_mutual_info_score(result2['PRICECATEGORY'], result2['Maximum temperature (°C)_binned'], average_method='min')

In [None]:
# Normalised mutual information between the daily price category and binned
# peak demand over all days.
normalized_mutual_info_score(
    labels_true=result['PRICECATEGORY'],
    labels_pred=result['Max_Demand_binned'],
    average_method='min',
)