In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

In [3]:
try:
    raw_solar = pd.read_csv('energy_dataset.csv')
    raw_weather = pd.read_csv('weather_features.csv')
except FileNotFoundError as e:
    print(e)
    print('Attempting to read datasets from online csv files on Github')
    raw_solar = pd.read_csv('https://github.com/cobyoram/Thinkful_capstone_1/blob/master/energy_dataset.csv')
    raw_weather = pd.read_csv('https://github.com/cobyoram/Thinkful_capstone_1/blob/master/weather_features.csv')

In [4]:
# First, to clean up the 'time' columns, which are time str. We want them to be pd datetime objects for nice graphing

# The original data came with time-zone aware data. We don't need this for now, so we will get rid of those parts.
def strip_deltas(array):
    for i in array:
        try:
            end = i.index('+')
        except ValueError as e:
            print(e, 'no + in array')
            end = len(array)
        array.replace(i, i[0:end], inplace=True) # Removes everything past + for data that looks like YYYY:MM:DD hh-mm-ss+hh:mm



# Remove the timezone deltas from the time strings (This part takes a while)
# strip_deltas(raw_solar['time'])
# strip_deltas(raw_weather['dt_iso'])

# Turn those columns into datetime values with date and time
raw_solar['time'] = pd.to_datetime(raw_solar['time'])
raw_weather['time'] = pd.to_datetime(raw_weather['dt_iso'])

# Create date columns using dt.date for day-wise aggregation later
raw_solar['date'] = raw_solar['time'].dt.date
raw_weather['date'] = raw_weather['time'].dt.date

ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True

In [0]:
raw_solar.head()

In [0]:
raw_weather.tail(50)

In [0]:
raw_weather[raw_weather['clouds_all'] == 100]

In [0]:
# Discover all weather descriptive values
print(raw_weather['weather_main'].value_counts())
print(raw_weather['weather_description'].value_counts())

In [0]:
print('not clear', raw_weather.loc[raw_weather['weather_main'] != 'clear']['time'].count())
print('clear', raw_weather.loc[raw_weather['weather_main'] == 'clear']['time'].count())

g = sns.FacetGrid(data=raw_weather, col='weather_main')
g.map(plt.hist, 'clouds_all')

In [0]:
# Let's try using these
less_clouds = raw_weather.loc[raw_weather['clouds_all'] <= raw_weather['clouds_all'].median()]
more_clouds = raw_weather.loc[raw_weather['clouds_all'] > raw_weather['clouds_all'].median()]
print('>median', less_clouds['time'].count())
print('<median', more_clouds['time'].count())

plt.hist(less_clouds['clouds_all'], alpha=.5)
plt.hist(more_clouds['clouds_all'], alpha=.5, bins=40)
plt.show()

In [0]:
# Let's try to organize the data by date, then location, and only look at the clouds_all value
cities = raw_weather['city_name'].unique()
city_pivot_less = pd.pivot_table(less_clouds, columns='city_name', values='clouds_all', index='time')
city_pivot_less = city_pivot_less.reset_index()

In [0]:
city_pivot_less['cloud_coverage'] = city_pivot_less[cities].aggregate('mean', axis=1)
plt.hist(city_pivot_less['cloud_coverage'])

In [0]:
city_pivot_more = pd.pivot_table(more_clouds, columns='city_name', values='clouds_all', index='time')
city_pivot_more = city_pivot_more.reset_index()
city_pivot_more['cloud_coverage'] = city_pivot_more[cities].aggregate('mean', axis=1)
plt.hist(city_pivot_more['cloud_coverage'])

In [0]:
city_pivot_more.describe()

In [0]:
clouds_cities = raw_weather.loc[:,['time', 'city_name', 'clouds_all']]
clouds_cities = clouds_cities.pivot_table(index='time', columns='city_name', values='clouds_all')
clouds_cities = clouds_cities.reset_index()
clouds_cities['mean_coverage'] = clouds_cities[cities].aggregate('mean', axis=1)
plt.hist(clouds_cities['mean_coverage'])
plt.show()
clouds_cities.head()

In [0]:
median = clouds_cities['mean_coverage'].median()
print(median)

clouds_cities['category'] = 'less coverage'
clouds_cities['category'] = clouds_cities['category'].where(clouds_cities['mean_coverage'] <= median, 'more coverage')
clouds_cities

In [0]:
time_solar = pd.DataFrame(raw_solar.loc[:,['time','generation solar']])
time_clouds = pd.DataFrame(clouds_cities.loc[:,['time','category']])

In [0]:
time_solar

In [0]:
time_clouds

In [0]:
solar_clouds = time_solar.set_index('time').join(time_clouds.set_index('time'))
solar_clouds

In [0]:
sns.lineplot(x=solar_clouds.index, y='generation solar', hue='category', data=solar_clouds)
plt.xlim(pd.to_datetime('2015-01-01'), None)

In [0]:
plt.hist(solar_clouds[solar_clouds['category'] == 'less coverage']['generation solar'], alpha=.5)
plt.hist(solar_clouds[solar_clouds['category'] == 'more coverage']['generation solar'], alpha=.5)
plt.show()

In [0]:
mwu = stats.mannwhitneyu(solar_clouds[solar_clouds['category'] == 'less coverage']['generation solar'], solar_clouds[solar_clouds['category'] == 'more coverage']['generation solar'])
mwu

In [0]:
kru = stats.kruskal(solar_clouds[solar_clouds['category'] == 'less coverage']['generation solar'], solar_clouds[solar_clouds['category'] == 'more coverage']['generation solar'], nan_policy='omit')
kru

In [0]:
less_cov = solar_clouds[solar_clouds['category'] == 'less coverage']['generation solar'].median()
print('less coverage', less_cov)
more_cov = solar_clouds[solar_clouds['category'] == 'more coverage']['generation solar'].median()
print('more coverage', more_cov)
ratio = more_cov / less_cov
print('ratio', ratio)
print('percent drop with higher coverage: {}%'.format((1 - ratio)*100))

In [0]:
sns.lineplot(y='generation solar', x='time', data=solar_clouds.loc[solar_clouds.index < pd.to_datetime('2015-01-02')].reset_index())

In [0]:
daytime_solar_clouds = solar_clouds.loc[solar_clouds['generation solar'] > 500]

In [0]:
plt.hist(daytime_solar_clouds['generation solar'])

In [0]:
sns.scatterplot(x=daytime_solar_clouds.index, y='generation solar', hue='category', data=daytime_solar_clouds)
plt.xlim(pd.to_datetime('2015-01-01'), None)

In [0]:
stats.mannwhitneyu(daytime_solar_clouds[daytime_solar_clouds['category'] == 'less coverage']['generation solar'], daytime_solar_clouds[daytime_solar_clouds['category'] == 'more coverage']['generation solar'])