This notebook details the missing data in the microgrid observations.


Methods:

- what percentage of the sampling range covered for each village by timestamp data?
- what percentage is covered when we include on/off messages?

TODO:

- 2017-07-13 has original work in outline
- 2017-07-04 also has related work

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import WP19_analysis as wpa

In [3]:
# Data samples present in observations

data = {}
for rfd in wpa.raw_file_data[2:]:
    energy = wpa.load_timeseries_file(rfd['village_name'] + '-clean.csv')
    start_date = energy.index[0]
    end_date = energy.index[-1]
    duration = (end_date - start_date)/np.timedelta64(1,'D')
    expected_samples = duration * 24 * 60
    num_samples = len(energy)
    coverage = num_samples / expected_samples
    #print(duration)
    data[rfd['village_name']] = {'start_date':energy.index[0],
                                 'end_date':energy.index[-1],
                                 'duration':duration,
                                 'expected_samples':expected_samples,
                                 'num_samples':num_samples,
                                 'coverage':coverage}
pd.DataFrame(data).T

Unnamed: 0,coverage,duration,end_date,expected_samples,num_samples,start_date
atamali,0.190402,124.065,2015-08-26 19:20:00,178654,34016,2015-04-24 17:46:00
ayapo,0.232503,127.433,2015-08-27 22:17:00,183503,42665,2015-04-22 11:54:00
kensio,0.0893449,102.194,2015-08-21 22:57:00,147160,13148,2015-05-11 18:17:00


In [4]:
# Data samples present including reported on/off events

dd = {}
for rfd in wpa.raw_file_data[2:]:
    vname = rfd['village_name']
    energy_data = wpa.load_timeseries_file(vname + '-clean.csv')
    messages = wpa.load_message_file(vname + '-messages.csv')
    start_date, end_date = wpa.get_start_time(energy_data), wpa.get_end_time(energy_data)
    duration = (end_date - start_date) / np.timedelta64(1,'D')
    dd[vname] = {'start_date':start_date,
                 'end_date':end_date,
                 'duration':duration,
                'percent ts': wpa.create_uptime_boolean_timestamp(energy_data).mean(),
                'percent gaps': wpa.create_downtime_boolean_message(energy_data, messages).mean(),
                    'coverage':wpa.valid_coverage_percentage(energy_data, messages)}

stats = pd.DataFrame(dd).T[['duration',
                            'percent ts',
                            'percent gaps',
                            'coverage']]
stats

Unnamed: 0,duration,percent ts,percent gaps,coverage
atamali,124.065,0.1904,0.67276,0.863161
ayapo,127.433,0.232502,0.672051,0.904552
kensio,102.194,0.0893443,0.837226,0.92657


In [5]:
import tabulate
print(tabulate.tabulate(stats, tablefmt='pipe', 
      headers=['Village', 'Duration (days)', 'Percent data', 'Percent known downtime', 'Total Coverage']))

| Village   |   Duration (days) |   Percent data |   Percent known downtime |   Total Coverage |
|:----------|------------------:|---------------:|-------------------------:|-----------------:|
| atamali   |           124.065 |      0.1904    |                 0.67276  |         0.863161 |
| ayapo     |           127.433 |      0.232502  |                 0.672051 |         0.904552 |
| kensio    |           102.194 |      0.0893443 |                 0.837226 |         0.92657  |


In [6]:
with open('../tables/data_coverage.md', 'w') as f:
    f.write(tabulate.tabulate(stats, tablefmt='pipe', 
      headers=['Village', 'Start Date', 'End Date', 'Duration', 'Coverage']))