In [1]:
import numpy as np
import matplotlib.colors as colors
import matplotlib.pyplot as plt
# import seaborn as sns
import pandas as pd

import sklearn
import scipy
import torch

import sys
import os
import pathlib
import itertools
import glob
import re
import datetime
import pickle

from itertools import groupby
from operator import itemgetter
from copy import deepcopy


In [2]:
pt1 = os.path.join("/projects", "emco4286", 'data', "gads", "trajectories", "ct", "long")

def glob_re(pattern, strings):
    return list(filter(re.compile(pattern).match, strings))

filenames = glob_re(r"gen_\d+_class_CT_rating_\d+_state_Texas.csv", os.listdir(pt1))

def consecutive_groups(iterable, ordering=lambda x: x):
    for k, g in groupby(enumerate(iterable), key=lambda x: x[0] - ordering(x[1])):
        yield map(itemgetter(1), g)

states_new = []
features_new = []
lengths_new = []

columns = ['y3', 'ERCOT', 'y4', 'Pcp', 'Tmax', 'Tmin', 'y6', 'y8', 'y7', 'y9']

print("Creating sequences")
for f in filenames:

    id = re.findall(r"gen_(\d+)", f)[0]

    data = pd.read_csv(os.path.join(pt1,f))
    data.set_index(pd.DatetimeIndex(data["x"]), inplace=True)
    data = data[~data.index.duplicated()]
    original_length = len(data)

    for c in columns:
        data = data[~data[c].isna()]

    print(f"{id}: {len(data)}, {100*np.round(len(data)/original_length, 2)}\%")

    if len(data) < 2:
        continue

    start = data.index[0]
    diffs = np.diff((data.index))
    hours = np.cumsum(diffs/np.timedelta64(1, 'h')).astype(int) - 1

    sequences = []
    for g in consecutive_groups(hours):
        sequences.append(list(g))

    for k in sequences:

        dts = pd.DatetimeIndex(start + np.array([datetime.timedelta(hours=int(h)) for h in k]))
        index = data.loc[dts[0]:dts[-1], :].index
        states = data.loc[index, "y2"].values.tolist()
        features = data.loc[index, columns].values
        l = len(k)

        states_new += [states]
        features_new += [features]

        
        lengths_new += [l]



Creating sequences
10551: 88306, 92.0\%
1662: 88813, 92.0\%
12814: 1366, 1.0\%
10650: 86443, 90.0\%
5872: 85283, 88.0\%
7661: 0, 0.0\%
12700: 12464, 13.0\%
2247: 92392, 96.0\%
12697: 17758, 18.0\%
5010: 93642, 97.0\%
5865: 82834, 86.0\%
1663: 88672, 92.0\%
12693: 14948, 16.0\%
11953: 59577, 62.0\%
12289: 40108, 42.0\%
10266: 87666, 91.0\%
12226: 43510, 45.0\%
3365: 94909, 98.0\%
12757: 0, 0.0\%
12701: 17760, 18.0\%
2241: 87805, 91.0\%
12860: 0, 0.0\%
4744: 89809, 93.0\%
5034: 83972, 87.0\%
4962: 82364, 85.0\%
12227: 18307, 19.0\%
1127: 88905, 92.0\%
12824: 0, 0.0\%
10270: 88647, 92.0\%
5867: 88696, 92.0\%
5035: 91291, 95.0\%
12775: 0, 0.0\%
12698: 12487, 13.0\%
10271: 86142, 89.0\%
5871: 88359, 92.0\%
2242: 87804, 91.0\%
10652: 90755, 94.0\%
10083: 79936, 83.0\%
12702: 17758, 18.0\%
11387: 86507, 90.0\%
10161: 73064, 76.0\%
12504: 35776, 37.0\%
12116: 48675, 50.0\%
5008: 89109, 92.0\%
12565: 31393, 33.0\%
12694: 10101, 10.0\%
12836: 0, 0.0\%
11132: 94044, 98.0\%
3367: 88389, 92.0\%
336

: 

: 

In [4]:

states = np.concatenate(states_new).astype(int)
states -= 1
features = np.vstack(features_new)
lengths = np.array(lengths_new)

print("Created sequences \n")


: 

: 

In [10]:
pt2 = os.path.join("/projects", "emco4286", 'data', "gads", "trajectories", "ct", "preloaded")
np.save(os.path.join(pt2, "states.npy"), states)
np.save(os.path.join(pt2, "features.npy"), features)
np.save(os.path.join(pt1, "preloaded", "lengths.npy"), lengths)