In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from utils import *

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Chinese column headers of the raw data; the first entry ('采集时间', collection time) is the timestamp column.
keys = ['采集时间', '水温', 'pH', '溶解氧', '电导率', '浊度', '高锰酸盐指数','氨氮', '总磷', '总氮']
# English display names; plot() looks them up by feature position (en_keys[i]) for titles and legends.
# NOTE(review): this list has 13 entries while `keys` has 9 non-time columns — confirm the extra names are intended.
en_keys = ['WaterTemperature', 'PH' ,'dissolved oxygen', 'Conductivity','Turbidity','PermanganateIndex',
        'AmmoniaNitrogen','TP','TN', 'humidity','room temperature','chlorophyll','Algae density']
# limits = [(5, 30), (5.0, 9), (1, 15), (50, 500), (0, 1500), (0, 15), (0, 0.5), (0, 0.3), (0, 5)]

# Print numpy floats in scientific notation with two decimals.
np.set_printoptions(formatter = {'float': '{:.2e}'.format})


In [None]:
import torch
from trainer import *
from models import *

def test(ckpt_path, model, data, lPre=42, lGet=84, step=3):
    ckpt = torch.load(ckpt_path)
#     ckpt = torch.load(ckpt_path, map_location='cuda:0')

#     model.load_state_dict(ckpt, strict=False)
    model.load_state_dict(ckpt['state_dict'])
    model.freeze()
    
    l, f = data.shape
    data = np.transpose(data)
    pre = np.zeros((l-lGet, f))
    data = torch.from_numpy(data).to(torch.float32)
    data = data[None, ...]
    print(data.shape)
    max_idx = l - step - lGet
    
    i = 0
    while i <= max_idx:
        x = data[:, :, i: i + lGet]
        y = model(x)
        y = y.numpy().squeeze()
        pre[i : i + step] = y[:, :step].transpose()
        i += step
        
    return pre

# Hyper-parameters forwarded verbatim to SCIModule.
kwargs = dict(
    features=9,
    lPre=42,
    lGet=84,
    Tree_levels=2,
    hidden_size_rate=4,
    loss=F.l1_loss,
    lr=9e-3,
    descaler=None,
)

model = SCIModule(**kwargs)
# Alternative kept for reference: model = SCINet(9, 42, 84, 2, 4)

# Inference only, so gradient tracking is switched off around the call.
with torch.no_grad():
    pre = test(
        './lightning_logs/7d/checkpoints/last.ckpt',
        model,
        df.values,
        lPre=42,
        lGet=84,
        step=3,
    )

In [None]:
# Window lengths: lGet past steps are fed in, lPre steps are predicted.
lPre, lGet = 42, 84
# NOTE(review): `data` is not defined in any visible cell — presumably the array passed to test(); confirm.
features = data.shape[1]
# Figure width and per-subplot height; plot() reads both as globals.
l, h = 25, 3

# Only rows from `window` onward are handed to plot().
window = 8000
# NOTE(review): `x` is not defined in any visible cell — presumably the time axis matching `data`; confirm.
index = x[window:]
real = data[window:, :]
prediction = pre[window:, :]

def plot(x, data, pre, area=9000, lGet=84):
    '''
    Draw one subplot per feature comparing observed values with predictions.

    Parameters:
        x: Sequence of x-axis positions, one per row of ``data``.
        data: 2-D array (rows, features) of observed values, drawn in black.
        pre: 2-D array (rows, features) of predictions. Its first ``area`` rows are
            drawn in red over ``x[lGet:area+lGet]``, the rest in blue over ``x[area+lGet:]``.
        area: Row index in ``pre`` at which the colour switches from red to blue.
        lGet: Offset between ``x`` and ``pre``: prediction row 0 is drawn at ``x[lGet]``.

    Uses the notebook globals ``l`` and ``h`` (figure width, per-row height) and
    ``en_keys`` (feature display names).
    '''
    # Take the feature count from the array actually being plotted rather than
    # from a notebook-level global that may describe a different array.
    n_features = data.shape[1]
    # squeeze=False keeps the axes array 2-D, so a single feature still works.
    fig, axes = plt.subplots(n_features, 1, figsize=(l, h * n_features),
                             constrained_layout=True, squeeze=False)

    for i, ax in enumerate(axes[:, 0]):
        name = en_keys[i] if i < len(en_keys) else f'feature {i}'
        ax.plot(x[:], data[:, i], '-k', linewidth=3, label=name)
        ax.plot(x[lGet:area + lGet], pre[:area, i], '-r', linewidth=0.8,
                label=f'prediction (first {area} rows)')
        ax.plot(x[area + lGet:], pre[area:, i], '-b', linewidth=0.8,
                label='prediction (remaining rows)')

        ax.set_title(name, fontsize=20)
        ax.set_xlabel('', fontsize=15)
        ax.set_ylabel('', fontsize=15)

        # Every line carries its own label so the legend covers all three curves.
        ax.legend(fontsize=15)
    
# The arrays passed in start at row `window`, so the colour switch at row 9000 becomes 9000-window.
plot(index, real, prediction, 9000-window)

In [2]:
# Load one station export through fujiang_factory (not defined in this notebook; presumably from utils).
# NOTE(review): the trailing 3, 5, 3 match the r, limit, step values used in the MinTuoJiang cell — confirm they mean the same here.
df = fujiang_factory('./original_data/05-涪江/涪江水质断面水质-小时尺度/元坝子审核数据查询表.xls', 3, 5, 3)

In [4]:
def _gen_data(df, lGet, lPre, save_path=''):
    '''
    Parameters:
        df: The DataFrame came from data factory.
        lGet: How long old data you need.
        lPre: How long new data you predicted.
        save_path: Where to save the .npz file.
    '''
    step = lGet + lPre
    data = []
    for i in range(df.shape[0]-step):
        vals = df.iloc[i: i+step].values
        if (vals != np.nan).all():
            data.append(vals)
    data = np.stack(data, axis=0)
    if save_path:
        np.save(save_path, data)
    return np.stack(data, axis=0)
# data = gen_data(df, 12, 6, '')

In [8]:
def dataHandler(path, lGet, lPre, save_path, func, *args):
    '''
    Convert every raw file in a directory into a windowed array plus a summary CSV.

    Parameters:
        path: Directory holding the raw files.
        lGet: Input window length, forwarded to ``_gen_data``.
        lPre: Prediction window length, forwarded to ``_gen_data``.
        save_path: String prefix prepended to each file stem to build output names
            (include a trailing slash to write into a directory).
        func: Loader called as ``func(file, *args)``; must return a DataFrame.
        *args: Extra positional arguments for ``func``.

    For each file, the windowed array is saved under ``{save_path}{stem}`` and
    ``df.describe()`` is written to ``{save_path}{stem}_describe.csv``.
    '''
    # sorted() gives a stable processing order across platforms.
    for file in sorted(Path(path).iterdir()):
        # Sub-directories and hidden entries (e.g. .DS_Store) are not data files.
        if not file.is_file() or file.name.startswith('.'):
            continue
        print(file.stem)
        save_file_name = f'{save_path}{file.stem}'
        describe_save_name = f'{save_path}{file.stem}_describe.csv'
        # Make sure the target directory exists before anything is written.
        Path(save_file_name).parent.mkdir(parents=True, exist_ok=True)
        df = func(file, *args)
        _gen_data(df, lGet, lPre, save_file_name)
        df.describe().to_csv(describe_save_name)
dataHandler('./original_data/泸沽湖邛海鲁班水库水质数据/原始查询/', 12, 6, './all_data/luguhu_1d/', fujiang_factory, 3, 5, 3)

原始查询（礼板湾(王妃岛)-泸沽湖）
原始查询（邛海湖心-邛海）
原始查询（鲁班岛-鲁班水库）
原始查询（泸沽湖湖心-泸沽湖）


In [None]:
# NOTE(review): unfinished draft. `pd.read_ex` is a truncated attribute access and the
# final `for` loop has no body, so this cell does not parse. The top-level
# "Data Handler for MinTuoJiang data" cell appears to be the working version of
# the same idea — confirm, then complete or delete this cell.
# NOTE(review): `re` is used here but only imported in that later cell.
def mtj_datahandler(path, lGet, lPre, save_path):
    p = Path(path)
    # First Merge data
    # Group the per-file frames by the leading name matched from the file stem.
    d = {}
    for file in p.iterdir():
        df = pd.read_ex
        name = re.match('\d*?\D+', file.stem)[0]
        if name in d.keys():
            d[name].append(df)
        else:
            d[name] = [df]     
    
    for name in d.keys():
        

In [138]:
# !!! Do Not remove !!!
# Data Handler for MinTuoJiang data
import re
from tqdm import tqdm
lPre, lGet = 6, 12
r, limit, step = 3, 5, 3
save_path = './all_data/mtj_1d/'
ptj_keys = ['监测时间', '水温(℃)', 'pH值(无量纲)', '溶解氧(mg/L)', '电导率(uS/cm)', '浊度(NTU)','高锰酸盐指数(mg/L)',
           '氨氮(mg/L)', '总磷(mg/L)', '总氮(mg/L)']
p = Path('./original_data/08-岷沱江数据/岷沱江水质监测数据/')

# Group the export files by the leading name matched from each file stem, so that
# several exports sharing that name are merged into one frame.
d = {}
for file in p.iterdir():
    # Raw string: '\d' / '\D' in a normal literal are invalid escape sequences.
    name = re.match(r'\d*?\D+', file.stem)[0]
    d.setdefault(name, []).append(file)

for k in tqdm(d.keys()):
    dfs = []
    for file in d[k]:
        df = pd.read_excel(file, header=1, usecols=ptj_keys, index_col=0, dtype=str)
        # Discard the first row after the header (selected by its index label).
        dfs.append(df.drop(df.index[0], axis=0))
    df = pd.concat(dfs)
    df = df.sort_index()
    df.index=pd.to_datetime(df.index, format='%Y-%m-%d %H:%M:%S')
    for f in df.keys():
        # Keep the leading number of every cell and discard anything after it.
        # The fractional part is optional so integer-formatted readings such as
        # "20" are kept instead of silently becoming NaN.
        df[f] = df[f].str.extract(r'^(\d+(?:\.\d+)?)', expand=False)
    df = df.astype('float64')
    df = df.resample('4H').mean()
    # A row whose every column is exactly zero is treated as missing.
    df.loc[(df==0).all(axis=1)] = np.nan
    df = remove_outliers(df, standard_deviation, 25)
    df[df < 0] = np.nan
    df = patch_up(df, r, limit)
    df = smooth(df, step)

    save_file_name = f'{save_path}{k}'
    describe_save_name = f'{save_file_name}_describe.csv'
    _gen_data(df, lGet, lPre, save_file_name)
    df.describe().to_csv(describe_save_name)

100%|██████████████████████████████████████████████████████████████████| 50/50 [01:18<00:00,  1.57s/it]


In [134]:
# Show the first 20 rows of `df`.
df.head(20)

Unnamed: 0_level_0,水温(℃),pH值(无量纲),溶解氧(mg/L),电导率(uS/cm),浊度(NTU),高锰酸盐指数(mg/L),氨氮(mg/L),总磷(mg/L),总氮(mg/L)
监测时间,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2004-12-06 00:00:00,13.28988,7.695774,6.453959,0.384313,23.385815,3.523625,0.3625,0.505371,0.13
2004-12-06 04:00:00,13.2511,7.697159,6.485958,0.385192,22.900705,3.523625,0.398214,0.505371,0.13
2004-12-06 08:00:00,13.216263,7.702435,6.566043,0.386658,22.299562,3.523625,0.403311,0.505371,0.13
2004-12-06 12:00:00,13.178106,7.71496,6.667822,0.38765,21.590485,3.523625,0.413986,0.505371,0.13
2004-12-06 16:00:00,13.236998,7.731252,6.70999,0.389108,21.759259,3.523625,0.42743,0.505371,0.13
2004-12-06 20:00:00,13.134915,7.756926,7.144196,0.301179,78.550143,3.523625,0.441897,0.505371,0.13
2004-12-07 00:00:00,12.858599,7.696627,7.598445,0.214691,143.46324,3.523625,0.448563,0.505371,0.13
2004-12-07 04:00:00,12.538889,7.621463,7.933952,0.155438,187.845251,3.523625,0.437155,0.505371,0.13
2004-12-07 08:00:00,12.131225,7.581894,8.215583,0.113614,219.036874,3.523625,0.424409,0.505371,0.13
2004-12-07 12:00:00,11.974672,7.535401,8.374847,5.050923,241.336454,3.523625,0.423241,0.505371,0.13


In [132]:
# Inspect a single raw export: read every column as text, then convert to float.
p = './original_data/08-岷沱江数据/岷沱江水质监测数据/宏缘2019-01-02 00_00_00-2022-10-24 23_59_59数据列表.xls'
df = pd.read_excel(p, header=1, index_col=0, usecols=ptj_keys, dtype=str)
# Discard the first row after the header (selected by its index label).
df = df.drop(df.index[0], axis=0)
try:
    df = df.astype('float64')
except ValueError:
    # Some cells are not plain numbers: keep only the leading number of each cell.
    # Raw string avoids invalid escape sequences; the fractional part is optional
    # so integer-formatted readings are not turned into NaN.
    for k in df.keys():
        df[k] = df[k].str.extract(r'^(\d+(?:\.\d+)?)', expand=False)
    df = df.astype('float64')
df
# df.keys()

Unnamed: 0_level_0,水温(℃),pH值(无量纲),溶解氧(mg/L),电导率(uS/cm),浊度(NTU),高锰酸盐指数(mg/L),氨氮(mg/L),总磷(mg/L),总氮(mg/L)
监测时间,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-10-24 14:00:00,20.46,8.199423,8.650218,494.8470,16.41875,,,,
2022-10-24 13:00:00,20.38,8.172537,8.498997,492.9435,14.14960,,,,
2022-10-24 12:00:00,20.34,8.152936,8.315359,488.7164,15.74067,0.692517,0.0285,0.081523,2.11597
2022-10-24 11:00:00,20.13,8.133830,8.205398,491.9583,13.69998,,,,
2022-10-24 10:00:00,20.10,8.121502,8.093015,490.7560,15.19851,,,,
...,...,...,...,...,...,...,...,...,...
2019-01-02 04:00:00,7.70,8.910000,12.410000,472.8000,17.10000,1.660000,0.5800,,3.23000
2019-01-02 03:00:00,7.60,8.950000,12.440000,473.6000,17.50000,,,,
2019-01-02 02:00:00,7.70,8.960000,12.510000,474.1000,18.20000,,,,
2019-01-02 01:00:00,7.70,9.000000,12.530000,474.4000,22.10000,,,,


In [149]:
# Remove the stray 'fujiang_1d' marker from the names of the generated files.
p = Path('./all_data/luguhu_1d/')
# Snapshot the listing first: renaming entries while iterdir() is still being
# consumed can make the iterator skip or revisit files.
for f in list(p.iterdir()):
    name = f.name.replace('fujiang_1d', '')
    # Skip files that do not carry the marker, and never rename to an empty name.
    if not name or name == f.name:
        continue
    f.rename(f.with_name(name))