In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from utils import *
from datetime import datetime
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Column names as they appear in the source spreadsheets (first entry is the timestamp column).
keys = [
    '采集时间',      # collection time
    '水温',          # water temperature
    'pH',
    '溶解氧',        # dissolved oxygen
    '电导率',        # conductivity
    '浊度',          # turbidity
    '高锰酸盐指数',  # permanganate index
    '氨氮',          # ammonia nitrogen
    '总磷',          # total phosphorus
    '总氮',          # total nitrogen
]

# English labels used as plot titles; the first nine line up with keys[1:].
en_keys = [
    'WaterTemperature',
    'PH',
    'dissolved oxygen',
    'Conductivity',
    'Turbidity',
    'PermanganateIndex',
    'AmmoniaNitrogen',
    'TP',
    'TN',
    'humidity',
    'room temperature',
    'chlorophyll',
    'Algae density',
]
# limits = [(5, 30), (5.0, 9), (1, 15), (50, 500), (0, 1500), (0, 15), (0, 0.5), (0, 0.3), (0, 5)]

# Print numpy floats with two decimals everywhere in this notebook.
np.set_printoptions(formatter={'float': '{:.2f}'.format})


In [15]:
# Scratch check: load one station file and show the rows from three days before
# 2021-01-01 up to 2021-01-01.
# NOTE(review): `timedelta` and `data_factory` are not imported by name in the cells
# visible here — presumably they arrive through one of the star imports; confirm.
a = datetime.strptime('2021-01-01', '%Y-%m-%d') - timedelta(days=3)
# The trailing positional arguments (3, 5, 3) match the `r, limit, step = 3, 5, 3`
# values used by the MinTuoJiang cell — TODO confirm against data_factory's signature.
df = data_factory('./original_data/05-涪江/涪江水质断面水质-小时尺度/两河审核数据查询表.xls',
                  3, 5, 3)
# Label-based slice on the index; `.values` dumps the whole window as a raw array.
df.loc[str(a): '2021-01-01'].values

array([[1.05e+01, 8.19e+00, 9.92e+00, 4.88e+02, 2.65e+00, 2.91e+00,
        6.33e-02, 3.68e-02, 1.46e+00],
       [1.05e+01, 8.19e+00, 9.93e+00, 4.86e+02, 2.57e+00, 2.91e+00,
        6.24e-02, 3.61e-02, 1.45e+00],
       [1.05e+01, 8.20e+00, 9.95e+00, 4.84e+02, 2.57e+00, 2.90e+00,
        6.26e-02, 3.56e-02, 1.45e+00],
       [1.05e+01, 8.21e+00, 1.00e+01, 4.83e+02, 2.56e+00, 2.90e+00,
        6.19e-02, 3.54e-02, 1.45e+00],
       [1.05e+01, 8.22e+00, 1.01e+01, 4.82e+02, 2.60e+00, 2.95e+00,
        6.12e-02, 3.51e-02, 1.44e+00],
       [1.05e+01, 8.23e+00, 1.01e+01, 4.81e+02, 2.62e+00, 2.94e+00,
        6.04e-02, 3.48e-02, 1.45e+00],
       [1.05e+01, 8.23e+00, 1.01e+01, 4.81e+02, 2.54e+00, 2.96e+00,
        6.01e-02, 3.46e-02, 1.44e+00],
       [1.05e+01, 8.23e+00, 1.00e+01, 4.80e+02, 2.51e+00, 3.00e+00,
        6.03e-02, 3.45e-02, 1.43e+00],
       [1.04e+01, 8.22e+00, 9.98e+00, 4.81e+02, 2.60e+00, 2.94e+00,
        6.02e-02, 3.43e-02, 1.45e+00],
       [1.04e+01, 8.22e+00, 9.98e+00,

In [43]:
import torch
from trainer import *
from models import *


def preHandler(ckpt_path, df_path, pre_start_date, model_kwargs, describe_df_path, read_method, *args):
    '''
    Build the model input window that precedes ``pre_start_date`` and return the
    de-normalised forecast produced by ``prediction``.

    Make sure the excel file has no NaN value inside the input window.

    Parameters
    ----------
    ckpt_path : checkpoint path, forwarded to ``prediction``.
    df_path : path of the data file, forwarded to ``read_method``.
    pre_start_date : str in ``'%Y-%m-%d'`` format.
    model_kwargs : dict of keyword arguments for ``SCIModule``; the ``'lGet'`` and
        ``'lPre'`` entries are also read here.
    describe_df_path : CSV whose first column is the index and which has ``'mean'``
        and ``'std'`` rows, one column per feature.
    read_method : callable invoked as ``read_method(df_path, *args)``; must return a
        DataFrame that can be sliced by date strings through ``.loc``.
    *args : extra positional arguments for ``read_method``.

    Returns
    -------
    The forecast rescaled with the stored mean/std, or ``None`` (after printing a
    message) when the window contains NaN or does not have ``model_kwargs['lGet']`` rows.
    '''
    # Imported locally so the function does not depend on a star import to supply the name.
    from datetime import timedelta

    df = read_method(df_path, *args)
    describe = pd.read_csv(describe_df_path, index_col=0)
    pre_start_date = datetime.strptime(pre_start_date, '%Y-%m-%d')
    # The input window spans from 4 days to 1 day before the requested start date.
    end_date = str(pre_start_date - timedelta(days=1))
    start_date = str(pre_start_date - timedelta(days=4))
    # Label slicing is end-inclusive; the final row of the slice is dropped
    # (presumably the sample sitting exactly on end_date — TODO confirm).
    vals = df.loc[start_date:end_date].values[:-1]
    if np.isnan(vals).any() or vals.shape[0] != model_kwargs['lGet']:
        print('Attention!')
        print(f'The excel file has not enough data during {start_date} and {end_date}.')
        print('Please try another date or change the excel file')
        return

    # Take the feature count from the data instead of hard-coding 9.
    n_features = vals.shape[1]
    means = describe.loc['mean'].values.reshape(1, 1, n_features)
    stds = describe.loc['std'].values.reshape(1, 1, n_features)
    descaler = lambda x: x * stds + means
    scaler = lambda x: (x - means) / stds

    model = SCIModule(**model_kwargs)
    vals = vals.reshape(1, -1, n_features)
    vals = scaler(vals)
    pre = prediction(ckpt_path, model, vals, model_kwargs['lPre'], model_kwargs['lGet'])
    # The transpose puts features on the last axis so they broadcast against means/stds.
    pre = descaler(pre.transpose())
    return pre


def prediction(ckpt_path, model, data, lPre=42, lGet=84):
    '''
    Load weights from a checkpoint into ``model`` and run one forward pass on ``data``.

    Parameters
    ----------
    ckpt_path : path of a checkpoint readable by ``torch.load`` that holds a
        ``'state_dict'`` entry.
    model : module exposing ``load_state_dict`` and ``freeze`` (as a Lightning module does).
    data : numpy array whose size is a multiple of ``lGet``.
    lPre : unused; kept so existing callers that pass it keep working.
    lGet : length of the last axis of the tensor fed to the model.

    Returns
    -------
    The model output as a numpy array with singleton dimensions squeezed out.
    '''
    # The input tensor is created on the CPU below, so map the checkpoint there as well;
    # without this a checkpoint saved from a GPU fails to load on a machine without CUDA.
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt['state_dict'])
    model.freeze()

    # -1 lets the feature count follow the data instead of being fixed at 9.
    # NOTE(review): reshape re-interprets the buffer, it does not swap axes. If the caller
    # passes (1, lGet, features) and the model expects (1, features, lGet), a transpose is
    # needed here instead — confirm against the layout used during training.
    data = data.reshape(1, -1, lGet)
    x = torch.from_numpy(data).to(torch.float32)
    with torch.no_grad():
        y = model(x)
    y = y.detach().cpu().numpy().squeeze()
    return y

# Keyword arguments unpacked into SCIModule(**kwargs) by preHandler, which also reads
# the 'lGet' and 'lPre' entries directly.
kwargs = dict(
    features=9,
    lPre=6,
    lGet=18,
    Tree_levels=2,
    hidden_size_rate=6,
    loss=F.l1_loss,
    lr=1e-3,
    descaler=None,
)
    
# Forecast for 2021-04-12 using the `last.ckpt` checkpoint and the mean/std stored in
# all_describe.csv. The trailing `data_factory, 3, 5, 3` are the reader and its extra
# positional arguments, which preHandler forwards as read_method(df_path, *args).
# `pre` is None when preHandler rejects the input window.
pre = preHandler('./lightning_logs/fujiang_all/checkpoints/last.ckpt', 
                './original_data/05-涪江/涪江水质断面水质-小时尺度/两河审核数据查询表.xls',
                '2021-04-12', kwargs, 
                './all_data/fujiang_1d/all_describe.csv', data_factory, 3, 5, 3)
pre

array([[[12.34, 7.85, 6.70, 313.85, -0.33, 1.05, 0.01, 0.02, 0.47],
        [13.52, 7.91, 6.99, 327.65, 1.50, 1.11, 0.03, 0.02, 0.55],
        [15.29, 7.97, 7.57, 354.33, 5.67, 1.36, 0.05, 0.04, 0.82],
        [17.71, 8.02, 8.33, 388.79, 11.36, 1.83, 0.09, 0.05, 1.28],
        [20.23, 8.08, 9.01, 418.58, 16.48, 2.35, 0.13, 0.07, 1.78],
        [22.27, 8.14, 9.47, 439.33, 19.82, 2.76, 0.15, 0.08, 2.15]]])

In [None]:
# NOTE(review): `x`, `data` and `pre` are not defined by this cell; they must already
# exist in the kernel with matching shapes — confirm which cell is meant to supply them.
lPre, lGet = 42, 84
features = data.shape[1]
l, h = 25, 3

window = 8000
index = x[window:]
real = data[window:, :]
# Bound to `predicted`, not `prediction`: rebinding `prediction` here would replace the
# prediction() function defined in an earlier cell for the rest of the kernel session.
predicted = pre[window:, :]


def plot(x, data, pre, area=9000, lGet=84, width=25, row_height=3):
    '''
    Draw one subplot per column of ``data`` with the observed series and the forecast.

    Parameters
    ----------
    x : values for the horizontal axis, indexable by slices.
    data : 2-D array, one column per feature; drawn in black over the full ``x``.
    pre : 2-D array of forecasts, one column per feature.
    area : row of ``pre`` at which the line colour switches from red to blue.
    lGet : horizontal offset applied to ``pre`` relative to ``x``.
    width : figure width.
    row_height : figure height per subplot.

    Subplot titles and legend entries are taken from the module-level ``en_keys``.
    '''
    n_features = data.shape[1]
    # squeeze=False keeps the axes array 2-D, so indexing also works for a single feature.
    fig, axis = plt.subplots(n_features, 1, figsize=(width, row_height * n_features),
                             constrained_layout=True, squeeze=False)

    for i in range(n_features):
        ax = axis[i, 0]
        name = en_keys[i]
        ax.plot(x[:], data[:, i], '-k', linewidth=3)
        # The forecast is shifted right by lGet and split at `area` into two colours.
        ax.plot(x[lGet:area+lGet], pre[:area, i], '-r', linewidth=0.8)
        ax.plot(x[area+lGet:], pre[area:, i], '-b', linewidth=0.8)

        ax.set_title(name, fontsize=20)
        ax.set_xlabel('', fontsize=15)
        ax.set_ylabel('', fontsize=15)

        ax.legend([name], fontsize=15)


plot(index, real, predicted, 9000-window, width=l, row_height=h)

In [32]:
# !!! Do Not remove !!!
# Data Handler for BianJie, FuJiang, luguhu.
# One call per raw-data directory. Positional arguments as written: source directory,
# two integers (30, 6), output directory, the reader `data_factory`, then 3, 5, 3.
# NOTE(review): the meaning of 30 and 6 is not visible here (other cells use
# lGet/lPre values of 18 and 6) — confirm against dataHandler's signature.
dataHandler('./original_data/04-四川省边界断面/四川边界断面水质数据/', 30, 6, './all_data/bianjie_1d/', data_factory, 3, 5, 3)
dataHandler('./original_data/泸沽湖邛海鲁班水库水质数据/原始查询/', 30, 6, './all_data/luguhu_1d/', data_factory, 3, 5, 3)
# NOTE(review): this output path ends in '//', unlike the two calls above.
dataHandler('./original_data/05-涪江/涪江水质断面水质-小时尺度/', 30, 6, './all_data/fujiang_1d//', data_factory, 3, 5, 3)

19it [00:17,  1.06it/s]
4it [00:04,  1.15s/it]
15it [00:11,  1.33it/s]


In [4]:
# !!! Do Not remove !!!
# Data Handler for MinTuoJiang data
import re
from tqdm import tqdm
from utils import _gen_data
lPre, lGet = 6, 18
r, limit, step = 3, 5, 3
save_path = './all_data/mtj_1d/'
ptj_keys = ['监测时间', '水温(℃)', 'pH值(无量纲)', '溶解氧(mg/L)', '电导率(uS/cm)', '浊度(NTU)','高锰酸盐指数(mg/L)',
           '氨氮(mg/L)', '总磷(mg/L)', '总氮(mg/L)']
p = Path('./original_data/08-岷沱江数据/岷沱江水质监测数据/')
# The describe CSVs are written straight into save_path, so make sure it exists.
Path(save_path).mkdir(parents=True, exist_ok=True)

# Group the files by the leading part of their stem: optional digits followed by the
# first run of non-digit characters. Files sharing that prefix are concatenated below.
d = {}
for file in p.iterdir():
    matched = re.match(r'\d*?\D+', file.stem)
    # A stem with no non-digit part has no match; group such a file under its full stem
    # instead of failing on `None[0]`.
    name = matched[0] if matched else file.stem
    d.setdefault(name, []).append(file)

all_df = []
all_data = []
for k in tqdm(d.keys()):
    dfs = []
    for file in d[k]:
        df = pd.read_excel(file, header=1, usecols=ptj_keys, index_col=0, dtype=str)
        # Drop the row(s) labelled like the first index entry
        # (presumably a units/sub-header row — TODO confirm).
        df = df.drop(df.index[0], axis=0)
        dfs.append(df)
    df = pd.concat(dfs)
    df = df.sort_index()
    df.index = pd.to_datetime(df.index, format='%Y-%m-%d %H:%M:%S')
    for f in df.keys():
        # Keep the leading number of each cell. The fractional part is optional so that
        # integer-valued readings such as '18' are kept instead of silently becoming NaN.
        df[f] = df[f].str.extract(r'(^\d+(?:\.\d+)?)', expand=False)
    df = df.astype('float64')
    df = df.resample('4H').mean()
    # Rows in which every column is exactly 0 are treated as missing.
    df.loc[(df==0).all(axis=1)] = np.nan
    df = remove_outliers(df, standard_deviation, 25)
    df[df < 0] = np.nan
    df = patch_up(df, r, limit)
    df = smooth(df, step)

    all_df.append(df)

    save_file_name = f'{save_path}{k}'
    describe_save_name = f'{save_file_name}_describe.csv'
    x = _gen_data(df, lGet, lPre, save_file_name)
    all_data.append(x)
    df.describe().to_csv(describe_save_name)

# Combined outputs across all groups: stacked arrays plus describe() of all frames.
all_data = np.vstack(all_data)
np.save(f'{save_path}all',all_data)
pd.concat(all_df).describe().to_csv(f'{save_path}all_describe.csv')
    

100%|█████████████████████████████████████████| 50/50 [01:08<00:00,  1.38s/it]


In [3]:
# Maps each raw-data directory to the directory that receives its generated files.
# The output paths end with '/' because merge_all builds file names by plain string
# concatenation.
dic = {'./original_data/04-四川省边界断面/四川边界断面水质数据/':'./all_data/bianjie_1d/', 
       './original_data/05-涪江/涪江水质断面水质-小时尺度/':'./all_data/fujiang_1d/', 
       './original_data/泸沽湖邛海鲁班水库水质数据/原始查询/':'./all_data/luguhu_1d/',}
def merge_all(path_dict, lGet, lPre, factory_args=(3, 5, 3)):
    '''
    Regenerate the per-file and combined outputs for every directory in ``path_dict``.

    For each ``source_dir -> save_path`` pair: delete the files currently in
    ``save_path``, load every entry of ``source_dir`` with ``data_factory``, write a
    ``<stem>_describe.csv`` per file, call ``_gen_data`` per file, then write
    ``all_describe.csv`` and ``all.npy`` for the whole directory.

    Parameters
    ----------
    path_dict : dict mapping a source directory to an output directory string. The
        output string is used as a plain prefix, so it must end with a path separator.
    lGet, lPre : forwarded to ``_gen_data``.
    factory_args : extra positional arguments for ``data_factory``; the default
        reproduces the previously hard-coded ``3, 5, 3``.
    '''
    # Imported here so the function does not depend on another cell having run first.
    from utils import _gen_data

    for path, save_path in path_dict.items():
        p = Path(path)
        out_dir = Path(save_path)
        out_dir.mkdir(parents=True, exist_ok=True)
        # Clear previously generated files so stale outputs cannot mix with new ones.
        for stale in out_dir.iterdir():
            if stale.is_file():
                stale.unlink()
        dfs = []
        data = []
        print(p)
        # Sorted so the row order of the stacked array does not depend on the filesystem.
        for file in sorted(p.iterdir()):
            # Use the frame loaded for *this* file. The earlier code appended it to `dfs`
            # but handed a global `df` to _gen_data and describe().
            df = data_factory(file, *factory_args)
            dfs.append(df)
            save_file_name = f'{save_path}{file.stem}'
            describe_save_name = f'{save_path}{file.stem}_describe.csv'
            data.append(_gen_data(df, lGet, lPre, save_file_name))
            df.describe().to_csv(describe_save_name)

        describe = pd.concat(dfs).describe()
        describe.to_csv(f'{save_path}all_describe.csv')
        np.save(f'{save_path}all', np.vstack(data))
    return
# Rebuild every dataset listed in `dic` with lGet=18 and lPre=6.
# Note: merge_all deletes the existing files in each output directory first.
merge_all(dic, 18, 6)

original_data/04-四川省边界断面/四川边界断面水质数据
original_data/05-涪江/涪江水质断面水质-小时尺度
original_data/泸沽湖邛海鲁班水库水质数据/原始查询


In [87]:
# Same source directories as the `dic` defined in an earlier cell, but mapped to output
# directories without the `_1d` suffix. Running this cell rebinds the global `dic`.
dic = {'./original_data/04-四川省边界断面/四川边界断面水质数据/':'./all_data/bianjie/', 
       './original_data/05-涪江/涪江水质断面水质-小时尺度/':'./all_data/fujiang/', 
       './original_data/泸沽湖邛海鲁班水库水质数据/原始查询/':'./all_data/luguhu/',}

def save_all_df():
    '''
    Placeholder for an unfinished helper.

    The definition had no colon and no body, which made the whole cell a SyntaxError.
    It now parses and fails loudly if it is called before being implemented.
    '''
    # TODO: implement; the intended behaviour is not recorded in the notebook.
    raise NotImplementedError('save_all_df is not implemented yet')

Unnamed: 0,水温,pH,溶解氧,电导率,浊度,高锰酸盐指数,氨氮,总磷,总氮
count,36560.0,36369.0,36143.0,35884.0,35455.0,35819.0,35842.0,35996.0,36129.0
mean,18.078818,8.19275,8.024069,264.660612,3.965766,1.375465,0.026059,0.013538,0.3154992
std,5.782753,0.428694,1.279147,46.232762,2.231351,0.616139,0.025871,0.01097,0.2313502
min,1.452317,6.380308,4.0615,147.94076,0.001934,0.0,0.0,0.0,7.733272e-25
25%,13.052193,7.901154,7.224828,227.91062,2.101543,0.978614,0.007168,0.006177,0.142429
50%,18.066469,8.196411,7.935374,246.556867,3.674533,1.365982,0.017047,0.01155,0.2676053
75%,21.917541,8.469104,8.724855,302.771615,5.393889,1.737244,0.036907,0.017185,0.4274986
max,41.526181,9.677476,13.417636,385.149342,13.396251,3.992529,0.156455,0.062715,1.332
