In [1]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime, timedelta

In [2]:
global gas
global info_data

In [3]:
unprocessed_dir = '//datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//'
nan_info_dir = '//datc//opschaler//nan_information//'
paths_h = glob.glob(unprocessed_dir+'P*_hour.csv') #Getting all files matching that path + expression 
dwelling_ids = np.array(list((map(lambda x: x[-20::], paths_h))))
#paths_s = glob.glob(unprocessed_dir+'*_10s.csv')

In [4]:
#This code is used to extract information about gaps size, location of first big nan gap 
#(useful for periodicity porpouse and identify problems in acquisition system)
def extract_info(dwelling_ids):
    
    info_data = pd.DataFrame(columns={'House_ID','Initial_Date','Final_Date','Days','Amount_of_NaNs','Max_Consecutive_NaNs','First_Gap_Valid_Data'})
    for dwelling in tqdm(dwelling_ids) : #Loop over all dwellings
                
        df=pd.read_csv(unprocessed_dir+dwelling, delimiter='\t', parse_dates=['datetime'])
        df = df.set_index(['datetime'])
        df_nan=df.gasMeter.isnull()
        temp = pd.DataFrame()
        temp['temp']=df_nan.groupby((df_nan != df_nan.shift()).cumsum()).transform('size') * df_nan 
        max_gap=temp['temp'].max()
 
        nan_table=pd.read_csv(nan_info_dir+dwelling, delimiter='\t')
        nan_table=nan_table.loc[nan_table['Column name'] == 'gasMeter']
        nan_table.set_index('Unnamed: 0')
        first_big_nan_index=nan_table[nan_table['Amount of NaNs']>=5].index
        
        if first_big_nan_index.empty:
            valid_data='All_file_good'
        else :
            valid_data=datetime.strptime(nan_table['Start index'].loc[first_big_nan_index[0]],'%Y-%m-%d %H:%M:%S')-df.index[0]
            
        #Extract information about dates and number of NaNs
        initial_date_string= df.index[0].strftime('%Y-%m-%d %H:%M:%S')
        final_date_string= df.index[-1].strftime('%Y-%m-%d %H:%M:%S')
        days=df.index[-1]-df.index[0]
        NaN_count=df.gasMeter.isnull().sum()
        
        temp={'House_ID':dwelling,'Initial_Date':initial_date_string,'Final_Date':final_date_string,'Days':days,'Amount_of_NaNs':NaN_count,'Max_Consecutive_NaNs':max_gap,'First_Gap_Valid_Data':valid_data}
        #Append all information to the global info_data dataframe
        info_data = info_data.append(temp, ignore_index=True)
        #info_data.loc[len(info_data.index)] = [dwelling_name[-20:], initial_date_string, final_date_string, days, NaN_count, max_gap]
    return (info_data)

In [5]:
#This part is a combination with Brian de Keijzer notebook on removing NaNs and personalcode
def drop_week_big_nan(dwelling_ids, gap_size):
    """
    Drop whole weeks containing NaN gaps bigger than the gap_size specified in the input
    :param df: Pandas DataDrame to process NaNs off
    :param df_nan_table: NaN info Pandas DataFrame of the input df
    :param gap_size: number of consecutive NaNs 
    :return: Pandas DataFrame
    """

    for dwelling in tqdm(dwelling_ids):
        
        nan_table=pd.read_csv(nan_info_dir+dwelling, delimiter='\t')
        nan_table=nan_table.loc[nan_table['Column name'] == 'gasMeter']
        big_nans_index=nan_table[nan_table['Amount of NaNs']>=gap_size].index
        
        df = pd.read_csv(unprocessed_dir+dwelling, delimiter='\t', parse_dates=['datetime'])
        df = df.set_index(['datetime'])
        #print(big_nans_index)
        
        
        if big_nans_index.empty:
            pass
        else :
            
            for index in big_nans_index:
                dt=datetime.strptime(nan_table['Start index'].loc[index],'%Y-%m-%d %H:%M:%S')
                start = dt - timedelta(days=dt.weekday())
                start.replace(hour=0,minute=0,second=0)
                start=start.date()
                end = start + timedelta(days=7)             
                start=df.index.searchsorted(start)
                end=df.index.searchsorted(end)
                
                df.drop(df.index[start:end],inplace=True)
            
        dir = '//datc//opschaler//combined_gas_smart_weather_dfs//processed//Weeks_gas_drop//'
        df.to_csv(dir +'weeks_removed_'+ dwelling, sep='\t', index=True)  
 
    return

In [6]:
drop_week_big_nan(dwelling_ids,5)

100%|██████████| 52/52 [00:03<00:00, 16.62it/s]


In [7]:
hourly_nans=extract_info(dwelling_ids)
print(hourly_nans)
hourly_nans.to_csv('Hour_NaN_Gas_Table.csv', sep='\t', index=True,na_rep='NA')
#Ten_secs_nans=extract_info(paths_s)
#Ten_secs_nans.to_csv('Secs_NaN_Gas_Table.csv', sep='\t', index=True)

100%|██████████| 52/52 [00:01<00:00, 45.98it/s]


   First_Gap_Valid_Data           Final_Date              House_ID  \
0      28 days 03:00:00  2017-05-30 13:00:00  P01S01W7548_hour.csv   
1      15 days 03:00:00  2017-11-21 18:00:00  P01S02W0167_hour.csv   
2      28 days 11:00:00  2017-05-30 10:00:00  P01S01W5040_hour.csv   
3      28 days 11:00:00  2017-05-29 13:00:00  P01S01W8669_hour.csv   
4       0 days 18:00:00  2017-03-09 02:00:00  P01S01W0000_hour.csv   
5      19 days 10:00:00  2017-06-09 10:00:00  P01S01W9617_hour.csv   
6      28 days 11:00:00  2017-05-31 11:00:00  P01S01W5588_hour.csv   
7      19 days 10:00:00  2017-05-29 16:00:00  P01S01W9431_hour.csv   
8      28 days 11:00:00  2017-05-29 12:00:00  P01S01W4002_hour.csv   
9         All_file_good  2017-06-02 14:00:00  P01S01W7042_hour.csv   
10      7 days 14:00:00  2017-06-01 13:00:00  P01S01W6289_hour.csv   
11     28 days 11:00:00  2017-05-29 18:00:00  P01S01W5476_hour.csv   
12     17 days 00:00:00  2017-11-21 20:00:00  P01S02W4827_hour.csv   
13        All_file_g