In [18]:
import glob
import os
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [7]:
# No module-level `global` declarations are needed here: a `global` statement
# at module (cell) scope has no effect, because names assigned in a cell are
# already module globals. extract_info() builds its table in a local variable
# and returns it, so no shared `info_data` variable is involved.

In [8]:
# Directory holding the combined data files and directory holding the NaN
# tables; extract_info()/drop_week_big_nan() append a file name to these.
unprocessed_dir = '//datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//'
nan_info_dir = '//datc//opschaler//nan_information//'

# glob returns matches in arbitrary order; sort them so that every run
# processes the hourly files (and writes the summary rows) in the same order.
paths_h = sorted(glob.glob(unprocessed_dir+'P*_hour.csv'))

# Keep only the file names. basename works for names of any length, whereas a
# fixed "last 20 characters" slice silently breaks on longer or shorter names.
dwelling_ids = np.array([os.path.basename(path) for path in paths_h])
#paths_s = glob.glob(unprocessed_dir+'*_10s.csv')

In [11]:
def extract_info(dwelling_ids, gap_size=5):
    """
    Build a per-dwelling summary table of the NaNs in the gasMeter column.

    For every file name in ``dwelling_ids`` the tab-separated data file is read
    from ``unprocessed_dir`` and the NaN table with the same file name is read
    from ``nan_info_dir``. One row is produced per file.

    :param dwelling_ids: iterable of file names present in both directories
    :param gap_size: minimum 'Amount of NaNs' for a gasMeter row of the NaN
        table to count as a big gap (default 5, the previously hard-coded value)
    :return: Pandas DataFrame with the columns, in this order:
        House_ID (file name), Initial_Date and Final_Date (first/last index
        timestamp as '%Y-%m-%d %H:%M:%S' strings), Days (last minus first
        timestamp), Amount_of_NaNs (number of null gasMeter values),
        Max_Consecutive_NaNs (longest run of consecutive null gasMeter values),
        First_Gap_Valid_Data (time from the first timestamp to the
        'Start index' of the first big gap, or the string 'All_file_good'
        when the NaN table has no big gap)
    """
    # A list (not a set) keeps the column order deterministic.
    columns = ['House_ID', 'Initial_Date', 'Final_Date', 'Days',
               'Amount_of_NaNs', 'Max_Consecutive_NaNs', 'First_Gap_Valid_Data']
    date_format = '%Y-%m-%d %H:%M:%S'

    # Collect plain dicts and build the frame once at the end: growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas 2.x.
    records = []

    for dwelling in tqdm(dwelling_ids):
        df = pd.read_csv(unprocessed_dir + dwelling, delimiter='\t', parse_dates=['datetime'])
        df = df.set_index(['datetime'])
        first_stamp = df.index[0]
        last_stamp = df.index[-1]

        # Label every run of equal consecutive values in the null mask, give
        # each row the length of its run, and zero out the non-null runs; the
        # maximum is then the longest streak of consecutive NaNs.
        is_nan = df.gasMeter.isnull()
        run_id = (is_nan != is_nan.shift()).cumsum()
        max_gap = (is_nan.groupby(run_id).transform('size') * is_nan).max()

        nan_table = pd.read_csv(nan_info_dir + dwelling, delimiter='\t')
        gas_gaps = nan_table.loc[nan_table['Column name'] == 'gasMeter']
        big_gaps = gas_gaps.loc[gas_gaps['Amount of NaNs'] >= gap_size]

        if big_gaps.empty:
            valid_data = 'All_file_good'
        else:
            # Rows keep the file order of the NaN table, so the first row is
            # the first big gap listed there.
            first_gap_start = datetime.strptime(big_gaps['Start index'].iloc[0], date_format)
            valid_data = first_gap_start - first_stamp

        records.append({
            'House_ID': dwelling,
            'Initial_Date': first_stamp.strftime(date_format),
            'Final_Date': last_stamp.strftime(date_format),
            'Days': last_stamp - first_stamp,
            'Amount_of_NaNs': is_nan.sum(),
            'Max_Consecutive_NaNs': max_gap,
            'First_Gap_Valid_Data': valid_data,
        })

    return pd.DataFrame(records, columns=columns)

In [29]:
def drop_week_big_nan(dwelling_ids, gap_size):
    """
    Print the calendar weeks that contain big gasMeter NaN gaps.

    For every file name in ``dwelling_ids`` the NaN table is read from
    ``nan_info_dir`` and its gasMeter rows with 'Amount of NaNs' >= gap_size
    are selected. The index of those rows is printed, and for each of them the
    Monday 00:00:00 of the week containing its 'Start index' is printed,
    followed by that timestamp plus six days.

    Despite its name the function does not drop anything yet: it only prints
    the week boundaries and returns nothing.

    :param dwelling_ids: iterable of NaN-table file names in ``nan_info_dir``
    :param gap_size: minimum 'Amount of NaNs' for a row to count as a big gap
    :return: None
    """
    for dwelling in tqdm(dwelling_ids):
        nan_table = pd.read_csv(nan_info_dir + dwelling, delimiter='\t')
        gas_gaps = nan_table.loc[nan_table['Column name'] == 'gasMeter']
        big_gaps = gas_gaps.loc[gas_gaps['Amount of NaNs'] >= gap_size]
        print(big_gaps.index)

        # An empty selection simply yields no iterations.
        for start_index in big_gaps['Start index']:
            gap_start = datetime.strptime(start_index, '%Y-%m-%d %H:%M:%S')
            # weekday() is 0 on Monday, so this steps back to the Monday of
            # the gap's week and then resets the time of day to midnight.
            week_start = gap_start - timedelta(days=gap_start.weekday())
            week_start = week_start.replace(hour=0, minute=0, second=0)
            # NOTE(review): this is Sunday 00:00:00, so if it is later used as
            # an inclusive bound the remaining hours of Sunday are not covered.
            week_end = week_start + timedelta(days=6)
            print(week_start)
            print(week_end)

    return

In [30]:
# Print the week boundaries for gasMeter gaps of at least five NaNs.
drop_week_big_nan(dwelling_ids, gap_size=5)

100%|██████████| 52/52 [00:00<00:00, 390.05it/s]

Int64Index([28], dtype='int64')
2017-04-03 00:00:00
2017-04-09 00:00:00
Int64Index([18, 19], dtype='int64')
2017-07-10 00:00:00
2017-07-16 00:00:00
2017-07-31 00:00:00
2017-08-06 00:00:00
Int64Index([26], dtype='int64')
2017-04-03 00:00:00
2017-04-09 00:00:00
Int64Index([26], dtype='int64')
2017-04-03 00:00:00
2017-04-09 00:00:00
Int64Index([12], dtype='int64')
2017-03-06 00:00:00
2017-03-12 00:00:00
Int64Index([28], dtype='int64')
2017-04-03 00:00:00
2017-04-09 00:00:00
Int64Index([26], dtype='int64')
2017-04-03 00:00:00
2017-04-09 00:00:00
Int64Index([32], dtype='int64')
2017-04-03 00:00:00
2017-04-09 00:00:00
Int64Index([32], dtype='int64')
2017-04-03 00:00:00
2017-04-09 00:00:00
Int64Index([], dtype='int64')
Int64Index([32, 35], dtype='int64')
2017-03-13 00:00:00
2017-03-19 00:00:00
2017-04-03 00:00:00
2017-04-09 00:00:00
Int64Index([29], dtype='int64')
2017-04-03 00:00:00
2017-04-09 00:00:00
Int64Index([13], dtype='int64')
2017-07-10 00:00:00
2017-07-16 00:00:00
Int64Index([], dty




In [12]:
# Summarise the gasMeter NaNs of every hourly file and store the table as a
# tab-separated file (row index included, missing values written as 'NA').
hourly_nans = extract_info(dwelling_ids)
hourly_nans.to_csv(
    'Hour_NaN_Gas_Table.csv',
    sep='\t',
    na_rep='NA',
    index=True,
)
#Ten_secs_nans=extract_info(paths_s)
#Ten_secs_nans.to_csv('Secs_NaN_Gas_Table.csv', sep='\t', index=True)

100%|██████████| 52/52 [00:01<00:00, 46.65it/s]
