# Unprocessed dwelling information extractor
This notebook extracts useful NaN information per dwelling and saves it into one file (csv & excel).  
It does this for both the 10s and hour sample rate dataframes.  
The unprocessed dataframes are loaded from: `//datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//`  
The nan information per dwelling is loaded from: `//datc//opschaler//nan_information//`  
The final product is saved in: `//datc//opschaler//dwelling_information//total_information//`  
It contains the following information per dwelling, for both the 10s and one hour sample rate:  
* dwelling id  
Dwelling id of the house.
  
* recorded days  
The length of the unprocessed dataframe in days.

* start date  
The start date of the unprocessed dataframe.


* stop date  
The stop date of the unprocessed dataframe.

* total samples (per column, in thousands)  
Total amount of samples in thousands, per column.
  
* total NaN streaks  
Total amount of NaN streaks.
  
* total NaN streaks > 2  
Total amount of NaN streaks which are larger than 2.
  
* total NaNs [-]  
Total amount of NaNs in the unprocessed dataframe.
  
* total NaNs [%]  
Total NaNs divided by the total samples, expressed as a percentage.
  
* mean of NaNs  
Mean of the amount of NaNs per NaN streak.
  
* median of NaNs  
Median of the amount of NaNs per NaN streak.
  
* std of NaNs  
Standard deviation of the amount of NaNs per NaN streak.
  
* first highest NaN streak (%)  
Amount of NaNs from the first highest NaN streak, divided by the total samples (as a percentage).
  
* first highest NaN streak column  
The name of the column where the first highest NaN streak is in.
  
* second highest NaN streak (%)  
Amount of NaNs from the second highest NaN streak, divided by the total samples (as a percentage).
  
* second highest NaN streak column  
The name of the column where the second highest NaN streak is in.
  
* third highest NaN streak (%)  
Amount of NaNs from the third highest NaN streak, divided by the total samples (as a percentage).
  
* third highest NaN streak column  
The name of the column where the third highest NaN streak is in.
  

# Imports

In [1]:
import glob
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

# Function definitions

In [2]:
def unprocessed_length_in_days(dwelling_id, type_):
    """
    Measure the time span covered by one unprocessed dwelling dataframe.

    The tab-separated file ``<dwelling_id>_<type_>.csv`` is read from the
    unprocessed directory and its 'datetime' column is parsed as dates.

    Returns a tuple ``(recorded_days, start_date, stop_date, columns)``:
    the whole number of days between the first and the last 'datetime'
    value, those two values, and the column index of the file.
    """
    source_dir = '//datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//'
    csv_path = source_dir + dwelling_id + '_' + type_ + '.csv'
    frame = pd.read_csv(csv_path, delimiter='\t', parse_dates=['datetime'])

    columns = frame.columns
    timestamps = frame['datetime']  # the only column needed from here on
    del frame  # drop the reference to the full dataframe to free up memory

    start_date, stop_date = timestamps.iloc[0], timestamps.iloc[-1]

    # .days truncates the timedelta to whole days
    recorded_days = (stop_date - start_date).days

    return recorded_days, start_date, stop_date, columns


def _nan_only_summary(dwelling_id, recorded_days, start_date, stop_date):
    """
    Build a result row for a dwelling without usable NaN statistics.

    The row has the same 18 positions as a regular result row; the 14
    statistic fields (everything after the stop date) are NaN.
    """
    return [dwelling_id, recorded_days, start_date, stop_date] + [np.nan] * 14


def nan_information_extractor(dwelling_id, path, type_):
    """
    Extract useful information from the NaN info table of a dwelling id.

    Parameters
    ----------
    dwelling_id : str
        Dwelling id, used to locate the matching unprocessed dataframe.
    path : str
        Path of the tab-separated NaN information file. It must contain the
        columns 'Amount of NaNs' and 'Column name'; every row is treated as
        one NaN streak.
    type_ : str
        Sample rate of the dataframe, either 'hour' or '10s'.

    Returns
    -------
    list
        18 values, in this order: dwelling id, recorded days, start date,
        stop date, total samples per column (in thousands), number of NaN
        streaks, number of NaN streaks > 2, total NaNs, total NaNs (%),
        mean, median and std of the NaNs per streak, and for each of the
        three largest streaks its size (% of total samples) and column name.
        If the NaN table is empty or the dataframe spans less than one day,
        all values after the stop date are NaN.

    Raises
    ------
    ValueError
        If ``type_`` is neither 'hour' nor '10s'.
    KeyError
        If the NaN table lacks one of the required columns.
    """
    samples_per_day = {'hour': 24, '10s': 24 * 60 * 6}
    if type_ not in samples_per_day:
        raise ValueError("type_ must be 'hour' or '10s', got %r" % (type_,))

    df = pd.read_csv(path, delimiter='\t')
    # Sort from highest to lowest amount of NaNs. Mergesort is stable, so
    # streaks of equal size keep their order from the file.
    df = df.sort_values(by=['Amount of NaNs'], ascending=False, kind='mergesort')

    # Length and date range of the unprocessed dataframe
    recorded_days, start_date, stop_date, _ = unprocessed_length_in_days(dwelling_id, type_)

    # Expected amount of samples per column over the recorded period
    total_samples = recorded_days * samples_per_day[type_]

    # No streaks to describe, or nothing to divide by: statistics stay NaN
    if df.empty or total_samples == 0:
        return _nan_only_summary(dwelling_id, recorded_days, start_date, stop_date)

    nan_counts = df['Amount of NaNs']
    column_names = df['Column name']

    # Calculate useful information
    total_gaps = len(nan_counts)
    total_gaps_larger_than_2 = int((nan_counts > 2).sum())
    total_nans = nan_counts.sum()
    total_nans_percentage = (total_nans / total_samples) * 100
    mean = nan_counts.mean()
    median = nan_counts.median()
    std = nan_counts.std()

    # Top 3 of NaN streaks. The table is sorted, so the largest streaks are
    # at positions 0, 1 and 2; positional access is required because the
    # index labels still refer to the row order of the file.
    # Note: often multiple columns share the same NaN streak.
    top_streaks = []
    for rank in range(3):
        if rank < total_gaps:
            top_streaks.append((nan_counts.iloc[rank] / total_samples) * 100)
            top_streaks.append(column_names.iloc[rank])
        else:
            # Fewer than rank + 1 streaks recorded
            top_streaks.extend([np.nan, np.nan])

    # Put the results in a list
    result = [dwelling_id, recorded_days, start_date, stop_date, (total_samples / 1000),
              total_gaps, total_gaps_larger_than_2, total_nans, total_nans_percentage,
              mean, median, std] + top_streaks

    return result

# Main

In [3]:
def _dwelling_id_from_path(path, suffix):
    """Return the file name of ``path`` without its trailing ``suffix``."""
    return os.path.basename(path)[:-len(suffix)]


def main():
    """
    Collect the NaN information of every dwelling into two summary tables.

    For both sample rates ('hour' and '10s') every ``*_<type>.csv`` file in
    the NaN information directory is summarised with
    ``nan_information_extractor``. The dwelling id is the file name without
    the ``_<type>.csv`` suffix.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_10s, df_hour)``, one row per dwelling, sorted by recorded days
        from highest to lowest.
    """
    nan_dir = '//datc//opschaler//nan_information//'

    headers = ['dwelling id', 'recorded days', 'start date', 'stop date',
               'total samples (per column, in thousands)', 'total NaN streaks',
               'total NaN streaks > 2', 'total NaNs [-]', 'total NaNs [%]',
               'mean of NaNs', 'median of NaNs', 'std of NaNs',
               'first highest NaN streak (%)', 'first highest NaN streak column',
               'second highest NaN streak (%)', 'second highest NaN streak column',
               'third highest NaN streak (%)', 'third highest NaN streak column']

    # Columns that are shown with one decimal instead of five
    one_decimal = ['total samples (per column, in thousands)', 'total NaNs [%]',
                   'mean of NaNs', 'std of NaNs']

    tables = {}
    for type_ in ('hour', '10s'):
        suffix = '_' + type_ + '.csv'
        paths = glob.glob(nan_dir + '*' + suffix)

        results = []
        for path in tqdm(paths):
            # Take the id from the file name instead of a fixed-width slice,
            # so no path separator can end up in the id.
            dwelling_id = _dwelling_id_from_path(path, suffix)
            results.append(nan_information_extractor(dwelling_id, path, type_))

        # Make a df from the list of lists, rounding all values to 5 decimals
        table = pd.DataFrame.from_records(results, columns=headers).round(decimals=5)

        # Round some numbers differently
        table[one_decimal] = table[one_decimal].round(decimals=1)

        # Sort by recorded days, highest to lowest
        tables[type_] = table.sort_values(by=['recorded days'], ascending=False)

    return tables['10s'], tables['hour']

# Run main() and save the result

In [4]:
# Build both summary tables. This takes ~2.5 minutes; according to the
# progress output below, nearly all of that is spent in the second (10s) loop.
info_10s, info_hour = main()

100%|██████████| 56/56 [00:00<00:00, 83.08it/s]
100%|██████████| 56/56 [02:24<00:00,  3.00s/it]


In [5]:
# Directory in which all result files are stored
output_dir = '//datc//opschaler//dwelling_information//total_information//'

# Save both tables as tab-separated csv files.
# NOTE(review): the 10s file name has no 'total_' prefix, unlike the hour file;
# kept as-is because other code may load the file by this name.
info_hour.to_csv(output_dir + 'total_nan_information_hour.csv', sep='\t', index=False)
info_10s.to_csv(output_dir + 'nan_information_10s.csv', sep='\t', index=False)

# Also save to Excel, one sheet per sample rate. The context manager writes
# and closes the workbook, even when one of the exports fails.
with pd.ExcelWriter(output_dir + 'total_nan_information.xlsx') as writer:
    info_hour.to_excel(writer, sheet_name='Hour dataframes', index=False)
    info_10s.to_excel(writer, sheet_name='10s dataframes', index=False)

print('FINISHED')

FINISHED
